diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index b74bbf538d..5448c7bdb7 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -52,6 +52,7 @@ def register_custom_op(cls):
 # Import the submodule containing specializations of ElementwiseBinaryOperation
 # Note: This will automatically register all decorated classes into this domain
 import finn.custom_op.fpgadataflow.elementwise_binary
+import finn.custom_op.fpgadataflow.elementwise_functions
 from finn.custom_op.fpgadataflow.addstreams import AddStreams
 from finn.custom_op.fpgadataflow.channelwise_op import ChannelwiseOp
 from finn.custom_op.fpgadataflow.concat import StreamingConcat
diff --git a/src/finn/custom_op/fpgadataflow/elementwise_functions.py b/src/finn/custom_op/fpgadataflow/elementwise_functions.py
new file mode 100644
index 0000000000..7ac39b06da
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/elementwise_functions.py
@@ -0,0 +1,348 @@
+# Copyright (C) 2025, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import warnings
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+
+from finn.custom_op.fpgadataflow import register_custom_op
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+
+# Generic implementation for elementwise function operations
+class ElementwiseFunctionOperation(HWCustomOp):
+    # Specifies the elementwise operation to be implemented
+    #   Format: (Identifier, Python, C++, RTL)
+    _operation: tuple[str, np.ufunc, str, str] | None = None
+
+    # Numpy operation available as property
+    @property
+    def npy_op(self) -> np.ufunc:
+        return self._operation[1]
+
+    # C++ operation template available as property
+    @property
+    def cpp_op(self) -> str:
+        return self._operation[2]
+
+    # RTL operation template available as property
+    @property
+    def rtl_op(self) -> str:
+        return self._operation[3]
+
+    # Initializes the operator given an onnx graph node
+    def __init__(self, onnx_node, **kwargs):
+        # Just forward all arguments to the init method of the CustomOp base
+        super().__init__(onnx_node, **kwargs)
+
+    # Defines attributes which must be present on this node
+    def get_nodeattr_types(self):
+        # Start from parent operator class attributes
+        attrs = HWCustomOp.get_nodeattr_types(self)
+        # Update attributes dictionary for new custom operator
+        attrs.update(
+            {
+                # Data type of the input elements
+                "inp_dtype": ("s", True, ""),
+                # Data type of the output elements
+                "out_dtype": ("s", True, ""),
+                # Shape of the input
+                "inp_shape": ("ints", True, [1]),
+                # Shape of the output, must be equal to the input shape
+                "out_shape": ("ints", True, [1]),
+                # Number of elements in the last dimensions processed in parallel
+                "PE": ("i", False, 1),
+                # FPGA resource type for memories/internal buffers of the operator
+                "ram_style": ("s", False, "auto", {"auto", "block", "distributed", "ultra"}),
+                # memory mode for the const value
+                # internal_embedded -- embedded parameters
+                # internal_decoupled -- streaming parameters with streamer packaged inside IP
+                "mem_mode": (
+                    "s",
+                    False,
+                    "internal_embedded",
+                    {"internal_embedded", "internal_decoupled"},
+                ),
+                # Input and output FIFO depths for multi-I/O nodes
+                "inFIFODepths": ("ints", False, [2]),
+                "outFIFODepths": ("ints", False, [2]),
+            }
+        )
+        # Return updated attribute dictionary
+        return attrs
+
+    # Datatype attribute as property for convenience
+    @property
+    def inp_dtype(self):
+        # Note: Converts from string to QONNX data type
+        return DataType[self.get_nodeattr("inp_dtype")]
+
+    # Datatype attribute as property for convenience
+    @property
+    def out_dtype(self):
+        # Note: Converts from string to QONNX data type
+        return DataType[self.get_nodeattr("out_dtype")]
+
+    # Shape attribute as property for convenience
+    @property
+    def inp_shape(self):
+        return self.get_nodeattr("inp_shape")
+
+    # Shape attribute as property for convenience
+    @property
+    def out_shape(self):
+        return self.get_nodeattr("out_shape")
+
+    # Number of parallel processed elements as property for convenience
+    @property
+    def pe(self):
+        return self.get_nodeattr("PE")
+
+    # Infers the datatype of the node output
+    def infer_node_datatype(self, model: ModelWrapper):
+        # Get the node wrapped by this custom op
+        node = self.onnx_node
+        # Test for changing input datatype (this operator has a single input)
+        if model.get_tensor_datatype(node.input[0]) != self.inp_dtype:
+            # Get the new datatype
+            new_dtype = model.get_tensor_datatype(node.input[0])
+            # Issue a warning message
+            warnings.warn(f"{node.name}: inp_dtype changing from {self.inp_dtype} to {new_dtype}")
+            # Set the new datatype attribute
+            self.set_nodeattr("inp_dtype", new_dtype.name)
+        # Force the output data type stored as a node attribute
+        model.set_tensor_datatype(node.output[0], self.out_dtype)
+
+    def execute_node(self, context, graph):
+        # Get the node wrapped by this custom op
+        node = self.onnx_node
+        # Get the inputs out of the execution context
+        inp = context[node.input[0]]
+        # Note: Need to make sure these have the right type for the Numpy API
+        # Note: Always simulate integer inputs in int64, numpy casting is
+        # weird....
+        inp = inp.astype(np.int64) if self.inp_dtype.is_integer() else inp
+        # Apply elementwise operation in numpy and insert
+        # result into the execution context
+        out = self.npy_op(inp)
+        # Make sure the output has the right type, e.g. turn all booleans into
+        # integers (actually floats as the container type)
+        # Note: This is relevant for logical ops, ==, <=, >=, etc.
+        # Note: Somehow QONNX does not like boolean tensors
+        # context[node.output[0]] = out.astype(self.out_dtype.to_numpy_dt())
+        # TODO: Apparently it is not? Verify this behavior...
+        context[node.output[0]] = out.astype(np.float32)
+
+    # Note: End of QONNX CustomOp region, below is FINN HWCustomOp stuff
+
+    # Gets the datatype of input at index ind
+    def get_input_datatype(self, ind=0):
+        # There is only one input
+        return self.inp_dtype
+
+    # Gets the datatype of the output at index ind
+    def get_output_datatype(self, ind=0):
+        # There is only one output, the type is set as an attribute
+        return self.out_dtype
+
+    # Gets the shape of the input at index ind without folding
+    def get_normal_input_shape(self, ind=0):
+        # Input shape is stored as a node attribute
+        return self.inp_shape
+
+    # Gets the shape of the output at index ind without folding
+    def get_normal_output_shape(self, ind=0):
+        # The output shape is stored as a node attribute
+        return self.out_shape
+
+    # Gets the shape of the input at index ind with folding
+    def get_folded_input_shape(self, ind=0):
+        # Get the normal shape before applying folding
+        *num_inputs, num_elems = self.get_normal_input_shape(ind=ind)
+        # Valid folding requires the PE to divide the number of elements
+        assert num_elems % self.pe == 0, "PE must divide last axis"
+        # Folding along the last dimension
+        return *num_inputs, num_elems // self.pe, self.pe
+
+    # Gets the shape of the output at index ind with folding
+    def get_folded_output_shape(self, ind=0):
+        # Get the normal shape before applying folding
+        *num_inputs, num_elems = self.get_normal_output_shape(ind=ind)
+        # Valid folding requires the PE to divide the number of elements
+        assert num_elems % self.pe == 0, "PE must divide last axis"
+        # Folding along the last dimension
+        return *num_inputs, num_elems // self.pe, self.pe
+
+    # Widths of the input data stream of the input at index ind
+    def get_instream_width(self, ind=0):
+        # Get the number of bits used to represent the input
+        i_bits = self.get_input_datatype(ind).bitwidth()
+        # Parallelism is the number of elements in the last dimension of the
+        # folded input
+        *_, elems = self.get_folded_input_shape(ind)
+        # Width of a stream receiving input elements in parallel
+        return elems * i_bits
+
+    # Widths of the output data stream of the output at index ind
+    def get_outstream_width(self, ind=0):
+        # Get the number of bits used to represent the output
+        o_bits = self.get_output_datatype(ind).bitwidth()
+        # Parallelism is the number of elements in the last dimension of the
+        # folded output
+        *_, elems = self.get_folded_output_shape(ind)
+        # Width of a stream producing output elements in parallel
+        return elems * o_bits
+
+    # Minimizes the width of the accumulator data type, 'accumulator width' here
+    # due to convention, it is actually the output data type
+    def minimize_accumulator_width(self, model: ModelWrapper):
+        # If the input is not an integer, the bit-width cannot be
+        # minimized
+        if not self.inp_dtype.is_integer():
+            # Check the annotated tensor data type corresponds to the stored
+            # attribute
+            assert (
+                model.get_tensor_datatype(self.onnx_node.output[0]) == self.out_dtype
+            ), f"Output type mismatch for {self.onnx_node.name}"
+            # Exit here, returning the not-minimized data type
+            return self.out_dtype
+        # Call the output type derivation specialized by the concrete operator
+        # implementation
+        out_dtype = self._derive_out_dtype(model)
+        # Set the new output data type as attribute
+        self.set_nodeattr("out_dtype", out_dtype.name)
+        # Annotate the output tensor with the new data type
+        model.set_tensor_datatype(self.onnx_node.output[0], out_dtype)
+        # Return the minimized output data type
+        # Note: Probably not required by MinimizeAccumulatorWidth transformation
+        return out_dtype
+
+    # Derives the optimal width of the output data type
+    def _derive_out_dtype(self, model: ModelWrapper):
+        # Depends on the actual operation performed and must be specialized by
+        # the concrete implementations
+        raise NotImplementedError(
+            f"_derive_out_dtype of {self.__class__.__name__} is not implemented!"
+        )
+
+    # Derives the expected cycles for the elementwise operation given the
+    # folding configuration
+    def get_exp_cycles(self):
+        # Number of iterations required to process the whole folded input stream
+        #   Note: This is all but the PE (last, parallelized) dimension
+        return np.prod(self.get_folded_output_shape()[:-1])
+
+
+# Derive a specialization to implement the Relu activation function
+@register_custom_op
+class ElementwiseRelu(ElementwiseFunctionOperation):
+    @property
+    def npy_op(self):
+        def relu(x):
+            return np.maximum(x, 0)
+
+        return relu
+
+    @property
+    def cpp_op(self):
+        odt_hls_name = self.out_dtype.get_hls_datatype_str()
+        return "({0} > 0 ? (%s){0} : (%s)0)" % (odt_hls_name, odt_hls_name)
+
+    @property
+    def rtl_op(self):
+        return None
+
+    def _derive_out_dtype(self, model: ModelWrapper):
+        if self.inp_dtype.is_integer():
+            inp_bw = self.inp_dtype.bitwidth()
+            # ReLU output is non-negative: drop the sign bit of signed inputs but
+            # keep at least 1 bit, otherwise 1-bit BIPOLAR would yield UINT0
+            out_bw = max(inp_bw - 1, 1) if self.inp_dtype.signed() else inp_bw
+            return DataType[f"UINT{out_bw}"]
+
+        # output datatype is input datatype for all other data-formats
+        return self.inp_dtype
+
+
+# Derive a specialization to implement elementwise exponent of the input
+@register_custom_op
+class ElementwiseExp(ElementwiseFunctionOperation):
+    @property
+    def npy_op(self):
+        return np.exp
+
+    @property
+    def cpp_op(self):
+        # TODO: extend to fixed-point datatypes
+        assert self.out_dtype.get_canonical_name().startswith("FLOAT")
+        odt_hls_name = self.out_dtype.get_hls_datatype_str()
+        # Explicitly use the overloads, using hls::exp results in minor errors
+        if self.out_dtype.get_canonical_name() == "FLOAT32":
+            return "(hls::expf((%s){0}))" % (odt_hls_name)
+        elif self.out_dtype.get_canonical_name() == "FLOAT16":
+            return "(hls::half_exp((%s){0}))" % (odt_hls_name)
+
+    @property
+    def rtl_op(self):
+        return None
+
+    def _derive_out_dtype(self, model: ModelWrapper):
+        if self.inp_dtype.get_canonical_name() == "FLOAT16":
+            return DataType["FLOAT16"]
+        return DataType["FLOAT32"]
+
+
+# Derive a specialization to implement elementwise erf of the input
+@register_custom_op
+class ElementwiseErf(ElementwiseFunctionOperation):
+    @property
+    def npy_op(self):
+        import scipy.special
+
+        return scipy.special.erf
+
+    @property
+    def cpp_op(self):
+        # TODO: extend to fixed-point datatypes
+        assert self.out_dtype.get_canonical_name().startswith("FLOAT")
+        odt_hls_name = self.out_dtype.get_hls_datatype_str()
+        # Explicitly use the overloads, using hls::erf results in minor errors
+        if self.out_dtype.get_canonical_name() == "FLOAT32":
+            return "(hls::erff((%s){0}))" % (odt_hls_name)
+        elif self.out_dtype.get_canonical_name() == "FLOAT16":
+            return "(hls::half_erf((%s){0}))" % (odt_hls_name)
+
+    @property
+    def rtl_op(self):
+        return None
+
+    def _derive_out_dtype(self, model: ModelWrapper):
+        if self.inp_dtype.get_canonical_name() == "FLOAT16":
+            return DataType["FLOAT16"]
+        return DataType["FLOAT32"]
diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py
index e80a581b57..01aa1e1c5a 100644
--- a/src/finn/custom_op/fpgadataflow/hls/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py
@@ -53,6 +53,7 @@ def register_custom_op(cls):
 # Import the submodule containing specializations of ElementwiseBinaryOperation
 # Note: This will automatically register all decorated classes into this domain
 import finn.custom_op.fpgadataflow.hls.elementwise_binary_hls
+import finn.custom_op.fpgadataflow.hls.elementwise_functions_hls
 from finn.custom_op.fpgadataflow.hls.addstreams_hls import AddStreams_hls
 from finn.custom_op.fpgadataflow.hls.channelwise_op_hls import ChannelwiseOp_hls
 from finn.custom_op.fpgadataflow.hls.checksum_hls import CheckSum_hls
diff --git a/src/finn/custom_op/fpgadataflow/hls/elementwise_functions_hls.py b/src/finn/custom_op/fpgadataflow/hls/elementwise_functions_hls.py
new file mode 100644
index 0000000000..f11b30d6e5
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/elementwise_functions_hls.py
@@ -0,0 +1,439 @@
+# Copyright (C) 2025, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+import textwrap
+
+import finn.custom_op.fpgadataflow.elementwise_functions as elementwise_functions
+from finn.custom_op.fpgadataflow.elementwise_functions import (
+    ElementwiseFunctionOperation,
+)
+from finn.custom_op.fpgadataflow.hls import register_custom_op
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.util.basic import CppBuilder
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+# Mapping of memory resource attributes to the corresponding C++ HLS
+# pragma directives
+RAM_STYLES = {"auto": "AUTO", "block": "BRAM", "distributed": "LUTRAM", "ultra": "URAM"}
+
+
+# HLS Backend specialization of the elementwise function operation operator
+class ElementwiseFunctionOperation_hls(
+    # CapWords convention
+    ElementwiseFunctionOperation,
+    HLSBackend,
+):
+    # Node attributes matching the HLS operator
+    def get_nodeattr_types(self):
+        # Start from parent operator class attributes
+        attrs = ElementwiseFunctionOperation.get_nodeattr_types(self)
+        # Add the HLSBackend default attributes on top
+        attrs.update(HLSBackend.get_nodeattr_types(self))
+        # Add/Specialize implementation specific attributes here...
+        # Return the updated attributes dictionary
+        return attrs
+
+    # Maximum width of any ap_int used in this operator
+    def get_ap_int_max_w(self):
+        # Find the width of the input
+        i_bits_max = self.get_instream_width(ind=0)
+        # Width of the output, there is just one output
+        # Note: index 0 refers to this single output stream
+        o_bits_max = self.get_outstream_width(ind=0)
+        # Find the biggest of the inputs/outputs
+        return max([i_bits_max, o_bits_max])
+
+    # Note: End of shape and datatype utilities
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        """Generates c++ code and tcl script for ip generation."""
+        super().code_generation_ipgen(model, fpgapart, clk)
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "internal_decoupled":
+            self.generate_hdl_memstream(fpgapart)
+
+    # Generates list of C++ includes to be placed at the top of the generated
+    # code
+    def global_includes(self):
+        # Include flatten.hpp, presumably providing flatten() used in docompute
+        self.code_gen_dict["$GLOBALS$"] = ['#include "flatten.hpp"']
+
+    # Generates C++ code of type alias, global constant and macro definitions
+    def defines(self, var):
+        # Insert constants and type aliases into the dictionary
+        self.code_gen_dict["$DEFINES$"] = [
+            # Input and output element datatypes
+            f"using InpType = {self.inp_dtype.get_hls_datatype_str()};",
+            f"using OutType = {self.out_dtype.get_hls_datatype_str()};",
+            # Width of single elements to avoid using ::width attribute which is
+            # not present for datatype float
+            f"static constexpr auto InpWidth = {self.inp_dtype.bitwidth()};",
+            f"static constexpr auto OutWidth = {self.out_dtype.bitwidth()};",
+            # Datatype of elements packed into the input stream
+            f"using InpPacked = ap_uint<{self.get_instream_width(ind=0)}>;",
+            # Datatype of elements packed into the output stream
+            f"using OutPacked = ap_uint<{self.get_outstream_width(ind=0)}>;",
+            # Input and output HLS stream datatypes
+            "using InpStream = hls::stream<InpPacked>;",
+            "using OutStream = hls::stream<OutPacked>;",
+        ]
+
+    # Generates C++ code for reading data from .npy (numpy format) for testing
+    # in C++ simulation
+    def read_npy_data(self):
+        # Input data is stored in numpy files in the code generation directory
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        # Prepare empty stream reading to append optionals
+        self.code_gen_dict["$READNPYDATA$"] = []
+
+        # Generate function calls for reading the input files into the input
+        # streams
+        npy_type = "half" if self.inp_dtype.get_hls_datatype_str() == "half" else "float"
+        self.code_gen_dict["$READNPYDATA$"] += [
+            # Generate function call reading from file into the input stream
+            #   Note: Inputs can be represented as numpy floats or halfs
+            f"npy2apintstream<InpPacked, InpType, InpWidth, {npy_type}>(",
+            f'"{code_gen_dir}/input_0.npy", in0_V, false',
+            ");",
+        ]
+
+    # Generates C++ code for declaring all streams involved in C++ simulation
+    # for testing
+    def strm_decl(self):
+        # Always add the output stream to the declarations
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = [
+            # Note: Assumes stream type aliases to be set in defines
+            "OutStream out0_V;"
+        ]
+
+        # Add the declaration of the single input stream
+        self.code_gen_dict["$STREAMDECLARATIONS$"] += [
+            # Note: Assumes stream type aliases to be set in defines
+            "InpStream in0_V;"
+        ]
+
+    # Generates C++ code for calling the computation part of the operator
+    def docompute(self):
+        # Get the folded shapes of all tensors involved without PE axis
+        inp_shape = self.get_folded_input_shape(ind=0)[:-1]
+        out_shape = self.get_folded_output_shape(ind=0)[:-1]
+
+        # Code generation of array index strings
+        def make_index_string(shape):
+            # Generate index operation [i]
+            return "".join([f"[i{d}]" for d in range(len(shape))])
+
+        inp_index = make_index_string(inp_shape)
+
+        # Generate C++ code for declaring an array of the buffer shapes
+        inp_shape = "".join([f"[{size}]" for size in inp_shape])
+
+        # Number of dimensions of the output. All shapes will be
+        # aligned to this number of dimensions.
+        # Note: +1 for the PE dimension
+        ndim = len(out_shape) + 1
+
+        # For-Loop template for nested loops over arbitrary many levels
+        def for_loop(level, size):
+            return f"for(std::size_t i{level} = 0; i{level}<{size}; ++i{level})"
+
+        # Type of memory to use for the local input buffer
+        ram_style = RAM_STYLES[self.get_nodeattr("ram_style")]
+
+        # Write the body of the top-level function
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            # @formatter:off  Disable formatter for mixed Python and C++
+            # Local buffer holding the elements read from the input stream,
+            # declared with the folded input shape followed by the PE axis.
+            # Implemented as a simple dual-port RAM (memory type selected by
+            # the ram_style attribute) which is completely partitioned along
+            # the last, i.e., the PE, axis for parallel access.
+            f"""
+            InpType inp{inp_shape}[{self.pe}];
+            #pragma HLS ARRAY_PARTITION variable=inp complete dim={ndim}
+            #pragma HLS BIND_STORAGE variable=inp type=RAM_S2P impl={ram_style}
+            """,
+            # Buffer to hold the parallel output elements: Implement a simple
+            # dual-port RAM for the output buffer, partitioned on the last,
+            # i.e., the PE, axis for parallel access.
+            # Note: The PE output should be rather small, force this into
+            # distributed memory here.
+            # TODO: Maybe reconsider this later?
+            f"""
+            OutType out[{self.pe}];
+            #pragma HLS ARRAY_PARTITION variable=out complete dim=1
+            #pragma HLS BIND_STORAGE variable=out type=RAM_S2P impl=LUTRAM
+            """,
+            # Perfect loop nest over all folded output dimensions
+            *[for_loop(dim, size) + " {" for dim, size in enumerate(out_shape)],
+            # Pipeline the loops. This should be possible as there is no code
+            # between the loop levels, i.e., this is a perfect loop nest.
+            """
+            #pragma HLS pipeline II=1 style=flp
+            """,
+            # Read from the input stream
+            f"""
+            const auto buffer = Slice<InpType>{{}}(
+                in0_V.read()
+            );
+            for(std::size_t pe = 0; pe < {self.pe}; ++pe) {{
+            #pragma HLS unroll
+                inp{inp_index}[pe] = buffer(pe, 0);
+            }}
+            """,
+            # Apply PE parallel elementwise operations by filling the operation
+            # template
+            f"""
+            for(std::size_t pe = 0; pe < {self.pe}; ++pe) {{
+            #pragma HLS unroll
+                out[pe] = {self.cpp_op.format(
+                    f"inp{inp_index}[pe]"
+                )};
+            }}
+            """,
+            # Write the PE group into the output stream
+            """
+            out0_V.write(flatten(out));
+            """,
+            # Close all for-loop bodies of the generated nest
+            *["}" for _ in enumerate(out_shape)]
+            # @formatter:on  End of code generation
+        ]
+
+        # Post-process the generated code to remove unnecessary white space
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            textwrap.dedent(code) for code in self.code_gen_dict["$DOCOMPUTE$"]
+        ]
+
+    # Generates C++ code for reading the output stream and converting back to
+    # numpy format for testing in C++ simulation
+    def dataoutstrm(self):
+        # Output data will be stored in numpy files in the code generation
+        # directory
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        # Get the expected shape of the folded output array formatted as a C++
+        # vector initializer
+        # Note: Valid formatting relies on correct placement of curly braces
+        # and line breaks: Open/close all three braces on the same line of code
+        # to avoid '\n' to be inserted into the string
+        shape = f"""{{{
+        ','.join((str(i) for i in self.get_folded_output_shape(ind=0)))
+        }}}"""
+        # Generate function call for reading from the output stream into the
+        # output file
+        npy_type = "half" if self.out_dtype.get_hls_datatype_str() == "half" else "float"
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            # Generate function call reading from stream into the output file
+            #   Note: Outputs can be numpy floats or halfs
+            f"apintstream2npy<OutPacked, OutType, OutWidth, {npy_type}>(",
+            f'out0_V, {shape}, "{code_gen_dir}/output_0.npy", false',
+            ");",
+        ]
+
+    # Generates C++ code for saving the output of C++ simulation to a file in
+    # numpy format
+    def save_as_npy(self):
+        # Note: This seems to be empty in ALL HLSBackends. Probably it was used
+        # for something before, which is now integrated into dataoutstrm()?
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    # Generates essentially the head of the C++ function from which the IP block
+    # will be generated during ipgen, i.e. actual synthesis
+    def blackboxfunction(self):
+        # Insert function head describing the top level interface of the
+        # elementwise function operator: one input and one output stream
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            # Note: Assumes stream type aliases to be set in defines
+            f"void {self.onnx_node.name} (",
+            "  InpStream &in0_V,",
+            "  OutStream &out0_V",
+            ")",
+        ]
+
+    # Generates C++ pragmas to be inserted into the main function of the C++
+    # simulation and the ipgen-blackboxfunction as well
+    def pragmas(self):
+        # Check whether there are already pragmas in the code generation
+        # dictionary
+        if "$PRAGMAS$" not in self.code_gen_dict:
+            # If not, insert an empty list to collect more pragmas
+            self.code_gen_dict["$PRAGMAS$"] = []
+
+        # Add HLS interface directives specifying how to create RTL ports for
+        # the top-level function arguments
+        self.code_gen_dict["$PRAGMAS$"] += [
+            # Connect the output stream with an axi stream interface
+            "#pragma HLS INTERFACE axis port=out0_V",
+        ]
+        # Connect the input stream with an axi stream interface
+        self.code_gen_dict["$PRAGMAS$"] += [
+            "#pragma HLS INTERFACE axis port=in0_V",
+        ]
+
+        # No block-level I/O protocol for the function return value
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
+
+    # Returns the names of input and output interfaces grouped by protocol
+    def get_verilog_top_module_intf_names(self):
+        # Start collecting interface names in a dictionary starting with clock
+        # and reset
+        intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"]}
+        # AXI stream input interfaces
+        intf_names["s_axis"] = [("in0_V", self.get_instream_width_padded(ind=0))]
+        # AXI stream output interfaces
+        intf_names["m_axis"] = [("out0_V", self.get_outstream_width_padded(ind=0))]
+        # No AXI-MM, AXI-Lite or protocol-less interfaces
+        intf_names["aximm"] = []
+        intf_names["axilite"] = []
+        intf_names["ap_none"] = []
+        # Return the interface name dictionary
+        return intf_names
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        if mode == "cppsim":
+            HLSBackend.execute_node(self, context, graph)
+        elif mode == "rtlsim":
+            # rtlsim execution needs to be overridden here because the HLS code
+            # is dynamically generated which results in different interfaces
+            # Get the node wrapped by this custom op
+            node = self.onnx_node
+            # Input data is stored in numpy files in the code generation directory
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            # Get the inputs out of the execution context
+            inp = context[node.input[0]]
+            # Validate the shape of the inputs
+            assert list(inp.shape) == self.get_normal_input_shape(
+                ind=0
+            ), f"Input shape mismatch for {node.input[0]}"
+            # Reshape the inputs into folded form
+            inp = inp.reshape(self.get_folded_input_shape(ind=0))
+            # Path to store the intermediate inputs in numpy format
+            inp_filename = os.path.join(code_gen_dir, "input_0.npy")
+            # Save the folded inputs to file to be used by simulation
+            np.save(inp_filename, inp)
+            # Start collecting inputs/outputs to the RTL simulation in a dictionary
+            # Note: Prepare one empty output list
+            io_dict = {"inputs": {}, "outputs": {"out0": []}}
+            # Type and width of the input tensors
+            inp_dtype = self.get_input_datatype(ind=0)
+            inp_width = self.get_instream_width(ind=0)
+
+            # Convert inputs to RTL simulation format
+            io_dict["inputs"]["in0"] = npy_to_rtlsim_input(inp_filename, inp_dtype, inp_width)
+
+            # Set up the RTL simulation of the node
+            sim = self.get_rtlsim()
+            # Reset the RTL simulation; finnxsi toggles the clock
+            super().reset_rtlsim(sim)
+            # Run the RTL Simulation
+            self.rtlsim_multi_io(sim, io_dict)
+
+            # Collect the output from RTL simulation
+            out = io_dict["outputs"]["out0"]
+            # Type and sizes of the output tensor
+            dtype = self.get_output_datatype(ind=0)
+            width = self.get_outstream_width(ind=0)
+            shape = self.get_folded_output_shape(ind=0)
+            # Path to store the intermediate numpy file
+            filename = os.path.join(code_gen_dir, "output_0.npy")
+            # Convert from RTL simulation format to numpy format
+            rtlsim_output_to_npy(out, filename, dtype, shape, width, dtype.bitwidth())
+            # Load the generated output numpy file
+            out = np.load(filename)
+            # Reshape the folded output and insert into the execution context
+            context[node.output[0]] = out.reshape(self.get_normal_output_shape(ind=0))
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+
+# HLS backend specialization of the elementwise function operation,
+# extended to include hls_math.h and link the floating-point math libraries
+class ElementwiseMathFunctionOperation_hls(ElementwiseFunctionOperation_hls):
+    # Generates list of C++ includes to be placed at the top of the generated
+    # code
+    def global_includes(self):
+        super().global_includes()
+        # additional hls_math include
+        self.code_gen_dict["$GLOBALS$"] += ['#include "hls_math.h"']
+
+    def compile_singlenode_code(self):
+        """Builds the bash script for compilation using the CppBuilder from
+        finn.util.basic and executes the script to produce the executable."""
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        builder = CppBuilder()
+        # to enable additional debug features please uncomment the next line
+        # builder.append_includes("-DDEBUG")
+        builder.append_includes("-I$FINN_ROOT/src/finn/qnn-data/cpp")
+        builder.append_includes("-I$FINN_ROOT/deps/cnpy/")
+        builder.append_includes("-I$FINN_ROOT/deps/finn-hlslib")
+        builder.append_includes("-I$FINN_ROOT/custom_hls")
+        builder.append_includes("-I{}/include".format(os.environ["HLS_PATH"]))
+        builder.append_includes("-I{}/include".format(os.environ["VITIS_PATH"]))
+        builder.append_includes("--std=c++14")
+        builder.append_includes("-O3")
+        builder.append_sources(code_gen_dir + "/*.cpp")
+        builder.append_sources("$FINN_ROOT/deps/cnpy/cnpy.cpp")
+        builder.append_includes("-lz")
+        builder.append_includes('-Wl,-rpath,"$HLS_PATH/lnx64/lib/csim"')
+        builder.append_includes("-L$HLS_PATH/lnx64/lib/csim -lhlsmc++-GCC46")
+        builder.append_includes('-Wl,-rpath,"$HLS_PATH/lnx64/tools/fpo_v7_1"')
+        builder.append_includes("-L$HLS_PATH/lnx64/tools/fpo_v7_1 -lgmp -lmpfr")
+        builder.append_includes("-lIp_floating_point_v7_1_bitacc_cmodel")
+        builder.set_executable_path(code_gen_dir + "/node_model")
+        builder.build(code_gen_dir)
+        self.set_nodeattr("executable_path", builder.executable_path)
+
+
+# Derive a specialization to implement elementwise relu of the input
+@register_custom_op
+class ElementwiseRelu_hls(ElementwiseFunctionOperation_hls, elementwise_functions.ElementwiseRelu):
+    pass
+
+
+# Derive a specialization to implement elementwise exponent of the input
+@register_custom_op
+class ElementwiseExp_hls(
+    ElementwiseMathFunctionOperation_hls, elementwise_functions.ElementwiseExp
+):
+    pass
+
+
+# Derive a specialization to implement elementwise erf of the input
+@register_custom_op
+class ElementwiseErf_hls(
+    ElementwiseMathFunctionOperation_hls, elementwise_functions.ElementwiseErf
+):
+    pass
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index 24806490c8..7044b298aa 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -46,6 +46,9 @@
 # Module containing specializations of elementwise binary operations
 import finn.custom_op.fpgadataflow.elementwise_binary as elementwise_binary
 
+# Module containing specializations of elementwise function operations
+import finn.custom_op.fpgadataflow.elementwise_functions as elementwise_functions
+
 
 class InferConvInpGen(Transformation):
     """Convert Im2Col layers to ConvolutionInputGenerator layers."""
@@ -2014,29 +2017,11 @@ def apply(self, model: ModelWrapper):  # noqa
         return model, graph_modified
 
 
-# Converts ReLU into ElementwiseMaximum(in, 0)
-class InferReLUAsElementwiseMax(Transformation):
-    # Filter function to filter out any operation involving any floating-point
-    # tensor
-    @staticmethod
-    def reject_unsupported_dtypes(model: ModelWrapper, node: NodeProto):
-        def dtype_ok(tname):
-            dt = model.get_tensor_datatype(tname)
-            if dt is None:
-                return False
-            if (
-                dt.is_integer()
-                or dt.is_fixed_point()
-                or dt in [DataType["FLOAT32"], DataType["FLOAT16"]]
-            ):
-                return True
-            else:
-                return False
-
-        return all([dtype_ok(tname) for tname in list(node.input) + list(node.output)])
-
+# Converts supported elementwise function operations to their FINN custom
+# operation
+class InferElementwiseFunctionOperation(Transformation):
     # Initializes the transformation method with an optional filter function
-    def __init__(self, _filter=reject_unsupported_dtypes):
+    def __init__(self, _filter=None):
         # Initialize the base class Transformation object
         super().__init__()
         # Register the filter function as attribute
@@ -2053,38 +2038,37 @@ def apply(self, model: ModelWrapper):  # noqa
             # Skip transforming nodes rejected by the filter
             if not self._filter(model, node):
                 continue
-            if node.op_type == "Relu":
+            # If a custom operation with corresponding name is implemented in
+            # the module, this operator is supported for conversion
+            if f"Elementwise{node.op_type}" in dir(elementwise_functions):
                 inp = node.input[0]
-                # add a second 0-valued input for ReLU
-                new_tname = model.make_new_valueinfo_name()
-                model.set_initializer(new_tname, np.asarray(0.0, dtype=np.float32))
-                # comparison of fp16 and uint2 is not possible in HLS
-                new_tdtype = (
-                    "FLOAT16"
-                    if model.get_tensor_datatype(inp).get_canonical_name() == "FLOAT16"
-                    else "UINT2"
-                )
-                # for the constant 0 input, use a small-width datatype
-                # (to avoid unnecessarily promoting output type to something larger)
-                model.set_tensor_datatype(new_tname, DataType[new_tdtype])
+                # if input is a constant, throw an error and
+                # ask user to run FoldConstants transform first
+                assert (
+                    model.get_initializer(inp) is None
+                ), """Input is a constant,
+                    please run FoldConstants from qonnx.transformation.fold_constants first."""
                 result = node.output[0]
 
                 # Need to "lift" potential scalar inputs to rank-1 tensors
                 lift_to_rank1(inp, model)
-                lift_to_rank1(new_tname, model)
+
+                inp_shape = model.get_tensor_shape(inp)
+                out_shape = model.get_tensor_shape(result)
+
+                idt0 = model.get_tensor_datatype(inp)
+                odt0 = model.get_tensor_datatype(result)
 
                 new_node = helper.make_node(
-                    "ElementwiseMax",
-                    [inp, new_tname],
+                    f"Elementwise{node.op_type}",
+                    [inp],
                     [result],
                     domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
-                    lhs_shape=model.get_tensor_shape(inp),
-                    rhs_shape=model.get_tensor_shape(new_tname),
-                    out_shape=model.get_tensor_shape(result),
-                    lhs_dtype=str(model.get_tensor_datatype(inp)),
-                    rhs_dtype=str(model.get_tensor_datatype(new_tname)),
-                    out_dtype=str(model.get_tensor_datatype(result)),
+                    inp_shape=inp_shape,
+                    out_shape=out_shape,
+                    inp_dtype=str(idt0),
+                    out_dtype=str(odt0),
                 )
                 graph.node.insert(index + 1, new_node)
                 graph.node.remove(node)
diff --git a/tests/fpgadataflow/test_fpgadataflow_relu_elementwisemax.py b/tests/fpgadataflow/test_fpgadataflow_elementwise_functions.py
similarity index 67%
rename from tests/fpgadataflow/test_fpgadataflow_relu_elementwisemax.py
rename to tests/fpgadataflow/test_fpgadataflow_elementwise_functions.py
index 17ce600989..829c5952ef 100644
--- a/tests/fpgadataflow/test_fpgadataflow_relu_elementwisemax.py
+++ b/tests/fpgadataflow/test_fpgadataflow_elementwise_functions.py
@@ -29,6 +29,7 @@
 import pytest
 
 import numpy as np
+import scipy.special
 from onnx import TensorProto
 from onnx import helper as oh
 from qonnx.core.datatype import DataType
@@ -42,7 +43,7 @@
 from finn.core.onnx_exec import execute_onnx
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.convert_to_hw_layers import (
-    InferReLUAsElementwiseMax,
+    InferElementwiseFunctionOperation,
 )
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.minimize_accumulator_width import (
@@ -57,65 +58,92 @@
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
 
-
-# Creates a model executing a ReLU operation
-def create_relu_model_onnx(inp_dtype, inp_shape):
-    # Create a node representing the binary elementwise operation
+# Mapping of ElementwiseFunctionOperation specializations to numpy reference
+# implementation functions
+NUMPY_REFERENCES = {
+    "ElementwiseRelu": lambda x: np.maximum(x, 0),
+    "ElementwiseExp": np.exp,
+    "ElementwiseErf": scipy.special.erf,
+}
+
+
+# Creates a model executing an elementwise function operation
+def create_elementwise_function_operation_onnx(op_type, inp_dtype, out_dtype, inp_shape):
+    # Strip the "Elementwise" prefix to obtain the op_type of the ONNX operator
+    onnx_op_type = op_type[11:]
+    # Automatically derive the output shape
+    out_shape = inp_shape
+    # Create a node representing the elementwise operation
     node = oh.make_node(
-        op_type="Relu",
+        op_type=onnx_op_type,
         inputs=["inp"],
         outputs=["out"],
     )
     if inp_dtype == "FLOAT16":
         inp = oh.make_tensor_value_info("inp", TensorProto.FLOAT16, inp_shape)
-        out = oh.make_tensor_value_info("out", TensorProto.FLOAT16, inp_shape)
     else:
         inp = oh.make_tensor_value_info("inp", TensorProto.FLOAT, inp_shape)
-        out = oh.make_tensor_value_info("out", TensorProto.FLOAT, inp_shape)
+    if out_dtype == "FLOAT16":
+        out = oh.make_tensor_value_info("out", TensorProto.FLOAT16, out_shape)
+    else:
+        out = oh.make_tensor_value_info("out", TensorProto.FLOAT, out_shape)
     # Create a graph connecting the node to the inputs and outputs
-    graph = oh.make_graph([node], inputs=[inp], outputs=[out], name="relu-eltwisemax")
-    model = ModelWrapper(qonnx_make_model(graph, producer_name="relu-eltwisemax"))
+    graph = oh.make_graph([node], inputs=[inp], outputs=[out], name="elementwise-function")
+    model = ModelWrapper(qonnx_make_model(graph, producer_name="elementwise-function"))
 
-    # Add datatype annotation to the value info of tensors
+    # Add datatype annotation to the value info of input and output tensors
     model.set_tensor_datatype("inp", DataType[inp_dtype])
-    model.set_tensor_datatype("out", DataType[inp_dtype])
+    model.set_tensor_datatype("out", DataType[out_dtype])
 
     return model
 
 
+# Operator type to be tested
+@pytest.mark.parametrize(
+    "op_type",
+    [
+        # Test all Numpy references specified above
+        *NUMPY_REFERENCES.keys()
+    ],
+)
 # Data type of the input elements
-@pytest.mark.parametrize("inp_dtype", ["INT8", "FLOAT32", "FLOAT16", "FIXED<8,3>"])
+@pytest.mark.parametrize(
+    "inp_dtype",
+    ["FLOAT32", "FLOAT16", "INT6", "FIXED<8,3>"],
+)
 # Shape of the input
-@pytest.mark.parametrize("inp_shape", [[4], [3, 32, 1, 16]])
+@pytest.mark.parametrize("inp_shape", [[8]])
 # Number of elements to process in parallel
-@pytest.mark.parametrize("pe", [1, 2, 4])
+@pytest.mark.parametrize("pe", [1, 2])
 # Exec mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_relu_elementwisemax(inp_dtype, inp_shape, pe, exec_mode):
+def test_elementwise_function_operation(op_type, inp_dtype, inp_shape, pe, exec_mode):
+    if not op_type.endswith("Relu"):
+        if not inp_dtype.startswith("FLOAT"):
+            pytest.skip("Non-float inputs are not yet supported for functions except Relu.")
+    out_dtype = inp_dtype
     # Make dummy model for testing
-    model = create_relu_model_onnx(inp_dtype, inp_shape)
+    model = create_elementwise_function_operation_onnx(op_type, inp_dtype, out_dtype, inp_shape)
     # Prepare the execution context
-    context = {"inp": gen_finn_dt_tensor(DataType[inp_dtype], inp_shape)}
-    # Compute ground-truth output in software
-    o_ref = np.maximum(context["inp"], 0)
+    context = {
+        "inp": gen_finn_dt_tensor(DataType[inp_dtype], inp_shape),
+    }
+
+    # Get the numpy reference implementation for this operation
+    numpy_reference = NUMPY_REFERENCES[op_type]
 
     # Test running shape and data type inference on the model graph
     model = model.transform(InferDataTypes())
     model = model.transform(InferShapes())
 
     # Specializes all nodes to be implemented as HLS backend
-    model = model.transform(InferReLUAsElementwiseMax())
+    model = model.transform(InferElementwiseFunctionOperation())
 
     assert len(model.graph.node) == 1
-    assert model.graph.node[0].op_type == "ElementwiseMax"
-    # Execute the onnx model to collect the result
-    o_hw = execute_onnx(model, context)["out"]
-
-    # Compare the expected to the produced for exact equality
-    assert np.all(o_hw == o_ref)
+    assert model.graph.node[0].op_type == f"{op_type}"
 
     # Test running shape and data type inference on the model graph
     model = model.transform(InferDataTypes())
@@ -125,7 +153,7 @@ def test_relu_elementwisemax(inp_dtype, inp_shape, pe, exec_mode):
     model = model.transform(SpecializeLayers("xczu7ev-ffvc1156-2-e"))
 
     assert len(model.graph.node) == 1
-    assert model.graph.node[0].op_type == "ElementwiseMax_hls"
+    assert model.graph.node[0].op_type == f"{op_type}_hls"
 
     getCustomOp(model.graph.node[0]).set_nodeattr("PE", pe)
 
@@ -143,8 +171,17 @@ def test_relu_elementwisemax(inp_dtype, inp_shape, pe, exec_mode):
         model = model.transform(HLSSynthIP())
         model = model.transform(PrepareRTLSim())
 
+    # Compute ground-truth output in software
+    inp = context["inp"]
+
+    o_expected = numpy_reference(inp)
     # Execute the onnx model to collect the result
-    o_sim = execute_onnx(model, context)["out"]
+    o_produced = execute_onnx(model, context)["out"]
 
-    # Compare the expected to the produced for exact equality
-    assert np.all(o_sim == o_ref)
+    if op_type.endswith("Relu"):
+        assert np.all(o_expected == o_produced)
+    else:
+        if inp_dtype == "FLOAT16":
+            assert np.allclose(o_expected, o_produced, rtol=1e-3, atol=2**-13)
+        else:
+            assert np.allclose(o_expected, o_produced)