diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index b74bbf538d..5448c7bdb7 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -52,6 +52,7 @@ def register_custom_op(cls): # Import the submodule containing specializations of ElementwiseBinaryOperation # Note: This will automatically register all decorated classes into this domain import finn.custom_op.fpgadataflow.elementwise_binary +import finn.custom_op.fpgadataflow.elementwise_functions from finn.custom_op.fpgadataflow.addstreams import AddStreams from finn.custom_op.fpgadataflow.channelwise_op import ChannelwiseOp from finn.custom_op.fpgadataflow.concat import StreamingConcat diff --git a/src/finn/custom_op/fpgadataflow/elementwise_functions.py b/src/finn/custom_op/fpgadataflow/elementwise_functions.py new file mode 100644 index 0000000000..7ac39b06da --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/elementwise_functions.py @@ -0,0 +1,348 @@ +# Copyright (C) 2025, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import warnings +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper + +from finn.custom_op.fpgadataflow import register_custom_op +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +# Generic implementation for elementwise function operations +class ElementwiseFunctionOperation(HWCustomOp): + # Specifies the elementwise operation to be implemented + # Format: (Identifier, Python, C++, RTL) + _operation: tuple[str, np.ufunc, str, str] | None = None + + # Numpy operation available as property + @property + def npy_op(self) -> np.ufunc: + return self._operation[1] + + # C++ operation template available as property + @property + def cpp_op(self) -> str: + return self._operation[2] + + # RTL operation template available as property + @property + def rtl_op(self) -> str: + return self._operation[3] + + # Initializes the operator given an onnx graph node + def __init__(self, onnx_node, **kwargs): + # Just forward all arguments to the init method of the CustomOp base + super().__init__(onnx_node, **kwargs) + + # Defines attributes which must be present on this node + def get_nodeattr_types(self): + # Start from parent 
operator class attributes + attrs = HWCustomOp.get_nodeattr_types(self) + # Update attributes dictionary for new custom operator + attrs.update( + { + # Data type of the input elements + "inp_dtype": ("s", True, ""), + # Data type of the output elements + "out_dtype": ("s", True, ""), + # Shape of the input + "inp_shape": ("ints", True, [1]), + # Shape of the output, must be equal to the input shape + "out_shape": ("ints", True, [1]), + # Number of elements in the last dimensions processed in parallel + "PE": ("i", False, 1), + # FPGA resource type for memories/internal buffers of the operator + "ram_style": ("s", False, "auto", {"auto", "block", "distributed", "ultra"}), + # memory mode for the const value + # internal_embedded -- embedded parameters + # internal_decoupled -- streaming parameters with streamer packaged inside IP + "mem_mode": ( + "s", + False, + "internal_embedded", + {"internal_embedded", "internal_decoupled"}, + ), + # Input and output FIFO depths for multi-I/O nodes + "inFIFODepths": ("ints", False, [2]), + "outFIFODepths": ("ints", False, [2]), + } + ) + # Return updated attribute dictionary + return attrs + + # Datatype attribute as property for convenience + @property + def inp_dtype(self): + # Note: Converts from string to QONNX data type + return DataType[self.get_nodeattr("inp_dtype")] + + # Datatype attribute as property for convenience + @property + def out_dtype(self): + # Note: Converts from string to QONNX data type + return DataType[self.get_nodeattr("out_dtype")] + + # Shape attribute as property for convenience + @property + def inp_shape(self): + return self.get_nodeattr("inp_shape") + + # Shape attribute as property for convenience + @property + def out_shape(self): + return self.get_nodeattr("out_shape") + + # Number of parallel processed elements as property for convenience + @property + def pe(self): + return self.get_nodeattr("PE") + + # Infers the datatype of the node output + def infer_node_datatype(self, model: 
ModelWrapper): + # Get the node wrapped by this custom op + node = self.onnx_node + # Test for changing input datatype + if model.get_tensor_datatype(node.input[0]) != self.inp_dtype: + # Get the new datatype + new_dtype = model.get_tensor_datatype(node.input[0]) + # Issue a warning message + warnings.warn(f"{node.name}: inp_dtype changing from {self.inp_dtype} to {new_dtype}") + # Set the new datatype attribute + self.set_nodeattr("inp_dtype", new_dtype.name) + # Force the output data type stored as a node attribute + model.set_tensor_datatype(node.output[0], self.out_dtype) + + def execute_node(self, context, graph): + # Get the node wrapped by this custom op + node = self.onnx_node + # Get the inputs out of the execution context + inp = context[node.input[0]] + # Note: Need to make sure these have the right type for the Numpy API + # Note: Always simulate integer inputs in int64, numpy casting is + # weird.... + inp = inp.astype(np.int64) if self.inp_dtype.is_integer() else inp + # Apply elementwise operation in numpy and insert + # result into the execution context + out = self.npy_op(inp) + # Make sure the output has the right type, e.g. turn all booleans into + # integers (actually floats as the container type) + # Note: This is relevant for logical ops, ==, <=, >=, etc. + # Note: Somehow QONNX does not like boolean tensors + # context[node.output[0]] = out.astype(self.out_dtype.to_numpy_dt()) + # TODO: Apparently it is not? Verify this behavior... 
+ context[node.output[0]] = out.astype(np.float32) + + # Note: End of QONNX CustomOp region, below is FINN HWCustomOp stuff + + # Gets the datatype of input at index ind + def get_input_datatype(self, ind=0): + # There is only one input + return self.inp_dtype + + # Gets the datatype of the output at index ind + def get_output_datatype(self, ind=0): + # There is only one output, the type is set as an attribute + return self.out_dtype + + # Gets the shape of the input at index ind without folding + def get_normal_input_shape(self, ind=0): + # Input shape is stored as a node attribute + return self.inp_shape + + # Gets the shape of the output at index ind without folding + def get_normal_output_shape(self, ind=0): + # The output shape is stored as a node attribute + return self.out_shape + + # Gets the shape of the input at index ind with folding + def get_folded_input_shape(self, ind=0): + # Get the normal shape before applying folding + *num_inputs, num_elems = self.get_normal_input_shape(ind=ind) + # Valid folding requires the PE to divide the number of elements + assert num_elems % self.pe == 0, "PE must divide last axis" + # Folding along the last dimension + return *num_inputs, num_elems // self.pe, self.pe + + # Gets the shape of the output at index ind with folding + def get_folded_output_shape(self, ind=0): + # Get the normal shape before applying folding + *num_inputs, num_elems = self.get_normal_output_shape(ind=ind) + # Valid folding requires the PE to divide the number of elements + assert num_elems % self.pe == 0, "PE must divide last axis" + # Folding along the last dimension + return *num_inputs, num_elems // self.pe, self.pe + + # Widths of the input data stream of the input at index ind + def get_instream_width(self, ind=0): + # Get the number of bits used to represent the input + i_bits = self.get_input_datatype(ind).bitwidth() + # Parallelism is the number of elements in the last dimension of the + # folded input + *_, elems = 
self.get_folded_input_shape(ind) + # Width of a stream receiving input elements in parallel + return elems * i_bits + + # Widths of the output data stream of the output at index ind + def get_outstream_width(self, ind=0): + # Get the number of bits used to represent the output + o_bits = self.get_output_datatype(ind).bitwidth() + # Parallelism is the number of elements in the last dimension of the + # folded output + *_, elems = self.get_folded_output_shape(ind) + # Width of a stream producing output elements in parallel + return elems * o_bits + + # Minimizes the width of the accumulator data type, 'accumulator width' here + # due to convention, it is actually the output data type + def minimize_accumulator_width(self, model: ModelWrapper): + # If the input is not an integer, the bit-width cannot be + # minimized + if not self.inp_dtype.is_integer(): + # Check the annotated tensor data type corresponds to the stored + # attribute + assert ( + model.get_tensor_datatype(self.onnx_node.output[0]) == self.out_dtype + ), f"Output type mismatch for {self.onnx_node.name}" + # Exit here, returning the not-minimized data type + return self.out_dtype + # Call the output type derivation specialized by the concrete operator + # implementation + out_dtype = self._derive_out_dtype(model) + # Set the new output data type as attribute + self.set_nodeattr("out_dtype", out_dtype.name) + # Annotate the output tensor with the new data type + model.set_tensor_datatype(self.onnx_node.output[0], out_dtype) + # Return the minimized output data type + # Note: Probably not required by MinimizeAccumulatorWidth transformation + return out_dtype + + # Derives the optimal width of the output data type + def _derive_out_dtype(self, model: ModelWrapper): + # Depends on the actual operation performed and must be specialized by + # the concrete implementations + raise NotImplementedError( + f"_derive_out_dtype of {self.__class__.__name__} is not implemented!" 
+ ) + + # Derives the expected cycles for the elementwise operation given the + # folding configuration + def get_exp_cycles(self): + # Number of iterations required to process the whole folded input stream + # Note: This is all but the PE (last, parallelized) dimension + return np.prod(self.get_folded_output_shape()[:-1]) + + +# Derive a specialization to implement the Relu activation function +@register_custom_op +class ElementwiseRelu(ElementwiseFunctionOperation): + @property + def npy_op(self): + def relu(x): + return np.maximum(x, 0) + + return relu + + @property + def cpp_op(self): + odt_hls_name = self.out_dtype.get_hls_datatype_str() + return "({0} > 0 ? (%s){0} : (%s)0)" % (odt_hls_name, odt_hls_name) + + @property + def rtl_op(self): + return None + + def _derive_out_dtype(self, model: ModelWrapper): + if self.inp_dtype.is_integer(): + inp_bw = self.inp_dtype.bitwidth() + # The output would be unsigned with same bit-width as input + # if input was unsigned, else one bit less + out_bw = inp_bw - 1 if self.inp_dtype.signed() else inp_bw + return DataType[f"UINT{out_bw}"] + + # output datatype is input datatype for all other data-formats + return self.inp_dtype + + +# Derive a specialization to implement elementwise exponent of the input +@register_custom_op +class ElementwiseExp(ElementwiseFunctionOperation): + @property + def npy_op(self): + return np.exp + + @property + def cpp_op(self): + # TODO: extend to fixed-point datatypes + assert self.out_dtype.get_canonical_name().startswith("FLOAT") + odt_hls_name = self.out_dtype.get_hls_datatype_str() + # Explicitly use the overloads, using hls::exp results in minor errors + if self.out_dtype.get_canonical_name() == "FLOAT32": + return "(hls::expf((%s){0}))" % (odt_hls_name) + elif self.out_dtype.get_canonical_name() == "FLOAT16": + return "(hls::half_exp((%s){0}))" % (odt_hls_name) + + @property + def rtl_op(self): + return None + + def _derive_out_dtype(self, model: ModelWrapper): + if 
self.inp_dtype.get_canonical_name() == "FLOAT16": + return DataType["FLOAT16"] + return DataType["FLOAT32"] + + +# Derive a specialization to implement elementwise erf of the input +@register_custom_op +class ElementwiseErf(ElementwiseFunctionOperation): + @property + def npy_op(self): + import scipy.special + + return scipy.special.erf + + @property + def cpp_op(self): + # TODO: extend to fixed-point datatypes + assert self.out_dtype.get_canonical_name().startswith("FLOAT") + odt_hls_name = self.out_dtype.get_hls_datatype_str() + # Explicitly use the overloads, using hls::erf results in minor errors + if self.out_dtype.get_canonical_name() == "FLOAT32": + return "(hls::erff((%s){0}))" % (odt_hls_name) + elif self.out_dtype.get_canonical_name() == "FLOAT16": + return "(hls::half_erf((%s){0}))" % (odt_hls_name) + + @property + def rtl_op(self): + return None + + def _derive_out_dtype(self, model: ModelWrapper): + if self.inp_dtype.get_canonical_name() == "FLOAT16": + return DataType["FLOAT16"] + return DataType["FLOAT32"] diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index e80a581b57..01aa1e1c5a 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -53,6 +53,7 @@ def register_custom_op(cls): # Import the submodule containing specializations of ElementwiseBinaryOperation # Note: This will automatically register all decorated classes into this domain import finn.custom_op.fpgadataflow.hls.elementwise_binary_hls +import finn.custom_op.fpgadataflow.hls.elementwise_functions_hls from finn.custom_op.fpgadataflow.hls.addstreams_hls import AddStreams_hls from finn.custom_op.fpgadataflow.hls.channelwise_op_hls import ChannelwiseOp_hls from finn.custom_op.fpgadataflow.hls.checksum_hls import CheckSum_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/elementwise_functions_hls.py b/src/finn/custom_op/fpgadataflow/hls/elementwise_functions_hls.py 
new file mode 100644 index 0000000000..f11b30d6e5 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/elementwise_functions_hls.py @@ -0,0 +1,439 @@ +# Copyright (C) 2025, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import numpy as np +import os +import textwrap + +import finn.custom_op.fpgadataflow.elementwise_functions as elementwise_functions +from finn.custom_op.fpgadataflow.elementwise_functions import ( + ElementwiseFunctionOperation, +) +from finn.custom_op.fpgadataflow.hls import register_custom_op +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.util.basic import CppBuilder +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +# Mapping of memory resource attributes to the corresponding C++ HLS +# pragma directives +RAM_STYLES = {"auto": "AUTO", "block": "BRAM", "distributed": "LUTRAM", "ultra": "URAM"} + + +# HLS Backend specialization of the elementwise function operation operator +class ElementwiseFunctionOperation_hls( + # CapWords convention + ElementwiseFunctionOperation, + HLSBackend, +): + # Node attributes matching the HLS operator + def get_nodeattr_types(self): + # Start from parent operator class attributes + attrs = ElementwiseFunctionOperation.get_nodeattr_types(self) + # Add the HLSBackend default attributes on top + attrs.update(HLSBackend.get_nodeattr_types(self)) + # Add/Specialize implementation specific attributes here... 
+ # Return the updated attributes dictionary + return attrs + + # Maximum width of any ap_int used in this operator + def get_ap_int_max_w(self): + # Find the width of the input + i_bits_max = self.get_instream_width(ind=0) + # Width of the output, there is just one output + # Note: there is one output per replica + o_bits_max = self.get_outstream_width(ind=0) + # Find the biggest of the inputs/outputs + return max([i_bits_max, o_bits_max]) + + # Note: End of shape and datatype utilities + + def code_generation_ipgen(self, model, fpgapart, clk): + """Generates c++ code and tcl script for ip generation.""" + super().code_generation_ipgen(model, fpgapart, clk) + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "internal_decoupled": + self.generate_hdl_memstream(fpgapart) + + # Generates list of C++ includes to be placed at the top of the generated + # code + def global_includes(self): + # Include the flatten.hpp header + self.code_gen_dict["$GLOBALS$"] = ['#include "flatten.hpp"'] + + # Generates C++ code of type alias, global constant and macro definitions + def defines(self, var): + # Insert constants and type aliases into the dictionary + self.code_gen_dict["$DEFINES$"] = [ + # Input and output element datatypes + f"using InpType = {self.inp_dtype.get_hls_datatype_str()};", + f"using OutType = {self.out_dtype.get_hls_datatype_str()};", + # Width of single elements to avoid using ::width attribute which is + # not present for datatype float + f"static constexpr auto InpWidth = {self.inp_dtype.bitwidth()};", + f"static constexpr auto OutWidth = {self.out_dtype.bitwidth()};", + # Datatype of elements packed into the input stream + f"using InpPacked = ap_uint<{self.get_instream_width(ind=0)}>;", + # Datatype of elements packed into the output stream + f"using OutPacked = ap_uint<{self.get_outstream_width(ind=0)}>;", + # Input and output HLS stream datatypes + "using InpStream = hls::stream<InpPacked>;", + "using OutStream = hls::stream<OutPacked>;", + ] + + # Generates C++ code for 
reading data from .npy (numpy format) for testing + # in C++ simulation + def read_npy_data(self): + # Input data is stored in numpy files in the code generation directory + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Prepare empty stream reading to append optionals + self.code_gen_dict["$READNPYDATA$"] = [] + + # Generate function calls for reading the input files into the input + # streams + npy_type = "half" if self.inp_dtype.get_hls_datatype_str() == "half" else "float" + self.code_gen_dict["$READNPYDATA$"] += [ + # Generate function call reading from file into the input stream + # Note: Inputs can be represented as numpy floats or halfs + f"npy2apintstream<InpPacked, InpType, InpWidth, {npy_type}>(", + f'"{code_gen_dir}/input_0.npy", in0_V, false', + ");", + ] + + # Generates C++ code for declaring all streams involved in C++ simulation + # for testing + def strm_decl(self): + # Always add the output stream to the declarations + self.code_gen_dict["$STREAMDECLARATIONS$"] = [ + # Note: Assumes stream type aliases to be set in defines + "OutStream out0_V;" + ] + + # Generate a stream declaration + self.code_gen_dict["$STREAMDECLARATIONS$"] += [ + # Note: Assumes stream type aliases to be set in defines + "InpStream in0_V;" + ] + + # Generates C++ code for calling the computation part of the operator + def docompute(self): + # Get the folded shapes of all tensors involved without PE axis + inp_shape = self.get_folded_input_shape(ind=0)[:-1] + out_shape = self.get_folded_output_shape(ind=0)[:-1] + + # Code generation of array index strings + def make_index_string(shape): + # Generate index operation [i] + return "".join([f"[i{d}]" for d in range(len(shape))]) + + inp_index = make_index_string(inp_shape) + + # Generate C++ code for declaring an array of the buffer shapes + inp_shape = "".join([f"[{size}]" for size in inp_shape]) + + # Number of dimensions of the output. All shapes will be + # aligned to this number of dimensions. 
+ # Note: +1 for the PE dimension + ndim = len(out_shape) + 1 + + # For-Loop template for nested loops over arbitrary many levels + def for_loop(level, size): + return f"for(std::size_t i{level} = 0; i{level}<{size}; ++i{level})" + + # Type of memory to use for storing constant parameters + ram_style = RAM_STYLES[self.get_nodeattr("ram_style")] + + # Write the body of the top-level function + self.code_gen_dict["$DOCOMPUTE$"] = [ + # @formatter:off Disable formatter for mixed Python and C++ + # For streamed inputs, generate local buffer of non-broadcast size + # but broadcasts dimensions un-squeezed to size 1. For constant + # inputs, use the generated parameters of the same name. + # For streamed inputs, implement a simple dual-port RAM partitioned + # on the last, i.e., the PE, axis for parallel access. + f""" + InpType inp{inp_shape}[{self.pe}]; + #pragma HLS ARRAY_PARTITION variable=inp complete dim={ndim} + #pragma HLS BIND_STORAGE variable=inp type=RAM_S2P impl={ram_style} + """, + # Buffer to hold the parallel output elements: Implement a simple + # dual-port RAM for the output buffer, partitioned on the last, + # i.e., the PE, axis for parallel access. + # Note: The PE output should be rather small, force this into + # distributed memory here. + # TODO: Maybe reconsider this later? + f""" + OutType out[{self.pe}]; + #pragma HLS ARRAY_PARTITION variable=out complete dim=1 + #pragma HLS BIND_STORAGE variable=out type=RAM_S2P impl=LUTRAM + """, + # Perfect loop nest over all folded output dimensions + *[for_loop(dim, size) + " {" for dim, size in enumerate(out_shape)], + # Pipeline the loops. This should be possible as there is no code + # between the loop levels, i.e., this is a perfect loop nest. 
+ """ + #pragma HLS pipeline II=1 style=flp + """, + # Read from the input stream + f""" + const auto buffer = Slice<InpType>{{}}( + in0_V.read() + ); + for(std::size_t pe = 0; pe < {self.pe}; ++pe) {{ + #pragma HLS unroll + inp{inp_index}[pe] = buffer(pe, 0); + }} + """, + # Apply PE parallel elementwise operations by filling the operation + # template + f""" + for(std::size_t pe = 0; pe < {self.pe}; ++pe) {{ + #pragma HLS unroll + out[pe] = {self.cpp_op.format( + f"inp{inp_index}[pe]" + )}; + }} + """, + # Write the PE group into the output stream + """ + out0_V.write(flatten(out)); + """, + # Close all for-loop bodies of the generated nest + *["}" for _ in enumerate(out_shape)] + # @formatter:on End of code generation + ] + + # Post-process the generated code to remove unnecessary white space + self.code_gen_dict["$DOCOMPUTE$"] = [ + textwrap.dedent(code) for code in self.code_gen_dict["$DOCOMPUTE$"] + ] + + # Generates C++ code for reading the output stream and converting back to + # numpy format for testing in C++ simulation + def dataoutstrm(self): + # Output data will be stored in numpy files in the code generation + # directory + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Get the expected shape of the folded output array formatted as a C++ + # vector initializer + # Note: Valid formatting relies on correct placement of curly braces + # and line breaks: Open/close all three braces on the same line of code + # to avoid '\n' to be inserted into the string + shape = f"""{{{ + ','.join((str(i) for i in self.get_folded_output_shape(ind=0))) + }}}""" + # Generate function call for reading from the output stream into the + # output file + npy_type = "half" if self.out_dtype.get_hls_datatype_str() == "half" else "float" + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + # Generate function call reading from stream into the output file + # Note: Outputs can be numpy floats or halfs + f"apintstream2npy<OutPacked, OutType, OutWidth, {npy_type}>(", + f'out0_V, {shape}, "{code_gen_dir}/output_0.npy", false', 
+ ");", + ] + + # Generates C++ code for saving the output of C++ simulation to a file in + # numpy format + def save_as_npy(self): + # Note: This seems to be empty in ALL HLSBackends. Probably it was used + # for something before, which is now integrated into dataoutstrm()? + self.code_gen_dict["$SAVEASCNPY$"] = [] + + # Generates essentially the head of the C++ function from which the IP block + # will be generated during ipgen, i.e. actual synthesis + def blackboxfunction(self): + # Insert function head describing the top level interface of the + # elementwise function operator + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + # Note: Assumes stream type aliases to be set in defines + f"void {self.onnx_node.name} (", + " InpStream &in0_V,", + " OutStream &out0_V", + ")", + ] + + # Generates C++ pragmas to be inserted into the main function of the C++ + # simulation and the ipgen-blackboxfunction as well + def pragmas(self): + # Check whether there are already pragmas in the code generation + # dictionary + if "$PRAGMAS$" not in self.code_gen_dict: + # If not, insert an empty list to collect more pragmas + self.code_gen_dict["$PRAGMAS$"] = [] + + # Add HLS interface directives specifying how to create RTL ports for + # the top-level function arguments + self.code_gen_dict["$PRAGMAS$"] += [ + # Connect the output stream with an axi stream interface + "#pragma HLS INTERFACE axis port=out0_V", + ] + # Connect the input stream with an axi stream interface + self.code_gen_dict["$PRAGMAS$"] += [ + "#pragma HLS INTERFACE axis port=in0_V", + ] + + # No block-level I/O protocol for the function return value + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + # Returns the names of input and output interfaces grouped by protocol + def get_verilog_top_module_intf_names(self): + # Start collecting interface names in a dictionary starting with clock + # and reset + intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"]} + # AXI stream input 
interfaces + intf_names["s_axis"] = [("in0_V", self.get_instream_width_padded(ind=0))] + # AXI stream output interfaces + intf_names["m_axis"] = [("out0_V", self.get_outstream_width_padded(ind=0))] + # No AXI-MM, AXI-Lite or protocol-less interfaces + intf_names["aximm"] = [] + intf_names["axilite"] = [] + intf_names["ap_none"] = [] + # Return the interface name dictionary + return intf_names + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + if mode == "cppsim": + HLSBackend.execute_node(self, context, graph) + elif mode == "rtlsim": + # rtlsim execution needs to be overwritten here because the HLS code + # is dynamically generated which results in different interfaces + # Get the node wrapped by this custom op + node = self.onnx_node + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Get the inputs out of the execution context + inp = context[node.input[0]] + # Validate the shape of the inputs + assert list(inp.shape) == self.get_normal_input_shape( + ind=0 + ), f"Input shape mismatch for {node.input[0]}" + # Reshape the inputs into folded form + inp = inp.reshape(self.get_folded_input_shape(ind=0)) + # Path to store the intermediate inputs in numpy format + inp_filename = os.path.join(code_gen_dir, "input_0.npy") + # Save the folded inputs to file to be used by simulation + np.save(inp_filename, inp) + # Start collecting inputs/outputs to the RTL simulation in a dictionary + # Note: Prepare one output empty output list + io_dict = {"inputs": {}, "outputs": {"out0": []}} + # Type and width of the input tensors + inp_dtype = self.get_input_datatype(ind=0) + inp_width = self.get_instream_width(ind=0) + + # Convert inputs to RTL simulation format + io_dict["inputs"]["in0"] = npy_to_rtlsim_input(inp_filename, inp_dtype, inp_width) + + # Setup PyVerilator simulation of the node + sim = self.get_rtlsim() + # Reset the RTL simulation; finnxsi toggles 
the clock + super().reset_rtlsim(sim) + # Run the RTL Simulation + self.rtlsim_multi_io(sim, io_dict) + + # Collect the output from RTL simulation + out = io_dict["outputs"]["out0"] + # Type and sizes of the output tensor + dtype = self.get_output_datatype(ind=0) + width = self.get_outstream_width(ind=0) + shape = self.get_folded_output_shape(ind=0) + # Path to store the intermediate numpy file + filename = os.path.join(code_gen_dir, "output_0.npy") + # Convert from RTL simulation format to numpy format + rtlsim_output_to_npy(out, filename, dtype, shape, width, dtype.bitwidth()) + # Load the generated output numpy file + out = np.load(filename) + # Reshape the folded output and insert into the execution context + context[node.output[0]] = out.reshape(self.get_normal_output_shape(ind=0)) + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + +# HLS Backend specialization of the elementwise function operation operator +# Specialized to include hls_math and link floating-point math IPs +class ElementwiseMathFunctionOperation_hls(ElementwiseFunctionOperation_hls): + # Generates list of C++ includes to be placed at the top of the generated + # code + def global_includes(self): + super().global_includes() + # additional hls_math include + self.code_gen_dict["$GLOBALS$"] += ['#include "hls_math.h"'] + + def compile_singlenode_code(self): + """Builds the bash script for compilation using the CppBuilder from + finn.util.basic and executes the script to produce the executable.""" + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + builder = CppBuilder() + # to enable additional debug features please uncomment the next line + # builder.append_includes("-DDEBUG") + builder.append_includes("-I$FINN_ROOT/src/finn/qnn-data/cpp") + builder.append_includes("-I$FINN_ROOT/deps/cnpy/") + 
builder.append_includes("-I$FINN_ROOT/deps/finn-hlslib") + builder.append_includes("-I$FINN_ROOT/custom_hls") + builder.append_includes("-I{}/include".format(os.environ["HLS_PATH"])) + builder.append_includes("-I{}/include".format(os.environ["VITIS_PATH"])) + builder.append_includes("--std=c++14") + builder.append_includes("-O3") + builder.append_sources(code_gen_dir + "/*.cpp") + builder.append_sources("$FINN_ROOT/deps/cnpy/cnpy.cpp") + builder.append_includes("-lz") + builder.append_includes('-Wl,-rpath,"$HLS_PATH/lnx64/lib/csim"') + builder.append_includes("-L$HLS_PATH/lnx64/lib/csim -lhlsmc++-GCC46") + builder.append_includes('-Wl,-rpath,"$HLS_PATH/lnx64/tools/fpo_v7_1"') + builder.append_includes("-L$HLS_PATH/lnx64/tools/fpo_v7_1 -lgmp -lmpfr") + builder.append_includes("-lIp_floating_point_v7_1_bitacc_cmodel") + builder.set_executable_path(code_gen_dir + "/node_model") + builder.build(code_gen_dir) + self.set_nodeattr("executable_path", builder.executable_path) + + +# Derive a specialization to implement elementwise relu of the input +@register_custom_op +class ElementwiseRelu_hls(ElementwiseFunctionOperation_hls, elementwise_functions.ElementwiseRelu): + pass + + +# Derive a specialization to implement elementwise exponent of the input +@register_custom_op +class ElementwiseExp_hls( + ElementwiseMathFunctionOperation_hls, elementwise_functions.ElementwiseExp +): + pass + + +# Derive a specialization to implement elementwise erf of the input +@register_custom_op +class ElementwiseErf_hls( + ElementwiseMathFunctionOperation_hls, elementwise_functions.ElementwiseErf +): + pass diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 24806490c8..7044b298aa 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -46,6 +46,9 @@ # Module containing specializations of elementwise binary 
operations import finn.custom_op.fpgadataflow.elementwise_binary as elementwise_binary +# Module containing specializations of elementwise function operations +import finn.custom_op.fpgadataflow.elementwise_functions as elementwise_functions + class InferConvInpGen(Transformation): """Convert Im2Col layers to ConvolutionInputGenerator layers.""" @@ -2014,29 +2017,11 @@ def apply(self, model: ModelWrapper): # noqa return model, graph_modified -# Converts ReLU into ElementwiseMaximum(in, 0) -class InferReLUAsElementwiseMax(Transformation): - # Filter function to filter out any operation involving any floating-point - # tensor - @staticmethod - def reject_unsupported_dtypes(model: ModelWrapper, node: NodeProto): - def dtype_ok(tname): - dt = model.get_tensor_datatype(tname) - if dt is None: - return False - if ( - dt.is_integer() - or dt.is_fixed_point() - or dt in [DataType["FLOAT32"], DataType["FLOAT16"]] - ): - return True - else: - return False - - return all([dtype_ok(tname) for tname in list(node.input) + list(node.output)]) - +# Converts supported elementwise function operations to their FINN custom +# operation +class InferElementwiseFunctionOperation(Transformation): # Initializes the transformation method with an optional filter function - def __init__(self, _filter=reject_unsupported_dtypes): + def __init__(self, _filter=None): # Initialize the base class Transformation object super().__init__() # Register the filter function as attribute @@ -2053,38 +2038,37 @@ def apply(self, model: ModelWrapper): # noqa # Skip transforming nodes rejected by the filter if not self._filter(model, node): continue - if node.op_type == "Relu": + # If a custom operation with corresponding name is implemented in + # the module, this operator is supported for conversion + if f"Elementwise{node.op_type}" in dir(elementwise_functions): inp = node.input[0] - # add a second 0-valued input for ReLU - new_tname = model.make_new_valueinfo_name() - model.set_initializer(new_tname, 
np.asarray(0.0, dtype=np.float32)) - # comparison of fp16 and uint2 is not possible in HLS - new_tdtype = ( - "FLOAT16" - if model.get_tensor_datatype(inp).get_canonical_name() == "FLOAT16" - else "UINT2" - ) - # for the constant 0 input, use a small-width datatype - # (to avoid unnecessarily promoting output type to something larger) - model.set_tensor_datatype(new_tname, DataType[new_tdtype]) + # if input is a constant, throw an error and + # ask user to run FoldConstants transform first + assert ( + model.get_initializer(inp) is None + ), """Input is a constant, + please run FoldConstants from qonnx.transformation.fold_constants first.""" result = node.output[0] # Need to "lift" potential scalar inputs to rank-1 tensors lift_to_rank1(inp, model) - lift_to_rank1(new_tname, model) + + inp_shape = model.get_tensor_shape(inp) + out_shape = model.get_tensor_shape(result) + + idt0 = model.get_tensor_datatype(inp) + odt0 = model.get_tensor_datatype(result) new_node = helper.make_node( - "ElementwiseMax", - [inp, new_tname], + f"Elementwise{node.op_type}", + [inp], [result], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - lhs_shape=model.get_tensor_shape(inp), - rhs_shape=model.get_tensor_shape(new_tname), - out_shape=model.get_tensor_shape(result), - lhs_dtype=str(model.get_tensor_datatype(inp)), - rhs_dtype=str(model.get_tensor_datatype(new_tname)), - out_dtype=str(model.get_tensor_datatype(result)), + inp_shape=inp_shape, + out_shape=out_shape, + inp_dtype=str(idt0), + out_dtype=str(odt0), ) graph.node.insert(index + 1, new_node) graph.node.remove(node) diff --git a/tests/fpgadataflow/test_fpgadataflow_relu_elementwisemax.py b/tests/fpgadataflow/test_fpgadataflow_elementwise_functions.py similarity index 67% rename from tests/fpgadataflow/test_fpgadataflow_relu_elementwisemax.py rename to tests/fpgadataflow/test_fpgadataflow_elementwise_functions.py index 17ce600989..829c5952ef 100644 --- a/tests/fpgadataflow/test_fpgadataflow_relu_elementwisemax.py 
+++ b/tests/fpgadataflow/test_fpgadataflow_elementwise_functions.py @@ -29,6 +29,7 @@ import pytest import numpy as np +import scipy.special from onnx import TensorProto from onnx import helper as oh from qonnx.core.datatype import DataType @@ -42,7 +43,7 @@ from finn.core.onnx_exec import execute_onnx from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.convert_to_hw_layers import ( - InferReLUAsElementwiseMax, + InferElementwiseFunctionOperation, ) from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.minimize_accumulator_width import ( @@ -57,65 +58,92 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers - -# Creates a model executing a ReLU operation -def create_relu_model_onnx(inp_dtype, inp_shape): - # Create a node representing the binary elementwise operation +# Mapping of ElementwiseFunctionOperation specializations to numpy reference +# implementation functions +NUMPY_REFERENCES = { + "ElementwiseRelu": lambda x: np.maximum(x, 0), + "ElementwiseExp": np.exp, + "ElementwiseErf": scipy.special.erf, +} + + +# Creates a model executing an elementwise function operation +def create_elementwise_function_operation_onnx(op_type, inp_dtype, out_dtype, inp_shape): + # Remove "Elementwise" from op_type string which is the onnx ops op_type + onnx_op_type = op_type[11:] + # Automatically derive the output shape + out_shape = inp_shape + # Create a node representing the elementwise operation node = oh.make_node( - op_type="Relu", + op_type=onnx_op_type, inputs=["inp"], outputs=["out"], ) if inp_dtype == "FLOAT16": inp = oh.make_tensor_value_info("inp", TensorProto.FLOAT16, inp_shape) - out = oh.make_tensor_value_info("out", TensorProto.FLOAT16, inp_shape) else: inp = oh.make_tensor_value_info("inp", TensorProto.FLOAT, inp_shape) - out =
oh.make_tensor_value_info("out", TensorProto.FLOAT, inp_shape) + if out_dtype == "FLOAT16": + out = oh.make_tensor_value_info("out", TensorProto.FLOAT16, out_shape) + else: + out = oh.make_tensor_value_info("out", TensorProto.FLOAT, out_shape) # Create a graph connecting the node to the inputs and outputs - graph = oh.make_graph([node], inputs=[inp], outputs=[out], name="relu-eltwisemax") - model = ModelWrapper(qonnx_make_model(graph, producer_name="relu-eltwisemax")) + graph = oh.make_graph([node], inputs=[inp], outputs=[out], name="elementwise-function") + model = ModelWrapper(qonnx_make_model(graph, producer_name="elementwise-function")) - # Add datatype annotation to the value info of tensors + # Add datatype annotation to the value info of input and output tensors model.set_tensor_datatype("inp", DataType[inp_dtype]) - model.set_tensor_datatype("out", DataType[inp_dtype]) + model.set_tensor_datatype("out", DataType[out_dtype]) return model +# Operator type to be tested +@pytest.mark.parametrize( + "op_type", + [ + # Test all Numpy references specified above + *NUMPY_REFERENCES.keys() + ], +) # Data type of the input elements -@pytest.mark.parametrize("inp_dtype", ["INT8", "FLOAT32", "FLOAT16", "FIXED<8,3>"]) +@pytest.mark.parametrize( + "inp_dtype", + ["FLOAT32", "FLOAT16", "INT6", "FIXED<8,3>"], +) # Shape of the input -@pytest.mark.parametrize("inp_shape", [[4], [3, 32, 1, 16]]) +@pytest.mark.parametrize("inp_shape", [[8]]) # Number of elements to process in parallel -@pytest.mark.parametrize("pe", [1, 2, 4]) +@pytest.mark.parametrize("pe", [1, 2]) # Exec mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_relu_elementwisemax(inp_dtype, inp_shape, pe, exec_mode): +def test_elementwise_function_operation(op_type, inp_dtype, inp_shape, pe, exec_mode): + if not op_type.endswith("Relu"): + if not inp_dtype.startswith("FLOAT"): + pytest.skip("Non-float inputs are not yet 
supported for functions except Relu.") + out_dtype = inp_dtype # Make dummy model for testing - model = create_relu_model_onnx(inp_dtype, inp_shape) + model = create_elementwise_function_operation_onnx(op_type, inp_dtype, out_dtype, inp_shape) # Prepare the execution context - context = {"inp": gen_finn_dt_tensor(DataType[inp_dtype], inp_shape)} - # Compute ground-truth output in software - o_ref = np.maximum(context["inp"], 0) + context = { + "inp": gen_finn_dt_tensor(DataType[inp_dtype], inp_shape), + } + + # Get the numpy reference implementation for this operation + numpy_reference = NUMPY_REFERENCES[op_type] # Test running shape and data type inference on the model graph model = model.transform(InferDataTypes()) model = model.transform(InferShapes()) # Specializes all nodes to be implemented as HLS backend - model = model.transform(InferReLUAsElementwiseMax()) + model = model.transform(InferElementwiseFunctionOperation()) assert len(model.graph.node) == 1 - assert model.graph.node[0].op_type == "ElementwiseMax" - # Execute the onnx model to collect the result - o_hw = execute_onnx(model, context)["out"] - - # Compare the expected to the produced for exact equality - assert np.all(o_hw == o_ref) + assert model.graph.node[0].op_type == f"{op_type}" # Test running shape and data type inference on the model graph model = model.transform(InferDataTypes()) @@ -125,7 +153,7 @@ def test_relu_elementwisemax(inp_dtype, inp_shape, pe, exec_mode): model = model.transform(SpecializeLayers("xczu7ev-ffvc1156-2-e")) assert len(model.graph.node) == 1 - assert model.graph.node[0].op_type == "ElementwiseMax_hls" + assert model.graph.node[0].op_type == f"{op_type}_hls" getCustomOp(model.graph.node[0]).set_nodeattr("PE", pe) @@ -143,8 +171,17 @@ def test_relu_elementwisemax(inp_dtype, inp_shape, pe, exec_mode): model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) + # Compute ground-truth output in software + inp = context["inp"] + + o_expected = 
numpy_reference(inp) # Execute the onnx model to collect the result - o_sim = execute_onnx(model, context)["out"] + o_produced = execute_onnx(model, context)["out"] - # Compare the expected to the produced for exact equality - assert np.all(o_sim == o_ref) + if op_type.endswith("Relu"): + assert np.all(o_expected == o_produced) + else: + if inp_dtype == "FLOAT16": + assert np.allclose(o_expected, o_produced, rtol=1e-3, atol=2**-13) + else: + assert np.allclose(o_expected, o_produced)