Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 65 additions & 11 deletions finn-rtllib/mvu/add_multi.sv
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @brief Pipelined multi-input adder tree.
* @brief Pipelined multi-input adder using LUT-based compressors.
* @author Thomas B. Preußer <thomas.preusser@amd.com>
*****************************************************************************/

Expand All @@ -50,13 +50,65 @@ module add_multi import mvu_pkg::*; #(
output logic [SUM_WIDTH-1:0] sum
);

localparam int unsigned L = $clog2(N); // Number of levels with reductions
//---------------------------------------------------------------------------
// Compressor Path
//
// CATCH_COMP entries instantiate a generated compressor module for a
// specific (N, ARG_WIDTH, delay) triple. The macro transposes arg[i][j]
// to the column-major bit-vector expected by the compressor and pads any
// remaining DEPTH with a shift-register delay.
//
// Generated compressors have no en port — when en=0, upstream holds
// inputs stable and the downstream accumulator does not latch, so
// correctness is preserved.

uwire [SUM_WIDTH-1:0] sum0;
if(L < 1) begin : genTrivial
`define CATCH_COMP(n,w,d) \
else if(!RESET_ZERO && (N == n) && (ARG_WIDTH == w) && (DEPTH >= d) && (0 <= ARG_LO)) begin : genComp``n``u``w``_d``d \
initial $display("[ADD_MULTI_PATH] COMP N=%0d D=%0d W=%0d", N, DEPTH, ARG_WIDTH); \
\
uwire [N*ARG_WIDTH-1:0] in; \
uwire [SUM_WIDTH -1:0] out; \
for(genvar i = 0; i < N; i++) begin : genIn \
for(genvar j = 0; j < ARG_WIDTH; j++) begin : genBit \
assign in[j*N+i] = arg[i][j]; \
end : genBit \
end : genIn \
comp_``n``u``w``_d``d comp_inst ( \
.clk, \
.in, .out \
); \
initial assert($bits(out) >= $bits(comp_inst.out)) else $warning("CATCH_COMP(%0d,%0d,%0d): compressor output width %0d > SUM_WIDTH %0d", n, w, d, $bits(comp_inst.out), SUM_WIDTH); \
\
localparam int unsigned COMP_DELAY = d; \
localparam int unsigned SUM_DELAY = DEPTH - COMP_DELAY; \
if(SUM_DELAY == 0) assign sum = out; \
else begin : genDelay \
logic [SUM_WIDTH-1:0] SumZ[SUM_DELAY] = '{ default: '0 }; \
always_ff @(posedge clk) begin \
if(rst) SumZ <= '{ default: '0 }; \
else if(en) begin \
for(int unsigned i = 0; i < SUM_DELAY-1; i++) SumZ[i] <= SumZ[i+1]; \
SumZ[SUM_DELAY-1] <= out; \
end \
end \
assign sum = SumZ[0]; \
end : genDelay \
end : genComp``n``u``w``_d``d

if(0) begin end
// FINN_GENERATED_COMP_ENTRIES

//- Generic Behavioral Addition ---------
else begin : genGeneric

localparam int unsigned L = $clog2(N); // Tree levels

logic [SUM_WIDTH-1:0] sum0;
if(L < 1) begin : genPassThrough
assign sum0 = arg[0];
end : genTrivial
end : genPassThrough
else begin : genTree
initial $display("[ADD_MULTI_PATH] TREE N=%0d D=%0d W=%0d", N, DEPTH, ARG_WIDTH);
localparam int unsigned D = L < DEPTH? L : DEPTH; // Pipeline stages absorbed by tree

// Compute the count of descendants for all nodes in the reduction trees.
Expand Down Expand Up @@ -117,16 +169,18 @@ module add_multi import mvu_pkg::*; #(
// Delay Output if requested DEPTH exceeds Tree Height
if(DEPTH <= L) assign sum = sum0;
else begin : genDelay
localparam logic [SUM_WIDTH-1:0] SUM_RESET = {(SUM_WIDTH){RESET_ZERO? 1'b0 : 1'bx}};
logic [SUM_WIDTH-1:0] SumZ[DEPTH - L] = '{ default: SUM_RESET };
localparam int unsigned DELAY = DEPTH - L;
logic [SUM_WIDTH-1:0] SumZ[DELAY] = '{ default: '0 };
always_ff @(posedge clk) begin
if(rst) SumZ <= '{ default: SUM_RESET };
else begin
for(int unsigned i = 0; i < DEPTH-L-1; i++) SumZ[i] <= SumZ[i+1];
SumZ[DEPTH-L-1] <= sum0;
if(rst) SumZ <= '{ default: '0 };
else if(en) begin
for(int unsigned i = 0; i < DELAY-1; i++) SumZ[i] <= SumZ[i+1];
SumZ[DELAY-1] <= sum0;
end
end
assign sum = SumZ[0];
end : genDelay

end : genGeneric

endmodule : add_multi
34 changes: 28 additions & 6 deletions finn-rtllib/mvu/mvu_vvu_axi.sv
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,13 @@ module mvu_vvu_axi #(
bit FORCE_BEHAVIORAL = 0,
bit M_REG_LUT = 1,

// LUT-based compressor tree pipeline depth. The default is set for maximum pipelining (in between every stage).
int unsigned COMP_PIPELINE_DEPTH = 1,

// Passed at generation time: whether compressors were generated (i.e. deemed worth it).
// Decides whether to use LUT-based compressors instead of DSPs.
bit USE_COMPRESSOR = 0,

// Safely deducible parameters
localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH,
localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8,
Expand Down Expand Up @@ -310,7 +317,19 @@ module mvu_vvu_axi #(
localparam int unsigned A_WIDTH = 25 + 2*(VERSION > 1); // Width of A datapath
localparam int unsigned NUM_LANES = A_WIDTH == WEIGHT_WIDTH? 1 : 1 + (A_WIDTH - !NARROW_WEIGHTS - WEIGHT_WIDTH) / MIN_LANE_WIDTH;

if(!IS_MVU || ((VERSION > 2) && (NUM_LANES <= 3) && (WEIGHT_WIDTH <= 8) && (ACTIVATION_WIDTH <= 9))) begin : genINT8
if(USE_COMPRESSOR) begin : genCompressor
$DOTP_MODULE_NAME$ #(
.PE(PE), .SIMD(DSP_SIMD),
.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
.COMP_PIPELINE_DEPTH(COMP_PIPELINE_DEPTH)
) core (
.clk(ap_clk), .rst, .en('1),
.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
.vld(dsp_vld), .p(dsp_p)
);
end : genCompressor
else if(!IS_MVU || ((VERSION > 2) && (NUM_LANES <= 3) && (WEIGHT_WIDTH <= 8) && (ACTIVATION_WIDTH <= 9))) begin : genINT8
initial $info("Sidestepping to INT8 mode of DSP58 for %0dx%0d.", WEIGHT_WIDTH, ACTIVATION_WIDTH);
mvu_vvu_8sx9_dsp58 #(
.IS_MVU(IS_MVU),
Expand Down Expand Up @@ -343,11 +362,14 @@ module mvu_vvu_axi #(

if(1) begin : blkOutput
localparam int unsigned CORE_PIPELINE_DEPTH =
VERSION == 3? 3 + (SEGMENTLEN == 0? 0 : ((SIMD+2)/3 -1)/SEGMENTLEN) :
/* else */ 3 + $clog2(SIMD+1) + (SIMD == 1);

// This is conservative and could be divided by a guaranteed minimum output interval, e.g. MW/SIMD.
localparam int unsigned MAX_IN_FLIGHT = CORE_PIPELINE_DEPTH;
USE_COMPRESSOR? COMP_PIPELINE_DEPTH :
VERSION == 3? 3 + (SEGMENTLEN == 0? 0 : ((SIMD+2)/3 -1)/SEGMENTLEN) :
/* else */ 3 + $clog2(SIMD+1) + (SIMD == 1);

// Floor at the DSP-equivalent depth so that the compressor path (shallow pipeline)
// never ends up with a smaller MAX_IN_FLIGHT than the DSP path would.
localparam int unsigned DSP_PIPELINE_DEPTH = 3 + $clog2(SIMD+1) + (SIMD == 1);
localparam int unsigned MAX_IN_FLIGHT =
CORE_PIPELINE_DEPTH > DSP_PIPELINE_DEPTH? CORE_PIPELINE_DEPTH : DSP_PIPELINE_DEPTH;
typedef logic [PE-1:0][ACCU_WIDTH-1:0] output_t;

logic signed [$clog2(MAX_IN_FLIGHT+1):0] OPtr = '1; // -1 | 0, 1, ..., MAX_IN_FLIGHT
Expand Down
5 changes: 4 additions & 1 deletion finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
parameter NARROW_WEIGHTS = $NARROW_WEIGHTS$,
parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
parameter SEGMENTLEN = $SEGMENTLEN$,
parameter COMP_PIPELINE_DEPTH = $COMP_PIPELINE_DEPTH$,
parameter USE_COMPRESSOR = $USE_COMPRESSOR$,

// Safely deducible parameters
parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
Expand Down Expand Up @@ -81,7 +83,8 @@ mvu_vvu_axi #(
`endif
.IS_MVU(IS_MVU), .VERSION(VERSION), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD),
.ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .NARROW_WEIGHTS(NARROW_WEIGHTS),
.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)
.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
.COMP_PIPELINE_DEPTH(COMP_PIPELINE_DEPTH), .USE_COMPRESSOR(USE_COMPRESSOR)
) inst (
.ap_clk(ap_clk),
.ap_clk2x(ap_clk2x),
Expand Down
17 changes: 17 additions & 0 deletions src/finn/compressor/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#############################################################################
# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#
# @brief Build automation for compressor testing and generation
# @author Simon Gerber <simon.gerber@amd.com>
#############################################################################

# Default: no constant absorption
# CA is expanded verbatim onto the run_tests.sh command line; when left empty
# no argument is passed. Override with `make CA=<value>`.
# NOTE(review): the accepted values are defined by run_tests.sh — confirm there.
CA?=
# Neither target produces a file of its own name, so always run their recipes.
.PHONY: default clean

# Run the test script with the current CA setting.
default:
	./run_tests.sh $(CA)
# Delete the listed log/tool artifacts in this directory and everything under gen/.
clean:
	rm -rf *.log *.jou *.vivado .Xil xvlog.pb gen/*
103 changes: 103 additions & 0 deletions src/finn/compressor/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
<!--
Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.

SPDX-License-Identifier: BSD-3-Clause
-->

# Python Compressor Generator
This tool can generate compressor trees for 7-Series, UltraScale(+) and Versal for arbitrary input shapes.

# Getting started
1. Part of the FINN framework (integrated into MVAU RTL backend).
2. _standalone compressor generation_ requires no external dependencies.

## FINN Integration
The compressor is automatically invoked during MVAU layer specialization (`SpecializeLayers` transformation).
FINN selects between the RTL compressor, RTL DSP and HLS implementations based on the node parameters.
See the [MVAU compressor integration flow diagram](mvau_compressor_inegration_flow.svg) for the complete decision tree.

**Key integration files:**
- `src/finn/transformation/fpgadataflow/specialize_layers.py` - RTL vs HLS selection logic
- `src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py` - FINN-side RTL MVAU integration with compressor path selection
- `src/finn/compressor/src/dotp_finn.py` - FINN wrapper for dot-product compressor generation
- `src/finn/compressor/src/add_multi_finn.py` - FINN wrapper for multi-operand adder generation
- `finn-rtllib/mvu/mvu_vvu_axi.sv` - RTL template that instantiates generated compressors

This project either implements the full dotp unit of the node as a compressor, or optimizes the add_multi additions of the DSP lanes when the RTL DSP path is invoked.

## Standalone Usage
Generate a compressor of shape `(12,12,12)` called `comp` and save it under `gen/comp12_12_12.sv`:

```python3 -m finn.compressor.src.main -s 12,12,12 -n comp -o gen/comp12_12_12.sv```

See `python3 -m finn.compressor.src.main -h` for details.

## Testing
Run the test suite for verification on different platforms:

```bash
# Core compressor tests (21 configs)
./run_tests.sh "" versal # or 7series, ultrascale

# MVAU integration tests (8 configs)
./run_dotp_comp_tests.sh versal # or 7series, ultrascale

# Multi-operand adder tests (8 configs)
./run_add_multi_comp_tests.sh versal # or 7series, ultrascale
```

## Features
### Custom Input Shape
The tool can generate compressors for any input shape. A shape is passed as a comma-separated list of numbers. Each number gives the height of one column. *LSB* is *left*, *MSB* is *right*.

### Accumulation
By passing `-a`, the tool generates an accumulator instead of just an adder. The accumulator's width can be specified with `-w`.
### Gate Absorption
If desired, every input to the compressor can be preceded by a two-input gate. These gates can be integrated into the first compression stage. Each gate is specified as a HEX digit. The encoding is the same as that of Vivado's LUT2 primitive:
| Secondary Input | Primary Input | Output
|-----------------|---------------|----------------
|0 |0 |(DIGIT >> 0) & 1
|0 |1 |(DIGIT >> 1) & 1
|1 |0 |(DIGIT >> 2) & 1
|1 |1 |(DIGIT >> 3) & 1

For example, `8` maps to an AND gate and `6` maps to an XOR gate.

In CLI, gates can be specified as a flat string like `-g 883ABC`. The *LSB* is *left* and *MSB* is *right*. The leftmost specified gate corresponds to the LSB input in the generated compressor input vector.

### Target
Generate compressors for either Versal, 7-Series or UltraScale fabrics using `-t {Versal,7-Series,UltraScale}`.

### Automated Testing
The tool can automatically generate a SystemVerilog testbench to fuzz-test the generated compressors by passing `--test`. For testing, the `xvlog`, `xelab` and `xsim` commands have to be available.

### Custom Pipeline Depth
Specify the maximum combinational delay for the compressor using `-p MAX_DEPTH`. Note that the final adder, which has at least one single routing delay, cannot be pipelined.
The exception is the `Quaternary Adder`, which can be split into two stages when it is not used in accumulation. The pipelined version is the default if `-a` is not passed.

### Constant Input
In addition to the regular, variable compressor inputs, the tool also supports a constant input. It can be specified as a binary number with `-c NUMBER`.

# Implementation Details - How the Code is Structured
The compressor is internally represented as a graph. Its nodes are defined in `src/graph/nodes.py`.
Compressor construction is done in several passes:
1. Create a graph with all scheduled counters and a final adder (in `src/passes/compressor_constructor.py`).
1. (Optional) Generate a gate absorption stage.
2. Generate regular compression stages until the compression goal is reached.
3. Insert pipeline registers between compressor stages.
4. Build either a final adder or an accumulator as the final stage.
2. Annotate LUT6CY instances with placement constraints so that the LUT Cascade will be utilized (in `src/passes/lut_placer.py`).
3. Replace inexpressible connections: Place wires between connected instantiated modules (in `src/passes/wire_inserter.py`).
4. Annotate input and output signals in the compressor (in `src/passes/io_annotator.py`).
5. Emit generated SystemVerilog source (in `src/passes/emitter.py`)

## Extending the Tool
### Adding new Counters
Counters without gate absorption are defined in `graph/counters/counter_candidates.py`.
Counters with gate absorption are defined in `graph/counters/absorption_counter_candidates.py`.

### Adding new Passes
Before adding new passes over the compressor graph, check whether the simple iterator defined in `node_iterator.py` can be subclassed to save boilerplate code.

# Authors
This tool was created as a standalone compressor generator by Konstantin Hossfeld and Thomas Preußer. It was extended and integrated into the FINN flow by Simon Gerber.
15 changes: 15 additions & 0 deletions src/finn/compressor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#############################################################################
# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#
# @brief FINN compressor package initialization
# @author Simon Gerber <simon.gerber@amd.com>
#############################################################################

"""FINN compressor — LUT-based compressor tree generator for MVU."""

# Re-export the generator entry points from the `src` subpackage so that they
# can be imported directly from this package.
from .src.add_multi_finn import generate_add_multi_comps
from .src.dotp_finn import generate_dotp_comp

# Public API of the package: exactly the two names imported above.
__all__ = ["generate_add_multi_comps", "generate_dotp_comp"]
62 changes: 62 additions & 0 deletions src/finn/compressor/gen_dotp_netlist.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/bin/bash
#############################################################################
# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#
# Generate standalone dotp compressor netlist for inspection or integration.
# Output is a self-contained RTL directory that can be simulated or synthesized.
#
# Steps performed:
#   1. Derive an output directory gen/<label> from the configuration below.
#   2. Run the finn.compressor.src.dotp_finn module to emit the compressor
#      core and the dotp wrapper (from hdl/dotp_comp_template.sv).
#   3. Copy hdl/mul_comp_map.sv next to the generated sources.
#
# Usage: Edit parameters below, then run: ./gen_dotp_netlist.sh
#############################################################################

# === Configuration ===
SIMD=256
WW=4
AW=4
ACCU_WIDTH=16
SIGNED_WEIGHTS=0   # 0=unsigned, 1=signed
SIGNED_ACT=0       # 0=unsigned, 1=signed
TARGET="Versal"    # Versal, 7-Series, UltraScale
# =====================

set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# `python3 -m finn.compressor.src.dotp_finn` needs the directory that CONTAINS
# the `finn` package on the module search path. This script lives in
# <package-root>/finn/compressor, so the package root is two levels up.
# The assignment is kept separate from `export` so that a failing `cd` is not
# masked and still aborts the script under `set -e`.
PKG_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# Append the caller's PYTHONPATH only when it is non-empty (no dangling ':').
PYTHONPATH="${PKG_ROOT}${PYTHONPATH:+:$PYTHONPATH}"
export PYTHONPATH

# Build output directory name from config
LABEL="simd${SIMD}_w${WW}_a${AW}"
if [ "$SIGNED_WEIGHTS" -eq 0 ]; then
    LABEL="${LABEL}_uw"
fi
if [ "$SIGNED_ACT" -eq 1 ]; then
    LABEL="${LABEL}_sa"
fi
# Target suffix is lower-cased with dashes removed, e.g. "7-Series" -> "7series".
LABEL="${LABEL}_$(echo "$TARGET" | tr '[:upper:]' '[:lower:]' | tr -d '-')"
OUT_DIR="$SCRIPT_DIR/gen/$LABEL"
mkdir -p "$OUT_DIR"

echo "Generating dotp compressor netlist"
echo "  Config: SIMD=$SIMD, WW=$WW, AW=$AW, ACCU=$ACCU_WIDTH"
echo "  Target: $TARGET"
echo "  Output: $OUT_DIR"
echo ""

# Build flags. An array keeps every flag a separate word without relying on
# unquoted word splitting; it expands to nothing when no flag is selected.
FLAGS=()
if [ "$SIGNED_WEIGHTS" -eq 0 ]; then
    FLAGS+=(--unsigned_weights)
fi
if [ "$SIGNED_ACT" -eq 1 ]; then
    FLAGS+=(--signed_activations)
fi

# Generate compressor core and dotp wrapper
python3 -m finn.compressor.src.dotp_finn \
    --simd "$SIMD" --ww "$WW" --aw "$AW" \
    --accu_width "$ACCU_WIDTH" "${FLAGS[@]}" \
    --target "$TARGET" \
    --dotp-template "$SCRIPT_DIR/hdl/dotp_comp_template.sv" \
    --dotp-output-name dotp_comp.sv \
    -o "$OUT_DIR"

# Include mul_comp_map for complete netlist
cp "$SCRIPT_DIR/hdl/mul_comp_map.sv" "$OUT_DIR/"

echo ""
echo "Generated files:"
ls -1 "$OUT_DIR"/*.sv
echo ""
echo "Done. Netlist ready in: $OUT_DIR"
Loading