From 4e92b7b95e111621bc4eb82ea5e50aeb67cdc4be Mon Sep 17 00:00:00 2001 From: Benjamin Ramhorst Date: Fri, 26 May 2023 17:50:22 +0100 Subject: [PATCH 001/272] Introduce unrolled implementation of Dense Resource --- .../vivado/passes/convolution_templates.py | 29 ++++++++++++++++ .../backends/vivado/passes/core_templates.py | 9 +++++ .../vivado/passes/recurrent_templates.py | 13 +++++-- hls4ml/backends/vivado/vivado_backend.py | 34 +++++++++++++++++-- hls4ml/model/graph.py | 22 ++++++++++++ .../vivado/nnet_utils/nnet_code_gen.h | 12 ++++++- .../vivado/nnet_utils/nnet_conv_stream.h | 20 +++++------ .../templates/vivado/nnet_utils/nnet_dense.h | 15 +++++++- .../vivado/nnet_utils/nnet_dense_stream.h | 2 ++ 9 files changed, 139 insertions(+), 17 deletions(-) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index 4845b8f1da..dde42d97fe 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -9,6 +9,9 @@ static const unsigned n_out = {n_out}; static const unsigned reuse_factor = {reuse}; static const unsigned strategy = nnet::{strategy}; + static const unsigned resource_implementation = nnet::{dense_resource_implementation}; + template + using dense_unrolled = nnet::{unrolled_function}; static const unsigned n_zeros = 0; static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; typedef {accum_t.name} accum_t; @@ -86,6 +89,8 @@ def format(self, node): mult_params['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('weight').type.precision ) + # TODO - Extend unrolled Dense Resource to Conv1D + mult_params['unrolled_function'] = 'DenseResourceUnrolled' mult_config = self.mult_template.format(**mult_params) return mult_config + '\n' + conv_config @@ -130,6 +135,9 @@ def format(self, node): static const bool store_weights_in_bram = false; static const unsigned strategy = nnet::{strategy}; static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; + static const unsigned resource_implementation = nnet::{dense_resource_implementation}; + template + using dense_unrolled = nnet::{unrolled_function}; static const unsigned min_height = {min_height}; static const unsigned min_width = {min_width}; static const ap_uint pixels[min_height * min_width]; @@ -183,6 +191,12 @@ def format(self, node): params['fill_fn'] = f'fill_buffer_{node.index}' else: params['fill_fn'] = 'FillConv2DBuffer' + + if node.get_attr('dense_resource_implementation', 'standard') == 'unrolled' and node.get_attr('strategy').lower() == 'resource' and node.get_attr('reuse_factor') > 1: + # Implemented in subsequent commits + params['unrolled_function'] = 'DenseResourceUnrolled' + else: + params['unrolled_function'] = 'DenseResourceUnrolled' conv_config = self.template.format(**params) @@ -192,6 +206,11 @@ def format(self, node): mult_params['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('weight').type.precision ) + if node.get_attr('dense_resource_implementation', 'standard') == 'unrolled' and node.get_attr('strategy').lower() == 'resource' and node.get_attr('reuse_factor') > 1: + # Implemented in subsequent commits + mult_params['unrolled_function'] = 'DenseResourceUnrolled' + else: + mult_params['unrolled_function'] = 'DenseResourceUnrolled' mult_config = 
self.mult_template.format(**mult_params) return mult_config + '\n' + conv_config @@ -278,6 +297,9 @@ def format(self, node): mult_params['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('depthwise').type.precision ) + # TODO - Extend unrolled Dense Resource to depthwise Conv1D + mult_params['unrolled_function'] = 'DenseResourceUnrolled' + depthwise_mult_config = self.depthwise_mult_template.format(**mult_params) # Pointwise config @@ -317,6 +339,9 @@ def format(self, node): mult_params['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('pointwise').type.precision ) + # TODO - Extend unrolled Dense Resource to separable Conv1D + mult_params['unrolled_function'] = 'DenseResourceUnrolled' + pointwise_mult_config = self.pointwise_mult_template.format(**mult_params) return ( @@ -399,6 +424,8 @@ def format(self, node): mult_params['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('depthwise').type.precision ) + # TODO - Extend unrolled Dense Resource to depthwise Conv2D + mult_params['unrolled_function'] = 'DenseResourceUnrolled' depthwise_mult_config = self.depthwise_mult_template.format(**mult_params) # Pointwise config @@ -442,6 +469,8 @@ def format(self, node): mult_params['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('pointwise').type.precision ) + # TODO - Extend unrolled Dense Resource to separable Conv2D + mult_params['unrolled_function'] = 'DenseResourceUnrolled' pointwise_mult_config = self.pointwise_mult_template.format(**mult_params) return ( diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index c8119c0c2e..faabf434eb 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -9,6 +9,9 @@ static const unsigned n_out = {n_out}; static const unsigned io_type = nnet::{iotype}; static const unsigned strategy = nnet::{strategy}; + static const unsigned resource_implementation = nnet::{dense_resource_implementation}; + template + using dense_unrolled = nnet::{unrolled_function}; static const unsigned reuse_factor = {reuse}; static const unsigned n_zeros = {nzeros}; static const unsigned n_nonzeros = {nonzeros}; @@ -40,6 +43,12 @@ def format(self, node): node.get_input_variable().type.precision, node.get_weights('weight').type.precision ) + if node.get_attr('dense_resource_implementation', 'standard') == 'unrolled' and node.get_attr('strategy').lower() == 'resource' and node.get_attr('reuse_factor') > 1: + # Implemented in subsequent commits + params['unrolled_function'] = 'DenseResourceUnrolled' + else: + params['unrolled_function'] = 'DenseResourceUnrolled' + return self.template.format(**params) diff --git a/hls4ml/backends/vivado/passes/recurrent_templates.py b/hls4ml/backends/vivado/passes/recurrent_templates.py index aae806b35c..eb12412def 100644 --- a/hls4ml/backends/vivado/passes/recurrent_templates.py +++ b/hls4ml/backends/vivado/passes/recurrent_templates.py @@ -11,6 +11,9 @@ static const unsigned reuse_factor = {reuse}; static const unsigned n_zeros = {nzeros}; static const unsigned n_nonzeros = {nonzeros}; + static const unsigned resource_implementation = nnet::{dense_resource_implementation}; + template + using dense_unrolled = nnet::{unrolled_function}; static const unsigned multiplier_limit = 
DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; static const bool store_weights_in_bram = false; typedef {accum_t.name} accum_t; @@ -137,6 +140,10 @@ def format(self, node): mult_params1['index'] = str(node.index) + '_1' mult_params1['nzeros'] = node.get_weights('weight').nzeros mult_params1['nonzeros'] = node.get_weights('weight').nonzeros + + # TODO - Extend unrolled Dense Resource to recurrent kernels + mult_params1['unrolled_function'] = 'DenseResourceUnrolled' + if node.get_attr('return_sequences'): mult_params2['n_in'] = node.get_output_variable().dim_names[1] mult_params2['n_out'] = node.get_output_variable().dim_names[1] + ' * %i' % n_recr_mult @@ -150,13 +157,15 @@ def format(self, node): mult_params2['index'] = str(node.index) + '_2' mult_params2['nzeros'] = node.get_weights('recurrent_weight').nzeros mult_params2['nonzeros'] = node.get_weights('recurrent_weight').nonzeros - + + # TODO - Extend unrolled Dense Resource to recurrent kernels + mult_params2['unrolled_function'] = 'DenseResourceUnrolled' + mult_config1 = self.mult1_template.format(**mult_params1) mult_config2 = self.mult2_template.format(**mult_params2) return mult_config1 + '\n' + mult_config2 + '\n' + recr_act_config + '\n' + act_config + '\n' + recr_config - class RecurrentFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__((LSTM, GRU), include_header=recr_include_list) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 1d4c96d982..1f71ddcdc4 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -68,12 +68,20 @@ def _register_layer_attributes(self): # Add ConvImplementation to Convolution+Pooling layers cnn_layers = [Conv1D, Conv2D, SeparableConv1D, SeparableConv2D, DepthwiseConv2D, Pooling1D, Pooling2D] - for layer in cnn_layers: attrs = self.attribute_map.get(layer, []) # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer')) self.attribute_map[layer] = attrs + + # Add implementation of Dense Resource for all layers that use Dense for matrix mult + # Handle different implementations of Resource strategy; this attribute only makes a difference if strategy == Resource + # Standard -> nnet_dense_resource.h + # Unrolled -> Code generation, ignoring zero DSPs and optimizing zero-filled BRAM blocks + for layer in [Dense] + cnn_layers + rnn_layers: + attrs = self.attribute_map.get(layer, []) + attrs.append(ChoiceAttribute('dense_resource_implementation', choices=['standard', 'unrolled'], default='standard')) + self.attribute_map[layer] = attrs def _register_flows(self): initializers = self._get_layer_initializers() @@ -240,6 +248,7 @@ def init_dense(self, layer): else: layer.set_attr('strategy', 'latency') layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', index_t)) + layer.set_attr('dense_resource_implementation', layer.model.config.get_dense_resource_implementation(layer).lower()) # TODO consolidate these functions into a single `init_conv` @layer_optimizer(Conv1D) @@ -270,6 +279,9 @@ def init_conv1d(self, layer): layer.set_attr('n_partitions', out_width // closest_pf) layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + + # TODO - Extend unrolled Dense Resource to Conv1D kernels + layer.set_attr('dense_resource_implementation', 'standard') 
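+        # NOTE: the unrolled code generator introduced by this feature only matches Dense and Conv2D
+        # nodes, so Conv1D always falls back to the standard Resource implementation here.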
self._validate_conv_strategy(layer) @@ -286,7 +298,10 @@ def init_sepconv1d(self, layer): 'n_partitions', 1 ) # TODO Once we have SeparableConv implementation for io_parallel this should be set properly layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) - + + # TODO - Extend unrolled Dense Resource to separable Conv1D + layer.set_attr('dense_resource_implementation', 'standard') + @layer_optimizer(Conv2D) def init_conv2d(self, layer): if len(layer.weights['weight'].data.shape) == 2: # This can happen if we assign weights of Dense layer to 1x1 Conv2D @@ -313,9 +328,10 @@ def init_conv2d(self, layer): ) else: closest_pf = chosen_pf + layer.set_attr('n_partitions', out_height * out_width // closest_pf) - layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + layer.set_attr('dense_resource_implementation', layer.model.config.get_dense_resource_implementation(layer).lower()) self._validate_conv_strategy(layer) @@ -333,6 +349,9 @@ def init_sepconv2d(self, layer): ) # TODO Once we have SeparableConv implementation for io_parallel this should be set properly layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + # TODO - Extend unrolled Dense Resource to separable Conv2D + layer.set_attr('dense_resource_implementation', 'standard') + @layer_optimizer(DepthwiseConv2D) def init_depconv2d(self, layer): if layer.model.config.is_resource_strategy(layer): @@ -346,6 +365,9 @@ def init_depconv2d(self, layer): 'n_partitions', 1 ) # TODO Once we have SeparableConv implementation for io_parallel this should be set properly layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + + # TODO - Extend unrolled Dense Resource to depthwise Conv2D + layer.set_attr('dense_resource_implementation', 'standard') def _set_pooling_accum_t(self, layer, pool_size): extra_bits = ceil_log2(pool_size) @@ -404,6 +426,9 @@ def init_lstm(self, layer): layer.set_attr('strategy', 'latency') layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', IntegerPrecisionType(width=1, signed=False))) + + # TODO - Extend unrolled Dense Resource to recurrent kernels + layer.set_attr('dense_resource_implementation', 'standard') @layer_optimizer(GRU) def init_gru(self, layer): @@ -419,6 +444,9 @@ def init_gru(self, layer): layer.set_attr('strategy', 'latency') layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', IntegerPrecisionType(width=1, signed=False))) + + # TODO - Extend unrolled Dense Resource to recurrent kernels + layer.set_attr('dense_resource_implementation', 'standard') @layer_optimizer(GarNet) def init_garnet(self, layer): diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index c44fd8f02e..57fb31841d 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -43,6 +43,10 @@ def __init__(self, config): self.layer_type_conv_implementation = {} self.layer_name_conv_implementation = {} + self.model_dense_resource_implementation = 'Standard' + self.layer_type_dense_resource_implementation = {} + self.layer_name_dense_resource_implementation = {} + self.model_compression = False self.layer_type_compression = {} self.layer_name_compression = {} @@ -165,6 +169,15 @@ def get_conv_implementation(self, layer): return conv_implementation + def get_dense_resource_implementation(self, layer): + dense_resource_implementation = self.layer_name_dense_resource_implementation.get(layer.name.lower()) + if dense_resource_implementation is None: + 
dense_resource_implementation = self.layer_type_dense_resource_implementation.get(layer.__class__.__name__.lower())
+        if dense_resource_implementation is None:
+            dense_resource_implementation = self.model_dense_resource_implementation
+
+        return dense_resource_implementation
+
     def is_resource_strategy(self, layer):
         return self.get_strategy(layer).lower() == 'resource'
 
@@ -212,6 +225,7 @@ def _parse_hls_config(self):
             self.model_rf = model_cfg.get('ReuseFactor')
             self.model_targ_cycles = model_cfg.get('TargetCycles')
             self.model_conv_implementation = model_cfg.get('ConvImplementation', 'LineBuffer')
+            self.model_dense_resource_implementation = model_cfg.get('DenseResourceImplementation', 'Standard')
             self.model_strategy = model_cfg.get('Strategy', 'Latency')
             self.model_compression = bool(model_cfg.get('Compression', 0))
             self.pipeline_style = model_cfg.get('PipelineStyle', 'pipeline')
@@ -241,6 +255,10 @@ def _parse_hls_config(self):
                 conv_implementation = layer_cfg.get('ConvImplementation')
                 if conv_implementation is not None:
                     self.layer_type_conv_implementation[layer_type.lower()] = conv_implementation
+
+                dense_resource_implementation = layer_cfg.get('DenseResourceImplementation')
+                if dense_resource_implementation is not None:
+                    self.layer_type_dense_resource_implementation[layer_type.lower()] = dense_resource_implementation
 
                 compression = layer_cfg.get('Compression')
                 if compression is not None:
@@ -271,6 +289,10 @@ def _parse_hls_config(self):
                 conv_implementation = layer_cfg.get('ConvImplementation')
                 if conv_implementation is not None:
                     self.layer_name_conv_implementation[layer_name.lower()] = conv_implementation
+
+                dense_resource_implementation = layer_cfg.get('DenseResourceImplementation')
+                if dense_resource_implementation is not None:
+                    self.layer_name_dense_resource_implementation[layer_name.lower()] = dense_resource_implementation
 
                 compression = layer_cfg.get('Compression')
                 if compression is not None:
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h
index e4db43682e..553044479e 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h
@@ -2,7 +2,6 @@
 #define NNET_INSTR_GEN_H_
 
 #include "nnet_helpers.h"
-#include <iostream>
 
 namespace nnet {
 
@@ -25,6 +24,17 @@ template <class data_T, typename CONFIG_T> class FillConv2DBuffer {
     }
 };
 
+template <class data_T, class res_T, typename CONFIG_T> class DenseResourceUnrolled {
+  public:
+    static void dense_unrolled(
+        data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],
+        typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],
+        typename CONFIG_T::bias_t biases[CONFIG_T::n_out]
+    ) {
+        // To be implemented in subclasses
+    }
+};
+
 // hls4ml insert code
 
 } // namespace nnet
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h
index 7bd47442f6..509feb5f35 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h
@@ -291,11 +291,11 @@ void compute_output_buffer_2d(
     // Dense multiply
     // #pragma HLS INLINE recursive
     if (CONFIG_T::strategy == nnet::latency) {
-        dense_latency(
-            kernel_data, res_out, weights, biases);
+        dense_latency(kernel_data, res_out, weights, biases);
+    } else if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled && CONFIG_T::reuse_factor > 1) {
+        CONFIG_T::template dense_unrolled::dense_unrolled(kernel_data, res_out, weights, biases);
     } else {
-        dense_resource(
-            kernel_data, res_out, weights, biases);
+        
dense_resource(kernel_data, res_out, weights, biases); } // Pack output @@ -335,7 +335,7 @@ void compute_output_buffer_1d( const data_T &in_elem, hls::stream &res_stream, typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - #pragma HLS INLINE + #pragma HLS INLINE OFF // Thresholds const static int lShiftX = CONFIG_T::filt_width - 1; @@ -360,13 +360,13 @@ void compute_output_buffer_1d( if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { // Dense multiply - #pragma HLS INLINE recursive + // #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { - dense_latency( - kernel_data, res_out, weights, biases); + dense_latency(kernel_data, res_out, weights, biases); + } else if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled && CONFIG_T::reuse_factor > 1) { + CONFIG_T::template dense_unrolled::dense_unrolled(kernel_data, res_out, weights, biases); } else { - dense_resource( - kernel_data, res_out, weights, biases); + dense_resource(kernel_data, res_out, weights, biases); } // Pack output diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h index c5155d8485..c278606594 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h @@ -11,6 +11,11 @@ namespace nnet { +// Different implementations of Resource strategy; this attribute only makes a difference if strategy == Resource +// Default -> nnet_dense_resource.h +// Unrolled -> Code generation, ignoring zero DSPs and optimizing BRAM +enum resource_implementation { standard, unrolled }; + struct dense_config { // Internal data type definitions typedef float bias_t; @@ -27,7 +32,13 @@ struct dense_config { static const unsigned reuse_factor = 1; static const bool store_weights_in_bram = false; static const unsigned n_zeros = 0; - // partitioning arrays cyclically to go with roll factors? + + static const unsigned resource_implementation = standard; + template + using dense_unrolled = nnet::DenseResourceUnrolled; + + // Partitioning arrays cyclically to go with roll factors? 
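+
+    // Note: when strategy == resource, resource_implementation == unrolled and reuse_factor > 1,
+    // dense() below dispatches to the dense_unrolled member above instead of nnet_dense_resource.h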
+    // Product function to use
     template<class x_T, class y_T>
     using product = nnet::product::mult<x_T, y_T>;
 };
 
@@ -39,6 +50,8 @@ void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],
     #pragma HLS inline
     if (CONFIG_T::strategy == nnet::latency) {
         dense_latency<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    } else if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled && CONFIG_T::reuse_factor > 1) {
+        CONFIG_T::template dense_unrolled<data_T, res_T, CONFIG_T>::dense_unrolled(data, res, weights, biases);
     } else {
         dense_resource<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h
index ad3a972ef6..28bdfa7fe3 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h
@@ -17,6 +17,8 @@ void dense_wrapper(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],
     if (CONFIG_T::strategy == nnet::latency) {
         #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
         dense_latency<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    } else if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled && CONFIG_T::reuse_factor > 1) {
+        CONFIG_T::template dense_unrolled<data_T, res_T, CONFIG_T>::dense_unrolled(data, res, weights, biases);
     } else {
         dense_resource<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }

From 4fa21cbecdb4add9ea68942e3b6c8f3f4a4e9ae2 Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst
Date: Fri, 26 May 2023 21:28:49 +0100
Subject: [PATCH 002/272] Code generation for unrolled Dense

---
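Notes: the generated pragma sizes the multiplier pool as mult_limit - zeros, where zeros counts the
rows of the transposed and reshaped weight matrix that are entirely zero, i.e. DSPs that would only
ever multiply by zero. A minimal NumPy sketch of that count, using made-up toy sizes
(n_in = n_out = reuse_factor = 4, so block_factor = 4):

    import numpy as np

    weights = np.array([1, 0, 2, 0,
                        0, 0, 0, 0,   # all-zero row -> one multiplier saved
                        3, 0, 0, 1,
                        0, 0, 0, 0])  # another all-zero row
    zeros = np.sum(~weights.reshape(4, 4).any(axis=1))
    print(zeros)  # 2 -> the generated '#pragma HLS ALLOCATION ... limit' drops by 2

This mirrors the zeros computation in __generate_unrolled_mult_code_rf_leq_nin below; the reshape view
is only valid once ApplyResourceStrategy has transposed the weights, hence the '_weights_transposed'
guard in match().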
 hls4ml/backends/fpga/passes/codegen.py        | 197 +++++++++++++++++-
 .../vivado/passes/convolution_templates.py    |  10 +-
 .../backends/vivado/passes/core_templates.py  |   3 +-
 hls4ml/backends/vivado/vivado_backend.py      |   1 +
 .../vivado/nnet_utils/nnet_code_gen.h         |   3 +
 .../vivado/nnet_utils/nnet_conv2d_stream.h    |   5 +
 .../vivado/nnet_utils/nnet_conv_stream.h      |   2 +-
 hls4ml/writer/vivado_writer.py                |  19 +-
 8 files changed, 225 insertions(+), 15 deletions(-)

diff --git a/hls4ml/backends/fpga/passes/codegen.py b/hls4ml/backends/fpga/passes/codegen.py
index f1f1080996..2936645355 100644
--- a/hls4ml/backends/fpga/passes/codegen.py
+++ b/hls4ml/backends/fpga/passes/codegen.py
@@ -1,7 +1,8 @@
-from hls4ml.model.layers import Conv1D, Conv2D
-from hls4ml.model.optimizer import OptimizerPass
+import math
+import numpy as np
 from hls4ml.model.types import Source
-
+from hls4ml.model.optimizer import OptimizerPass
+from hls4ml.model.layers import Dense, Conv1D, Conv2D
 
 class GenerateConvIm2col(OptimizerPass):
     '''Generates code for im2col step of 1D/2D convolution'''
@@ -49,3 +50,193 @@ def _generate_im2col_2d(self, node):
         )
 
         node.set_attr('line_buffer_codegen', Source(code_str))
+
+
+class GenerateUnrolledDenseResource(OptimizerPass):
+    '''Generates C++ code for unrolled Dense resource'''
+
+    def match(self, node):
+        # Only apply to layers that use Dense matrix multiplication
+        # TODO - Extend (& test) for Conv1D / Separable Conv / Depthwise Conv / Recurrent layers
+        layers_with_dense = (Dense, Conv2D)
+
+        # Unrolled Dense mimics the hardware implementation of Resource strategy -> apply after Resource optimizer
+        weights_transposed = node.get_attr('_weights_transposed', False)
+
+        # RF = 1 will optimize DSPs anyway, so no need to unroll code
+        rf_gt_one = node.get_attr('reuse_factor') > 1
+
+        # User requested unrolled implementation of Dense
+        is_unrolled = node.get_attr('dense_resource_implementation', 'standard') == 'unrolled'
+
+        return isinstance(node, layers_with_dense) and weights_transposed and rf_gt_one and is_unrolled
+
+    def transform(self, model, node):
+        code_str = self.__generate_unrolled_dense_resource(model, node)
+        node.set_attr('unrolled_dense_resource_codegen', Source(code_str))
+
+    def __generate_unrolled_dense_resource(self, model, node):
+        """
+        Generate a C++ function that mimics the Dense Resource implementation. Similar to Dense Resource, 3 cases are considered
+
+        The HLS compiler produces suboptimal designs for Dense Resource when the weights processed by the same DSP are zero.
+        Latency strategy can optimize zero multiplications, Resource strategy, on the other hand, cannot.
+        Furthermore, when all the weights in the same BRAM block are zero (e.g. due to model pruning), Vivado is unable to optimize it.
+        With this (and additional TCL scripts) zero BRAMs are optimized.
+
+        Args:
+            node: Layer to generate code for
+        Returns:
+            generated_code: Generated C++ function (string)
+        """
+
+        # Variable instantiation and function pragmas
+        generated_code = (
+            "template<class data_T, class res_T, typename CONFIG_T>\n"
+            "class dense_unrolled_{index} : public DenseResourceUnrolled<data_T, res_T, CONFIG_T> {{\n"
+            "  public:\n"
+            "    static void dense_unrolled(\n"
+            "        data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],\n"
+            "        typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],\n"
+            "        typename CONFIG_T::bias_t biases[CONFIG_T::n_out]\n"
+            "    ) {{\n"
+            "        #pragma HLS pipeline II=CONFIG_T::reuse_factor\n"
+            "\n"
+            "        constexpr int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);\n"
+            "        #pragma HLS function_instantiate variable=weights,biases\n"
+            "        #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor\n"
+            "        #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM\n"
+            "        #pragma HLS ARRAY_PARTITION variable=biases complete\n"
+            "\n"
+            "        typename CONFIG_T::accum_t acc[CONFIG_T::n_out];\n"
+            "        #pragma HLS ARRAY_PARTITION variable=acc complete\n"
+            "\n"
+            "        InitAccum:\n"
+            "        for (int i = 0; i < CONFIG_T::n_out; i++) {{\n"
+            "            #pragma HLS UNROLL\n"
+            "            acc[i] = (typename CONFIG_T::accum_t) biases[i];\n"
+            "        }}\n"
+            "\n"
+        ).format(index=node.index)
+
+        # Unrolled multiplication, according to the three cases
+        n_in, n_out = node.model.config.backend.get_layer_mult_size(node)
+        reuse_factor = node.get_attr('reuse_factor')
+        weights = node.weights['weight']
+        if reuse_factor <= n_in:
+            mult_code = self.__generate_unrolled_mult_code_rf_leq_nin(n_in, n_out, reuse_factor, weights)
+        elif reuse_factor > n_in and reuse_factor % n_in == 0:
+            mult_code = self.__generate_unrolled_mult_code_rf_gt_nin_rem0(n_in, n_out, reuse_factor, weights)
+        else:
+            # This case shouldn't happen if my understanding of RF is correct
+            # The function fpga_backend._validate_reuse_factor() has assertion rf % n_in == 0 or rf < n_in
+            raise Exception('Not implemented...')
+
+        # Write output
+        generated_code += mult_code + "\n"
+        generated_code += (
+            "        Result:\n"
+            "        for (int i = 0; i < CONFIG_T::n_out; i++) {\n"
+            "            #pragma HLS UNROLL\n"
+            "            res[i] = cast<data_T, res_T, CONFIG_T>(acc[i]);\n"
+            "        }\n"
+            "    }\n"
+            "};\n"
+        )
+
+        return generated_code
+
+    def __generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, weights):
+        # Function constants
+        mult_factor = min(n_in, reuse_factor)
+        block_factor = int(math.ceil(n_in * n_out / reuse_factor))
+        mult_limit = int(math.ceil(n_in * n_out / mult_factor))
+        mult_scale = mult_limit // n_out
+
+        # Zero DSPs are the DSP blocks that always have zero input
+        # In this case, it is the number of rows in the transposed and reshaped weight matrix
+        # The new shape is (parallel_mult, reuse_factor)
+        zeros = 
np.sum(~weights.data.reshape(block_factor, reuse_factor).any(1)) + + # Generate unrolled multiplications + mult_code = f"\t\t#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n" + mult_code += "\t\tMULT: {\n" + mult_code += "\t\t\t#pragma HLS protocol\n" + + for ir in range(reuse_factor): + acc_step = 0 + out_index = 0 + w_index = ir + in_index = ir + + mult_code += f"\t\t\tM{ir}: {{\n" + for _ in range(block_factor): + if weights.data.flatten()[w_index] != 0: + mult_code += f"\t\t\t\tacc[{out_index}] += static_cast(CONFIG_T::template product::product(data[{in_index}], weights[{w_index}]));\n" + + w_index += reuse_factor + in_index += reuse_factor + if in_index >= n_in: + in_index = ir + if acc_step + 1 >= mult_scale: + acc_step = 0 + out_index += 1 + else: + acc_step += 1 + + mult_code += "\t\t\t}\n" + + mult_code += "\t\t}\n" + + return mult_code + + def __generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor, weights): + # Function constants + mult_factor = min(n_in, reuse_factor) + block_factor = int(math.ceil(n_in * n_out / reuse_factor)) + mult_limit = int(math.ceil(n_in * n_out / mult_factor)) + + # Zero DSPs are the DSP blocks that always have zero input + # In this case, it is the number of rows in the transposed and reshaped weight matrix + # The new shape is (parallel_mult, reuse_factor) + zeros = np.sum(~weights.data.reshape(block_factor, reuse_factor).any(1)) + + # Generate out indices + outidx = [0] * reuse_factor + outstep = 0 + outscale = reuse_factor // n_in + for ir in range(reuse_factor): + outidx[ir] = outstep + if (ir + 1) % n_in == 0: + outstep += 1 + + # Define variables + in_index = 0 + + # Generate unrolled multiplications + mult_code = f"\t\t#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n" + mult_code += "\t\tMULT: {\n" + mult_code += "\t\t\t#pragma HLS protocol\n" + + for ir in range(reuse_factor): + w_index = ir + out_index = outidx[ir] + + mult_code += f"\t\t\tM{ir}: {{\n" + for _ in range(block_factor): + if weights.data.flatten()[w_index] != 0: + mult_code += f"\t\t\t\tacc[{int(out_index)}] += static_cast(CONFIG_T::template product::product(data[{in_index}], weights[{w_index}]));\n" + + w_index += reuse_factor + if w_index > n_in * n_out: + break + out_index += outscale + mult_code += "\t\t\t}\n" + + in_index += 1 + if in_index >= n_in: + in_index = 0 + + mult_code += "\t\t}\n" + + return mult_code + diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index dde42d97fe..f3e8f969af 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -193,10 +193,9 @@ def format(self, node): params['fill_fn'] = 'FillConv2DBuffer' if node.get_attr('dense_resource_implementation', 'standard') == 'unrolled' and node.get_attr('strategy').lower() == 'resource' and node.get_attr('reuse_factor') > 1: - # Implemented in subsequent commits - params['unrolled_function'] = 'DenseResourceUnrolled' + params['unrolled_function'] = f'dense_unrolled_{node.index}' else: - params['unrolled_function'] = 'DenseResourceUnrolled' + params['unrolled_function'] = 'DenseResourceUnrolled' conv_config = self.template.format(**params) @@ -207,8 +206,7 @@ def format(self, node): node.get_input_variable().type.precision, node.get_weights('weight').type.precision ) if node.get_attr('dense_resource_implementation', 'standard') == 'unrolled' and node.get_attr('strategy').lower() == 
'resource' and node.get_attr('reuse_factor') > 1: - # Implemented in subsequent commits - mult_params['unrolled_function'] = 'DenseResourceUnrolled' + mult_params['unrolled_function'] = f'dense_unrolled_{node.index}' else: mult_params['unrolled_function'] = 'DenseResourceUnrolled' mult_config = self.mult_template.format(**mult_params) @@ -299,7 +297,7 @@ def format(self, node): ) # TODO - Extend unrolled Dense Resource to depthwise Conv1D mult_params['unrolled_function'] = 'DenseResourceUnrolled' - + depthwise_mult_config = self.depthwise_mult_template.format(**mult_params) # Pointwise config diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index faabf434eb..9f5353cf93 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -44,8 +44,7 @@ def format(self, node): ) if node.get_attr('dense_resource_implementation', 'standard') == 'unrolled' and node.get_attr('strategy').lower() == 'resource' and node.get_attr('reuse_factor') > 1: - # Implemented in subsequent commits - params['unrolled_function'] = 'DenseResourceUnrolled' + params['unrolled_function'] = f'dense_unrolled_{node.index}' else: params['unrolled_function'] = 'DenseResourceUnrolled' diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 1f71ddcdc4..d2f793568e 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -118,6 +118,7 @@ def _register_flows(self): 'vivado:generate_conv_streaming_instructions', 'vivado:apply_resource_strategy', 'vivado:generate_conv_im2col', + 'vivado:generate_unrolled_dense_resource' ] vivado_types_flow = register_flow('specific_types', vivado_types, requires=[init_flow], backend=self.name) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h index 553044479e..9687cb7b44 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h @@ -2,6 +2,9 @@ #define NNET_INSTR_GEN_H_ #include "nnet_helpers.h" +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_mult.h" namespace nnet { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h index 8a4fb6be81..803fc7cc23 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h @@ -74,6 +74,11 @@ void conv_2d_buffer_cl( static ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)] [CONFIG_T::n_chan]; #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled && CONFIG_T::reuse_factor > 1) { + #pragma HLS allocation instances=compute_output_buffer_1d limit=1 function + #pragma HLS allocation instances=compute_output_buffer_2d limit=1 function + } ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h index 509feb5f35..bb1b97dc07 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h @@ -358,7 +358,7 @@ void compute_output_buffer_1d( // Check to see if we have a full kernel if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { - + // Dense multiply // #pragma 
HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 2fbe3d9438..1f148452ad 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -13,6 +13,14 @@ class VivadoWriter(Writer): + def __get_max_reuse_factor(self, model): + max_rf = 0 + for layer in model.get_layers(): + rf = int(layer.get_attr('reuse_factor')) + if rf > max_rf: + max_rf = rf + return max_rf + def print_array_to_cpp(self, var, odir, write_txt_file=True): """Write a weights array to C++ header files. @@ -171,10 +179,15 @@ def write_project_cpp(self, model): newline += indent + '#pragma HLS INTERFACE ap_vld port={},{} \n'.format( ','.join(all_inputs), ','.join(all_outputs) ) - if model.config.pipeline_style.lower() == 'dataflow': - newline += indent + '#pragma HLS DATAFLOW \n' + + model_cfg = model.config.get_config_value('HLSConfig')['Model'] + if 'DenseResourceImplementation' in model_cfg and model_cfg['DenseResourceImplementation'].lower() == 'unrolled': + newline += indent + f'#pragma HLS PIPELINE ii={self.__get_max_reuse_factor(model)} \n' else: - newline += indent + '#pragma HLS PIPELINE \n' + if model.config.pipeline_style.lower() == 'dataflow': + newline += indent + '#pragma HLS DATAFLOW \n' + else: + newline += indent + '#pragma HLS PIPELINE \n' if io_type == 'io_stream': newline += indent + '#pragma HLS INTERFACE axis port={},{} \n'.format( ','.join(all_inputs), ','.join(all_outputs) From 22e815b1b63a296d4ee260b50346c2f52d9c055f Mon Sep 17 00:00:00 2001 From: Benjamin Ramhorst Date: Mon, 29 May 2023 15:51:37 +0100 Subject: [PATCH 003/272] Fix incorrect BRAM reporting (#798) --- hls4ml/templates/vivado/build_prj.tcl | 6 +++--- .../vivado/nnet_utils/nnet_dense_resource.h | 15 ++++++++++++--- hls4ml/templates/vivado/vivado_synth.tcl | 4 ++-- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index d34337c573..2a8326aae0 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -236,15 +236,15 @@ if {$opt(export)} { if {$opt(vsynth)} { puts "***** VIVADO SYNTHESIS *****" - if {[file exist ${project_name}_prj/solution1/syn/vhdl]} { + if {[file exist ${project_name}_prj/solution1/syn/verilog]} { set time_start [clock clicks -milliseconds] exec vivado -mode batch -source vivado_synth.tcl >@ stdout set time_end [clock clicks -milliseconds] report_time "VIVADO SYNTHESIS" $time_start $time_end } else { - puts "ERROR: Cannot find generated VHDL files. Did you run C synthesis?" + puts "ERROR: Cannot find generated Verilog files. Did you run C synthesis?" 
exit 1 } } -exit +exit \ No newline at end of file diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h index 88de94729b..333a0e75fe 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h @@ -26,10 +26,13 @@ void dense_resource_rf_leq_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T:: assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); #pragma HLS function_instantiate variable=weights,biases - //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor #pragma HLS ARRAY_PARTITION variable=biases complete + if (CONFIG_T::reuse_factor > 1) { + #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM + } + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; #pragma HLS ARRAY_PARTITION variable=acc complete @@ -97,10 +100,13 @@ void dense_resource_rf_gt_nin_rem0(data_T data[CONFIG_T::n_in], res_T res[CONFIG assert((rufactor > nin && rufactor % nin == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0"); #pragma HLS function_instantiate variable=weights,biases - //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor #pragma HLS ARRAY_PARTITION variable=biases complete + if (CONFIG_T::reuse_factor > 1) { + #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM + } + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; #pragma HLS ARRAY_PARTITION variable=acc complete @@ -176,10 +182,13 @@ void dense_resource_rf_gt_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n assert((rufactor > nin) && "This function is correct only for RF > N_IN"); #pragma HLS function_instantiate variable=weights,biases - //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor #pragma HLS ARRAY_PARTITION variable=biases complete + if (CONFIG_T::reuse_factor > 1) { + #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM + } + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; #pragma HLS ARRAY_PARTITION variable=acc complete diff --git a/hls4ml/templates/vivado/vivado_synth.tcl b/hls4ml/templates/vivado/vivado_synth.tcl index 4634b166f6..96bd21c672 100644 --- a/hls4ml/templates/vivado/vivado_synth.tcl +++ b/hls4ml/templates/vivado/vivado_synth.tcl @@ -1,6 +1,6 @@ set tcldir [file dirname [info script]] source [file join $tcldir project.tcl] -add_files ${project_name}_prj/solution1/syn/vhdl +add_files ${project_name}_prj/solution1/syn/verilog synth_design -top ${project_name} -part $part -report_utilization -file vivado_synth.rpt +report_utilization -file vivado_synth.rpt \ No newline at end of file From 9cab74a2eb314f2bcb785af6f38576055f97ef5a Mon Sep 17 00:00:00 2001 From: Benjamin Ramhorst Date: Sun, 11 Jun 2023 18:58:32 +0100 Subject: [PATCH 004/272] Add post-synthesis design optimisation to remove unused BRAM --- hls4ml/templates/vivado/vivado_synth.tcl | 1 + 1 file changed, 1 insertion(+) diff --git a/hls4ml/templates/vivado/vivado_synth.tcl b/hls4ml/templates/vivado/vivado_synth.tcl index 96bd21c672..9f4119d6bd 100644 --- a/hls4ml/templates/vivado/vivado_synth.tcl +++ b/hls4ml/templates/vivado/vivado_synth.tcl @@ -3,4 +3,5 @@ 
source [file join $tcldir project.tcl]

 add_files ${project_name}_prj/solution1/syn/verilog
 synth_design -top ${project_name} -part $part
+opt_design -retarget -propconst -sweep -bram_power_opt -shift_register_opt
 report_utilization -file vivado_synth.rpt
\ No newline at end of file

From d79f868c4dc50d666a1528d6efd076083e226d0c Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst
Date: Sun, 11 Jun 2023 20:47:16 +0100
Subject: [PATCH 005/272] Tests for unrolled Dense

---
 test/pytest/test_dense_unrolled.py | 63 ++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 test/pytest/test_dense_unrolled.py

diff --git a/test/pytest/test_dense_unrolled.py b/test/pytest/test_dense_unrolled.py
new file mode 100644
index 0000000000..69daf9cd96
--- /dev/null
+++ b/test/pytest/test_dense_unrolled.py
@@ -0,0 +1,63 @@
+import pytest
+import numpy as np
+from pathlib import Path
+
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense, Conv2D, Flatten
+
+from hls4ml.utils import config_from_keras_model
+from hls4ml.converters import convert_from_keras_model
+
+test_root_path = Path(__file__).parent
+
+
+# Tests a wide range of RF to ensure the unrolled Dense is correct
+@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
+@pytest.mark.parametrize('reuse_factor', [1, 2, 4, 8, 16, 32, 48, 64, 96, 192])
+def test_dense_unrolled(io_type, reuse_factor):
+    input_shape = (16, )
+    X = np.random.rand(100, *input_shape)
+
+    model = Sequential()
+    model.add(Dense(12, input_shape=input_shape, kernel_initializer='lecun_uniform', bias_initializer='lecun_uniform'))
+    model.compile('adam', 'mse')
+    keras_prediction = model.predict(X)
+
+    config = config_from_keras_model(model, default_precision='ac_fixed<32, 16>', default_reuse_factor=reuse_factor)
+    config['Model']['Strategy'] = 'Resource'
+    config['Model']['DenseResourceImplementation'] = 'Unrolled'
+
+    output_dir = str(test_root_path / f'hls4mlprj_dense_unrolled_{io_type}_{reuse_factor}')
+    hls_model = convert_from_keras_model(
+        model, hls_config=config, output_dir=output_dir, backend='Vivado', io_type=io_type
+    )
+    hls_model.compile()
+
+    hls_prediction = hls_model.predict(X)
+    np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=1e-2)
+
+
+# Tests a wide range of RF on streaming Conv2D to ensure the unrolled Dense is correct
+@pytest.mark.parametrize('io_type', ['io_stream'])
+@pytest.mark.parametrize('reuse_factor', [1, 3, 9, 27, 54, 108])
+def test_dense_unrolled_streaming_conv(io_type, reuse_factor):
+    input_shape = (8, 8, 3)
+    X = np.random.rand(100, *input_shape)
+
+    model = Sequential()
+    model.add(Conv2D(4, (3, 3), input_shape=input_shape, kernel_initializer='lecun_uniform', bias_initializer='lecun_uniform'))
+    model.add(Flatten())
+    model.add(Dense(1, kernel_initializer='lecun_uniform', bias_initializer='lecun_uniform'))
+    model.compile('adam', 'mse')
+    keras_prediction = model.predict(X)
+
+    config = config_from_keras_model(model, default_precision='ac_fixed<32, 16>', default_reuse_factor=reuse_factor)
+    config['Model']['Strategy'] = 'Resource'
+    config['Model']['DenseResourceImplementation'] = 'Unrolled'
+
+    output_dir = str(test_root_path / f'hls4mlprj_dense_unrolled_conv2d_{io_type}_{reuse_factor}')
+    hls_model = convert_from_keras_model(
+        model, hls_config=config, output_dir=output_dir, backend='Vivado', io_type=io_type
+    )
+    hls_model.compile()
+
+    hls_prediction = hls_model.predict(X)
+    np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=1e-2)
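Note: the tests above drive the feature through the model-level key only; the parsing added to
hls4ml/model/graph.py in the first patch also accepts DenseResourceImplementation per layer type and
per layer name, mirroring ConvImplementation. A hedged sketch (the layer name is made up):

    config = config_from_keras_model(model, default_reuse_factor=16)
    config['Model']['Strategy'] = 'Resource'
    config['Model']['DenseResourceImplementation'] = 'Unrolled'
    # Optional overrides, resolved name -> type -> model by get_dense_resource_implementation():
    config['LayerType'] = {'Dense': {'DenseResourceImplementation': 'Unrolled'}}
    config['LayerName'] = {'dense_1': {'DenseResourceImplementation': 'Standard'}}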
From ff86c266008f963f1a16bd1f30a3100ac83a5e0d Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst
Date: Fri, 16 Jun 2023 11:27:59 +0100
Subject: [PATCH 006/272] pre-commit on hls4ml Optimization pt.2

---
 hls4ml/backends/fpga/passes/codegen.py        | 66 +++++++++++--------
 .../vivado/passes/convolution_templates.py    | 16 +++--
 .../backends/vivado/passes/core_templates.py  |  6 +-
 .../vivado/passes/recurrent_templates.py      | 13 ++--
 hls4ml/backends/vivado/vivado_backend.py      | 24 +++----
 hls4ml/model/graph.py                         |  8 ++-
 hls4ml/templates/vivado/build_prj.tcl         |  2 +-
 .../vivado/nnet_utils/nnet_code_gen.h         | 11 ++-
 .../vivado/nnet_utils/nnet_conv2d_stream.h    |  5 +-
 .../vivado/nnet_utils/nnet_conv_stream.h      | 28 +++++---
 .../templates/vivado/nnet_utils/nnet_dense.h  |  9 +--
 .../vivado/nnet_utils/nnet_dense_stream.h     |  3 +-
 hls4ml/templates/vivado/vivado_synth.tcl      |  2 +-
 hls4ml/writer/vivado_writer.py                |  7 +-
 test/pytest/test_dense_unrolled.py            | 28 ++++----
 15 files changed, 135 insertions(+), 93 deletions(-)

diff --git a/hls4ml/backends/fpga/passes/codegen.py b/hls4ml/backends/fpga/passes/codegen.py
index 2936645355..32243356c3 100644
--- a/hls4ml/backends/fpga/passes/codegen.py
+++ b/hls4ml/backends/fpga/passes/codegen.py
@@ -1,8 +1,11 @@
 import math
+
 import numpy as np
-from hls4ml.model.types import Source
+
+from hls4ml.model.layers import Conv1D, Conv2D, Dense
 from hls4ml.model.optimizer import OptimizerPass
-from hls4ml.model.layers import Dense, Conv1D, Conv2D
+from hls4ml.model.types import Source
+
 
 class GenerateConvIm2col(OptimizerPass):
     '''Generates code for im2col step of 1D/2D convolution'''
@@ -51,6 +54,7 @@ def _generate_im2col_2d(self, node):
 
         node.set_attr('line_buffer_codegen', Source(code_str))
 
+
 class GenerateUnrolledDenseResource(OptimizerPass):
     '''Generates C++ code for unrolled Dense resource'''
 
@@ -73,14 +77,15 @@ def match(self, node):
     def transform(self, model, node):
         code_str = self.__generate_unrolled_dense_resource(model, node)
         node.set_attr('unrolled_dense_resource_codegen', Source(code_str))
-
+
     def __generate_unrolled_dense_resource(self, model, node):
         """
-        Generate a C++ function that mimics the Dense Resource implementation. Similar to Dense Resource, 3 cases are considered
+        Generate a C++ function that mimics the Dense Resource implementation.
 
         The HLS compiler produces suboptimal designs for Dense Resource when the weights processed by the same DSP are zero.
-        Latency strategy can optimize zero multiplications, Resource strategy, on the other hand, cannot.
-        Furthermore, when all the weights in the same BRAM block are zero (e.g. due to model pruning), Vivado is unable to optimize it.
+        Latency strategy can optimize zero multiplications.
+        Resource strategy, on the other hand, cannot.
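+        (Concretely, the generated code subtracts one multiplier from its HLS ALLOCATION limit for every
+        all-zero row of the reshaped weight matrix, recovering the optimization Latency strategy gets for free.)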
+        When all the weights in the same BRAM block are zero, Vivado is unable to optimize it.
         With this (and additional TCL scripts) zero BRAMs are optimized.
 
         Args:
             node: Layer to generate code for
         Returns:
             generated_code: Generated C++ function (string)
         """
 
         # Variable instantiation and function pragmas
         generated_code = (
             "template<class data_T, class res_T, typename CONFIG_T>\n"
             "class dense_unrolled_{index} : public DenseResourceUnrolled<data_T, res_T, CONFIG_T> {{\n"
             "  public:\n"
             "    static void dense_unrolled(\n"
             "        data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],\n"
             "        typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],\n"
-            "        typename CONFIG_T::bias_t biases[CONFIG_T::n_out]\n"
+            "        typename CONFIG_T::bias_t biases[CONFIG_T::n_out]\n"
             "    ) {{\n"
             "        #pragma HLS pipeline II=CONFIG_T::reuse_factor\n"
             "\n"
             "        constexpr int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);\n"
             "        #pragma HLS function_instantiate variable=weights,biases\n"
-            "        #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor\n"
+            "        #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor\n"
             "        #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM\n"
             "        #pragma HLS ARRAY_PARTITION variable=biases complete\n"
-            "\n"
+            "\n"
             "        typename CONFIG_T::accum_t acc[CONFIG_T::n_out];\n"
             "        #pragma HLS ARRAY_PARTITION variable=acc complete\n"
             "\n"
             "        InitAccum:\n"
             "        for (int i = 0; i < CONFIG_T::n_out; i++) {{\n"
             "            #pragma HLS UNROLL\n"
             "            acc[i] = (typename CONFIG_T::accum_t) biases[i];\n"
             "        }}\n"
             "\n"
         ).format(index=node.index)
-
+
         # Unrolled multiplication, according to the three cases
         n_in, n_out = node.model.config.backend.get_layer_mult_size(node)
         reuse_factor = node.get_attr('reuse_factor')
         weights = node.weights['weight']
@@ -128,7 +133,7 @@ def __generate_unrolled_dense_resource(self, model, node):
             mult_code = self.__generate_unrolled_mult_code_rf_gt_nin_rem0(n_in, n_out, reuse_factor, weights)
         else:
             # This case shouldn't happen if my understanding of RF is correct
-            # The function fpga_backend._validate_reuse_factor() has assertion rf % n_in == 0 or rf < n_in
+            # The function fpga_backend._validate_reuse_factor() has assertion rf % n_in == 0 or rf < n_in
             raise Exception('Not implemented...')
 
         # Write output
@@ -151,8 +156,8 @@ def __generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, we
         block_factor = int(math.ceil(n_in * n_out / reuse_factor))
         mult_limit = int(math.ceil(n_in * n_out / mult_factor))
         mult_scale = mult_limit // n_out
-
-        # Zero DSPs are the DSP blocks that always have zero input
+
+        # Zero DSPs are the DSP blocks that always have zero input
         # In this case, it is the number of rows in the transposed and reshaped weight matrix
         # The new shape is (parallel_mult, reuse_factor)
         zeros = np.sum(~weights.data.reshape(block_factor, reuse_factor).any(1))
@@ -161,7 +166,7 @@ def __generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, we
         mult_code = f"\t\t#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n"
         mult_code += "\t\tMULT: {\n"
         mult_code += "\t\t\t#pragma HLS protocol\n"
-
+
         for ir in range(reuse_factor):
             acc_step = 0
             out_index = 0
             w_index = ir
             in_index = ir
 
             mult_code += f"\t\t\tM{ir}: {{\n"
             for _ in range(block_factor):
                 if weights.data.flatten()[w_index] != 0:
-                    mult_code += f"\t\t\t\tacc[{out_index}] += static_cast<typename CONFIG_T::accum_t>(CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::product(data[{in_index}], weights[{w_index}]));\n"
-
+                    mult_code += f"\t\t\t\tacc[{out_index}] += \
+                        static_cast<typename CONFIG_T::accum_t>\
+                        (CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::\
+                        product(data[{in_index}], weights[{w_index}]));\n"
+
                 w_index += reuse_factor
                 in_index += reuse_factor
                 if in_index >= n_in:
                     in_index = ir
                 if acc_step + 1 >= mult_scale:
                     acc_step = 0
                     out_index += 1
else: - acc_step += 1 - + acc_step += 1 + mult_code += "\t\t\t}\n" - + mult_code += "\t\t}\n" return mult_code @@ -194,13 +202,13 @@ def __generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor mult_factor = min(n_in, reuse_factor) block_factor = int(math.ceil(n_in * n_out / reuse_factor)) mult_limit = int(math.ceil(n_in * n_out / mult_factor)) - - # Zero DSPs are the DSP blocks that always have zero input + + # Zero DSPs are the DSP blocks that always have zero input # In this case, it is the number of rows in the transposed and reshaped weight matrix # The new shape is (parallel_mult, reuse_factor) zeros = np.sum(~weights.data.reshape(block_factor, reuse_factor).any(1)) - - # Generate out indices + + # Generate out indices outidx = [0] * reuse_factor outstep = 0 outscale = reuse_factor // n_in @@ -216,7 +224,7 @@ def __generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor mult_code = f"\t\t#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n" mult_code += "\t\tMULT: {\n" mult_code += "\t\t\t#pragma HLS protocol\n" - + for ir in range(reuse_factor): w_index = ir out_index = outidx[ir] @@ -224,14 +232,17 @@ def __generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor mult_code += f"\t\t\tM{ir}: {{\n" for _ in range(block_factor): if weights.data.flatten()[w_index] != 0: - mult_code += f"\t\t\t\tacc[{int(out_index)}] += static_cast(CONFIG_T::template product::product(data[{in_index}], weights[{w_index}]));\n" - + mult_code += f"\t\t\t\tacc[{int(out_index)}] += \ + static_cast\ + (CONFIG_T::template product::\ + product(data[{in_index}], weights[{w_index}]));\n" + w_index += reuse_factor if w_index > n_in * n_out: break out_index += outscale mult_code += "\t\t\t}\n" - + in_index += 1 if in_index >= n_in: in_index = 0 @@ -239,4 +250,3 @@ def __generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor mult_code += "\t\t}\n" return mult_code - diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index f3e8f969af..0c5a1da729 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -191,11 +191,15 @@ def format(self, node): params['fill_fn'] = f'fill_buffer_{node.index}' else: params['fill_fn'] = 'FillConv2DBuffer' - - if node.get_attr('dense_resource_implementation', 'standard') == 'unrolled' and node.get_attr('strategy').lower() == 'resource' and node.get_attr('reuse_factor') > 1: + + if ( + node.get_attr('dense_resource_implementation', 'standard') == 'unrolled' + and node.get_attr('strategy').lower() == 'resource' + and node.get_attr('reuse_factor') > 1 + ): params['unrolled_function'] = f'dense_unrolled_{node.index}' else: - params['unrolled_function'] = 'DenseResourceUnrolled' + params['unrolled_function'] = 'DenseResourceUnrolled' conv_config = self.template.format(**params) @@ -205,7 +209,11 @@ def format(self, node): mult_params['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('weight').type.precision ) - if node.get_attr('dense_resource_implementation', 'standard') == 'unrolled' and node.get_attr('strategy').lower() == 'resource' and node.get_attr('reuse_factor') > 1: + if ( + node.get_attr('dense_resource_implementation', 'standard') == 'unrolled' + and node.get_attr('strategy').lower() == 'resource' + and node.get_attr('reuse_factor') > 1 + ): mult_params['unrolled_function'] = 
f'dense_unrolled_{node.index}' else: mult_params['unrolled_function'] = 'DenseResourceUnrolled' diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index 9f5353cf93..5f1a25e37f 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -43,7 +43,11 @@ def format(self, node): node.get_input_variable().type.precision, node.get_weights('weight').type.precision ) - if node.get_attr('dense_resource_implementation', 'standard') == 'unrolled' and node.get_attr('strategy').lower() == 'resource' and node.get_attr('reuse_factor') > 1: + if ( + node.get_attr('dense_resource_implementation', 'standard') == 'unrolled' + and node.get_attr('strategy').lower() == 'resource' + and node.get_attr('reuse_factor') > 1 + ): params['unrolled_function'] = f'dense_unrolled_{node.index}' else: params['unrolled_function'] = 'DenseResourceUnrolled' diff --git a/hls4ml/backends/vivado/passes/recurrent_templates.py b/hls4ml/backends/vivado/passes/recurrent_templates.py index eb12412def..e5c3937fd3 100644 --- a/hls4ml/backends/vivado/passes/recurrent_templates.py +++ b/hls4ml/backends/vivado/passes/recurrent_templates.py @@ -140,10 +140,10 @@ def format(self, node): mult_params1['index'] = str(node.index) + '_1' mult_params1['nzeros'] = node.get_weights('weight').nzeros mult_params1['nonzeros'] = node.get_weights('weight').nonzeros - + # TODO - Extend unrolled Dense Resource to recurrent kernels - mult_params1['unrolled_function'] = 'DenseResourceUnrolled' - + mult_params1['unrolled_function'] = 'DenseResourceUnrolled' + if node.get_attr('return_sequences'): mult_params2['n_in'] = node.get_output_variable().dim_names[1] mult_params2['n_out'] = node.get_output_variable().dim_names[1] + ' * %i' % n_recr_mult @@ -157,15 +157,16 @@ def format(self, node): mult_params2['index'] = str(node.index) + '_2' mult_params2['nzeros'] = node.get_weights('recurrent_weight').nzeros mult_params2['nonzeros'] = node.get_weights('recurrent_weight').nonzeros - + # TODO - Extend unrolled Dense Resource to recurrent kernels - mult_params2['unrolled_function'] = 'DenseResourceUnrolled' - + mult_params2['unrolled_function'] = 'DenseResourceUnrolled' + mult_config1 = self.mult1_template.format(**mult_params1) mult_config2 = self.mult2_template.format(**mult_params2) return mult_config1 + '\n' + mult_config2 + '\n' + recr_act_config + '\n' + act_config + '\n' + recr_config + class RecurrentFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__((LSTM, GRU), include_header=recr_include_list) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index d2f793568e..3300f31dc9 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -73,14 +73,16 @@ def _register_layer_attributes(self): # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer')) self.attribute_map[layer] = attrs - + # Add implementation of Dense Resource for all layers that use Dense for matrix mult - # Handle different implementations of Resource strategy; this attribute only makes a difference if strategy == Resource + # Handle different implementations of Resource strategy; only makes a difference if strategy == Resource # Standard -> nnet_dense_resource.h # Unrolled -> Code generation, ignoring zero DSPs and 
optimizing zero-filled BRAM blocks for layer in [Dense] + cnn_layers + rnn_layers: attrs = self.attribute_map.get(layer, []) - attrs.append(ChoiceAttribute('dense_resource_implementation', choices=['standard', 'unrolled'], default='standard')) + attrs.append( + ChoiceAttribute('dense_resource_implementation', choices=['standard', 'unrolled'], default='standard') + ) self.attribute_map[layer] = attrs def _register_flows(self): @@ -118,7 +120,7 @@ def _register_flows(self): 'vivado:generate_conv_streaming_instructions', 'vivado:apply_resource_strategy', 'vivado:generate_conv_im2col', - 'vivado:generate_unrolled_dense_resource' + 'vivado:generate_unrolled_dense_resource', ] vivado_types_flow = register_flow('specific_types', vivado_types, requires=[init_flow], backend=self.name) @@ -280,7 +282,7 @@ def init_conv1d(self, layer): layer.set_attr('n_partitions', out_width // closest_pf) layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) - + # TODO - Extend unrolled Dense Resource to Conv1D kernels layer.set_attr('dense_resource_implementation', 'standard') @@ -299,10 +301,10 @@ def init_sepconv1d(self, layer): 'n_partitions', 1 ) # TODO Once we have SeparableConv implementation for io_parallel this should be set properly layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) - + # TODO - Extend unrolled Dense Resource to separable Conv1D layer.set_attr('dense_resource_implementation', 'standard') - + @layer_optimizer(Conv2D) def init_conv2d(self, layer): if len(layer.weights['weight'].data.shape) == 2: # This can happen if we assign weights of Dense layer to 1x1 Conv2D @@ -329,7 +331,7 @@ def init_conv2d(self, layer): ) else: closest_pf = chosen_pf - + layer.set_attr('n_partitions', out_height * out_width // closest_pf) layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) layer.set_attr('dense_resource_implementation', layer.model.config.get_dense_resource_implementation(layer).lower()) @@ -366,7 +368,7 @@ def init_depconv2d(self, layer): 'n_partitions', 1 ) # TODO Once we have SeparableConv implementation for io_parallel this should be set properly layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) - + # TODO - Extend unrolled Dense Resource to depthwise Conv2D layer.set_attr('dense_resource_implementation', 'standard') @@ -427,7 +429,7 @@ def init_lstm(self, layer): layer.set_attr('strategy', 'latency') layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', IntegerPrecisionType(width=1, signed=False))) - + # TODO - Extend unrolled Dense Resource to recurrent kernels layer.set_attr('dense_resource_implementation', 'standard') @@ -445,7 +447,7 @@ def init_gru(self, layer): layer.set_attr('strategy', 'latency') layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', IntegerPrecisionType(width=1, signed=False))) - + # TODO - Extend unrolled Dense Resource to recurrent kernels layer.set_attr('dense_resource_implementation', 'standard') diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 57fb31841d..55ec06e18a 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -172,7 +172,9 @@ def get_conv_implementation(self, layer): def get_dense_resource_implementation(self, layer): dense_resource_implementation = self.layer_name_dense_resource_implementation.get(layer.name.lower()) if dense_resource_implementation is None: - dense_resource_implementation = 
self.layer_type_dense_resource_implementation.get(layer.__class__.__name__.lower())
+        dense_resource_implementation = self.layer_type_dense_resource_implementation.get(
+            layer.__class__.__name__.lower()
+        )
         if dense_resource_implementation is None:
             dense_resource_implementation = self.model_dense_resource_implementation
@@ -255,7 +257,7 @@ def _parse_hls_config(self):
                 conv_implementation = layer_cfg.get('ConvImplementation')
                 if conv_implementation is not None:
                     self.layer_type_conv_implementation[layer_type.lower()] = conv_implementation
-
+
                 dense_resource_implementation = layer_cfg.get('DenseResourceImplementation')
                 if dense_resource_implementation is not None:
                     self.layer_type_dense_resource_implementation[layer_type.lower()] = dense_resource_implementation
@@ -289,7 +291,7 @@ def _parse_hls_config(self):
                 conv_implementation = layer_cfg.get('ConvImplementation')
                 if conv_implementation is not None:
                     self.layer_name_conv_implementation[layer_name.lower()] = conv_implementation
-
+
                 dense_resource_implementation = layer_cfg.get('DenseResourceImplementation')
                 if dense_resource_implementation is not None:
                     self.layer_name_dense_resource_implementation[layer_name.lower()] = dense_resource_implementation
diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl
index 2a8326aae0..b6419773cb 100644
--- a/hls4ml/templates/vivado/build_prj.tcl
+++ b/hls4ml/templates/vivado/build_prj.tcl
@@ -247,4 +247,4 @@ if {$opt(vsynth)} {
     }
 }
-exit
\ No newline at end of file
+exit
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h
index 9687cb7b44..caab69663e 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h
@@ -2,6 +2,7 @@
 #define NNET_INSTR_GEN_H_
 
 #include "nnet_helpers.h"
+
 #include "hls_stream.h"
 #include "nnet_common.h"
 #include "nnet_mult.h"
@@ -29,13 +30,11 @@ template class FillConv2DBuffer {
 template class DenseResourceUnrolled {
   public:
-    static void dense_unrolled(
-        data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],
-        typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],
-        typename CONFIG_T::bias_t biases[CONFIG_T::n_out]
-    ) {
+    static void dense_unrolled(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],
+                               typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],
+                               typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
         // To be implemented in subclasses
-    }
+    }
 };
 
 // hls4ml insert code
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h
index 803fc7cc23..08d06501c3 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h
@@ -74,8 +74,9 @@ void conv_2d_buffer_cl(
     static ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)]
                                    [CONFIG_T::n_chan];
     #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2
-
-    if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled && CONFIG_T::reuse_factor > 1) {
+
+    if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled &&
+        CONFIG_T::reuse_factor > 1) {
         #pragma HLS allocation instances=compute_output_buffer_1d limit=1 function
         #pragma HLS allocation instances=compute_output_buffer_2d limit=1 function
     }
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h
index
bb1b97dc07..d95d528e46 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h @@ -291,11 +291,16 @@ void compute_output_buffer_2d( // Dense multiply // #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { - dense_latency(kernel_data, res_out, weights, biases); - } else if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled && CONFIG_T::reuse_factor > 1) { - CONFIG_T::template dense_unrolled::dense_unrolled(kernel_data, res_out, weights, biases); + dense_latency( + kernel_data, res_out, weights, biases); + } else if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled && + CONFIG_T::reuse_factor > 1) { + CONFIG_T::template dense_unrolled::dense_unrolled(kernel_data, res_out, weights, + biases); } else { - dense_resource(kernel_data, res_out, weights, biases); + dense_resource( + kernel_data, res_out, weights, biases); } // Pack output @@ -358,15 +363,20 @@ void compute_output_buffer_1d( // Check to see if we have a full kernel if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { - + // Dense multiply // #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { - dense_latency(kernel_data, res_out, weights, biases); - } else if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled && CONFIG_T::reuse_factor > 1) { - CONFIG_T::template dense_unrolled::dense_unrolled(kernel_data, res_out, weights, biases); + dense_latency( + kernel_data, res_out, weights, biases); + } else if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled && + CONFIG_T::reuse_factor > 1) { + CONFIG_T::template dense_unrolled::dense_unrolled(kernel_data, res_out, weights, + biases); } else { - dense_resource(kernel_data, res_out, weights, biases); + dense_resource( + kernel_data, res_out, weights, biases); } // Pack output diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h index c278606594..2037daf0b9 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h @@ -34,11 +34,11 @@ struct dense_config { static const unsigned n_zeros = 0; static const unsigned resource_implementation = standard; - template + template using dense_unrolled = nnet::DenseResourceUnrolled; - + // Partitioning arrays cyclically to go with roll factors? 
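Every dense call site touched above now makes the same three-way selection, but the flattened diffs drop the angle-bracket template parameter lists, which makes the mechanism hard to follow. A minimal sketch of the dispatch with those lists restored (a reconstruction for reference, not a verbatim copy of nnet_dense.h):

    template <class data_T, class res_T, typename CONFIG_T>
    void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],
               typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],
               typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
        #pragma HLS inline
        if (CONFIG_T::strategy == nnet::latency) {
            dense_latency<data_T, res_T, CONFIG_T>(data, res, weights, biases);
        } else if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled &&
                   CONFIG_T::reuse_factor > 1) {
            // The member template alias injected through the config resolves either to a
            // code-generated dense_unrolled_<index> class or to the DenseResourceUnrolled placeholder
            CONFIG_T::template dense_unrolled<data_T, res_T, CONFIG_T>::dense_unrolled(data, res, weights, biases);
        } else {
            dense_resource<data_T, res_T, CONFIG_T>(data, res, weights, biases);
        }
    }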
- + // Product function to use template using product = nnet::product::mult; }; @@ -50,7 +50,8 @@ void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], #pragma HLS inline if (CONFIG_T::strategy == nnet::latency) { dense_latency(data, res, weights, biases); - } else if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled && CONFIG_T::reuse_factor > 1) { + } else if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled && + CONFIG_T::reuse_factor > 1) { CONFIG_T::template dense_unrolled::dense_unrolled(data, res, weights, biases); } else { dense_resource(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h index 28bdfa7fe3..db3039fc33 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h @@ -17,7 +17,8 @@ void dense_wrapper(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], if (CONFIG_T::strategy == nnet::latency) { #pragma HLS PIPELINE II=CONFIG_T::reuse_factor dense_latency(data, res, weights, biases); - } else if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled and CONFIG_T::reuse_factor > 1) { + } else if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled && + CONFIG_T::reuse_factor > 1) { CONFIG_T::template dense_unrolled::dense_unrolled(data, res, weights, biases); } else { dense_resource(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/vivado_synth.tcl b/hls4ml/templates/vivado/vivado_synth.tcl index 9f4119d6bd..342b1e6740 100644 --- a/hls4ml/templates/vivado/vivado_synth.tcl +++ b/hls4ml/templates/vivado/vivado_synth.tcl @@ -4,4 +4,4 @@ source [file join $tcldir project.tcl] add_files ${project_name}_prj/solution1/syn/verilog synth_design -top ${project_name} -part $part opt_design -retarget -propconst -sweep -bram_power_opt -shift_register_opt -report_utilization -file vivado_synth.rpt \ No newline at end of file +report_utilization -file vivado_synth.rpt diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 1f148452ad..6509fb5e3d 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -20,7 +20,7 @@ def __get_max_reuse_factor(self, model): if rf > max_rf: max_rf = rf return max_rf - + def print_array_to_cpp(self, var, odir, write_txt_file=True): """Write a weights array to C++ header files. 
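__get_max_reuse_factor above scans every layer for the largest reuse factor; the hunk below uses that value as the initiation interval of a single top-level PIPELINE pragma whenever the model requests the unrolled Dense Resource implementation, in place of the usual dataflow pragma. Illustratively, the generated myproject.cpp then takes roughly this shape (port names, types and the II value are invented for the example, not the exact generated code):

    void myproject(input_t input_1[N_INPUT_1_1], result_t layer5_out[N_LAYER_5]) {
        #pragma HLS ARRAY_RESHAPE variable=input_1 complete dim=0
        #pragma HLS ARRAY_PARTITION variable=layer5_out complete dim=0
        #pragma HLS INTERFACE ap_vld port=input_1,layer5_out
        // Emitted by write_project_cpp below; the ii value comes from __get_max_reuse_factor(model)
        #pragma HLS PIPELINE ii=4

        // ... per-layer nnet:: calls are inserted here by the writer ...
    }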
@@ -181,7 +181,10 @@ def write_project_cpp(self, model): ) model_cfg = model.config.get_config_value('HLSConfig')['Model'] - if 'DenseResourceImplementation' in model_cfg and model_cfg['DenseResourceImplementation'].lower() == 'unrolled': + if ( + 'DenseResourceImplementation' in model_cfg + and model_cfg['DenseResourceImplementation'].lower() == 'unrolled' + ): newline += indent + f'#pragma HLS PIPELINE ii={self.__get_max_reuse_factor(model)} \n' else: if model.config.pipeline_style.lower() == 'dataflow': diff --git a/test/pytest/test_dense_unrolled.py b/test/pytest/test_dense_unrolled.py index 69daf9cd96..a3318049be 100644 --- a/test/pytest/test_dense_unrolled.py +++ b/test/pytest/test_dense_unrolled.py @@ -1,20 +1,21 @@ -import pytest -import numpy as np from pathlib import Path +import numpy as np +import pytest +from tensorflow.keras.layers import Conv2D, Dense, Flatten from tensorflow.keras.models import Sequential -from tensorflow.keras.layers import Dense, Conv2D, Flatten -from hls4ml.utils import config_from_keras_model from hls4ml.converters import convert_from_keras_model +from hls4ml.utils import config_from_keras_model test_root_path = Path(__file__).parent + # Tests a wide range of RF to ensure the unrolled Dense is correct @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) @pytest.mark.parametrize('reuse_factor', [1, 2, 4, 8, 16, 32, 48, 64, 96, 192]) def test_dense_unrolled(io_type, reuse_factor): - input_shape = (16, ) + input_shape = (16,) X = np.random.rand(100, *input_shape) model = Sequential() @@ -25,16 +26,15 @@ def test_dense_unrolled(io_type, reuse_factor): config = config_from_keras_model(model, default_precision='ac_fixed<32, 16>', default_reuse_factor=reuse_factor) config['Model']['Strategy'] = 'Resource' config['Model']['DenseResourceImplementation'] = 'Unrolled' - + output_dir = str(test_root_path / f'hls4mlprj_dense_unrolled_{io_type}_{reuse_factor}') - hls_model = convert_from_keras_model( - model, hls_config=config, output_dir=output_dir, backend='Vivado', io_type=io_type - ) + hls_model = convert_from_keras_model(model, hls_config=config, output_dir=output_dir, backend='Vivado', io_type=io_type) hls_model.compile() hls_prediction = hls_model.predict(X) np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=1e-2) + # Tests a wide range RF on streaming Conv2D to ensure the unrolled Dense is correct @pytest.mark.parametrize('io_type', ['io_stream']) @pytest.mark.parametrize('reuse_factor', [1, 3, 9, 27, 54, 108]) @@ -43,7 +43,9 @@ def test_dense_unrolled_streaming_conv(io_type, reuse_factor): X = np.random.rand(100, *input_shape) model = Sequential() - model.add(Conv2D(4, (3, 3), input_shape=input_shape, kernel_initializer='lecun_uniform', bias_initializer='lecun_uniform')) + model.add( + Conv2D(4, (3, 3), input_shape=input_shape, kernel_initializer='lecun_uniform', bias_initializer='lecun_uniform') + ) model.add(Flatten()) model.add(Dense(1, kernel_initializer='lecun_uniform', bias_initializer='lecun_uniform')) model.compile('adam', 'mse') @@ -52,11 +54,9 @@ def test_dense_unrolled_streaming_conv(io_type, reuse_factor): config = config_from_keras_model(model, default_precision='ac_fixed<32, 16>', default_reuse_factor=reuse_factor) config['Model']['Strategy'] = 'Resource' config['Model']['DenseResourceImplementation'] = 'Unrolled' - + output_dir = str(test_root_path / f'hls4mlprj_dense_unrolled_conv2d_{io_type}_{reuse_factor}') - hls_model = convert_from_keras_model( - model, hls_config=config, output_dir=output_dir, 
backend='Vivado', io_type=io_type - ) + hls_model = convert_from_keras_model(model, hls_config=config, output_dir=output_dir, backend='Vivado', io_type=io_type) hls_model.compile() hls_prediction = hls_model.predict(X) From 0f0adc4908e22b23d9a5bd8953d528b527d9523b Mon Sep 17 00:00:00 2001 From: Benjamin Ramhorst Date: Fri, 16 Jun 2023 12:13:13 +0100 Subject: [PATCH 007/272] Fix failing PyTests --- hls4ml/backends/fpga/passes/codegen.py | 2 +- .../backends/vivado/passes/convolution_templates.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/hls4ml/backends/fpga/passes/codegen.py b/hls4ml/backends/fpga/passes/codegen.py index 32243356c3..09e600d421 100644 --- a/hls4ml/backends/fpga/passes/codegen.py +++ b/hls4ml/backends/fpga/passes/codegen.py @@ -67,7 +67,7 @@ def match(self, node): weights_transposed = node.get_attr('_weights_transposed', False) # RF = 1 will optimize DSPs anyway, so no need to unroll code - rf_gt_one = node.get_attr('reuse_factor') > 1 + rf_gt_one = node.get_attr('reuse_factor', 1) > 1 # User requested unrolled implementation of Dense is_unrolled = node.get_attr('dense_resource_implementation', 'standard') == 'unrolled' diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index 0c5a1da729..2b9fe13b7a 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -39,6 +39,9 @@ static const bool store_weights_in_bram = false; static const unsigned strategy = nnet::{strategy}; static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; + static const unsigned resource_implementation = nnet::{dense_resource_implementation}; + template + using dense_unrolled = nnet::{unrolled_function}; static const unsigned min_width = {min_width}; static const ap_uint pixels[min_width]; static const unsigned n_partitions = {n_partitions}; @@ -80,6 +83,8 @@ def format(self, node): params['fill_fn'] = f'fill_buffer_{node.index}' else: params['fill_fn'] = 'FillConv1DBuffer' + # TODO - Extend unrolled Dense Resource to Conv1D + params['unrolled_function'] = 'DenseResourceUnrolled' conv_config = self.template.format(**params) @@ -292,6 +297,8 @@ def format(self, node): params['scale_index_type'] = 'scale_index_regular' params['config_t'] = f'config{node.index}_depthwise_mult' + # TODO - Extend unrolled Dense Resource + params['unrolled_function'] = 'DenseResourceUnrolled' depthwise_config = self.depthwise_template.format(**params) # Depthwise mult config @@ -334,6 +341,8 @@ def format(self, node): params['scale_index_type'] = 'scale_index_regular' params['config_t'] = f'config{node.index}_pointwise_mult' + # TODO - Extend unrolled Dense Resource + params['unrolled_function'] = 'DenseResourceUnrolled' pointwise_config = self.pointwise_template.format(**params) # Pointwise mult config @@ -419,6 +428,8 @@ def format(self, node): params['scale_index_width_type'] = 'scale_index_regular' params['config_t'] = f'config{node.index}_depthwise_mult' + # TODO - Extend unrolled Dense Resource + params['unrolled_function'] = 'DenseResourceUnrolled' depthwise_config = self.depthwise_template.format(**params) # Depthwise mult config @@ -464,6 +475,8 @@ def format(self, node): else: params['scale_index_width_type'] = 'scale_index_regular' params['config_t'] = f'config{node.index}_pointwise_mult' + # TODO - Extend unrolled Dense Resource + params['unrolled_function'] = 'DenseResourceUnrolled' 
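These separable-conv mult configs keep the DenseResourceUnrolled placeholder until the unrolled implementation is extended to them; the fallback is safe because the kernels only take the unrolled branch when resource_implementation is nnet::unrolled and reuse_factor > 1. A hypothetical rendering of one emitted config (all names and sizes invented for illustration):

    struct config4_mult : nnet::dense_config {
        static const unsigned n_in = 3 * 16; // filt_width * n_chan for this made-up layer
        static const unsigned n_out = 8;
        static const unsigned strategy = nnet::resource;
        static const unsigned reuse_factor = 2;
        // Placeholder wiring: keeps the struct well-formed but is never invoked,
        // since resource_implementation stays nnet::standard
        static const unsigned resource_implementation = nnet::standard;
        template <class d_T, class r_T, class c_T>
        using dense_unrolled = nnet::DenseResourceUnrolled<d_T, r_T, c_T>;
        typedef ap_fixed<16, 6> accum_t;
        typedef ap_fixed<16, 6> weight_t;
        typedef ap_fixed<16, 6> bias_t;
    };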
pointwise_config = self.pointwise_template.format(**params) # Pointwise mult config From ea5c5a86fb2d353a90eb1300824b6529888b26d8 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 14 Jun 2023 08:47:30 -0700 Subject: [PATCH 008/272] merge --- hls4ml/templates/vivado/build_prj.tcl | 2 +- .../templates/vivado/nnet_utils/nnet_common.h | 1 + .../templates/vivado/nnet_utils/nnet_conv1d.h | 16 +- .../vivado/nnet_utils/nnet_conv1d_latency.h | 221 ++++++++++++++++++ .../vivado/nnet_utils/nnet_conv_stream.h | 2 - 5 files changed, 237 insertions(+), 5 deletions(-) diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index d34337c573..6383b910ca 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -161,7 +161,7 @@ if {$opt(reset)} { } else { open_solution "solution1" } -catch {config_array_partition -maximum_size 4096} +catch {config_array_partition -maximum_size 8192} config_compile -name_max_length 80 set_part $part config_schedule -enable_dsp_full_reg=false diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index fed0395a1a..b6582e1406 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -24,6 +24,7 @@ namespace nnet { // Common type definitions enum io_type { io_parallel = 0, io_stream }; enum strategy { latency, resource }; +enum class conv_implementation { linebuffer=0, encoded=1, pointwise=2}; /* --- * Balanced tree reduce implementation. diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index e2e0211b49..c2990ea97a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -53,9 +53,21 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], #pragma HLS INLINE region - // Nothing special to be done for io_parallel implementation if (CONFIG_T::strategy == nnet::latency) { - conv_1d_latency_cl(data, res, weights, biases); + if (CONFIG_T::implementation == conv_implementation::pointwise){ + // Use pointwise unrolled implementation + if (CONFIG_T::reuse_factor > 1 && CONFIG_T::reuse_factor <= 120) { + pointwise_conv_1d_latency_cl_split_by_rf(data, res, weights, biases); + } + else { + assert(CONFIG_T::reuse_factor == 1); + pointwise_conv_1d_latency_cl(data, res, weights, biases); + } + } + else { + // Use standard unrolled implementation + conv_1d_resource_cl(data, res, weights, biases); + } } else { conv_1d_resource_cl(data, res, weights, biases); } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 0d9afb10cb..8549ae9add 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -84,5 +84,226 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } +template +void pointwise_conv_1d_latency_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan/CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan/CONFIG_T::reuse_factor]; + typename 
CONFIG_T::accum_t acc[CONFIG_T::out_width/CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + //const int multiplier_limit = compute_multiplier_limit(weights); + //#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + + // Convolve, saving all multiplication results to accumulate later + ConvOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { + ConvFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; + int index_weight = cc*CONFIG_T::n_filt + ff; + int index_data = (ii*CONFIG_T::stride_width-CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if((ii*CONFIG_T::stride_width) < CONFIG_T::pad_left || (ii*CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)){ + mult[index_mult] = 0; + } + else { + mult[index_mult] = data[index_data] * weights[index_weight]; + } + }//end channel loop + }//end filter loop + }//end output loop + + + // Initialize accumulator with input biases + for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { + for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + acc[ii][ff]=biases[ff]; + } + } + + + // Accumulate multiplication result + AccumOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { + AccumFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + //Do "dot product" sum within filter and sum over channels + AccumChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + }//end channel loop + }//end filter loop + }//end output loop + + + // Cast to "res_t" type + for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { + for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); + } + } +} + +template void pointwise_conv_1d_latency_cl_split_by_rf( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + + data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor]; + #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0 + res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width*CONFIG_T::n_filt/CONFIG_T::reuse_factor]; + #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0 + + for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + for(int ii = 0; ii < CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor; ii++) { + #pragma HLS UNROLL + data_tmp[jj][ii] = data[jj*CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor+ii]; + } + } + + pointwise_conv_1d_latency_cl(data_tmp[0], res_tmp[0], weights, biases); + pointwise_conv_1d_latency_cl(data_tmp[1], res_tmp[1], weights, biases); + if (CONFIG_T::reuse_factor > 2) pointwise_conv_1d_latency_cl(data_tmp[2], res_tmp[2], weights, 
biases); + if (CONFIG_T::reuse_factor > 3) pointwise_conv_1d_latency_cl(data_tmp[3], res_tmp[3], weights, biases); + if (CONFIG_T::reuse_factor > 4) pointwise_conv_1d_latency_cl(data_tmp[4], res_tmp[4], weights, biases); + if (CONFIG_T::reuse_factor > 5) pointwise_conv_1d_latency_cl(data_tmp[5], res_tmp[5], weights, biases); + if (CONFIG_T::reuse_factor > 6) pointwise_conv_1d_latency_cl(data_tmp[6], res_tmp[6], weights, biases); + if (CONFIG_T::reuse_factor > 7) pointwise_conv_1d_latency_cl(data_tmp[7], res_tmp[7], weights, biases); + if (CONFIG_T::reuse_factor > 8) pointwise_conv_1d_latency_cl(data_tmp[8], res_tmp[8], weights, biases); + if (CONFIG_T::reuse_factor > 9) pointwise_conv_1d_latency_cl(data_tmp[9], res_tmp[9], weights, biases); + if (CONFIG_T::reuse_factor > 10) pointwise_conv_1d_latency_cl(data_tmp[10], res_tmp[10], weights, biases); + if (CONFIG_T::reuse_factor > 11) pointwise_conv_1d_latency_cl(data_tmp[11], res_tmp[11], weights, biases); + if (CONFIG_T::reuse_factor > 12) pointwise_conv_1d_latency_cl(data_tmp[12], res_tmp[12], weights, biases); + if (CONFIG_T::reuse_factor > 13) pointwise_conv_1d_latency_cl(data_tmp[13], res_tmp[13], weights, biases); + if (CONFIG_T::reuse_factor > 14) pointwise_conv_1d_latency_cl(data_tmp[14], res_tmp[14], weights, biases); + if (CONFIG_T::reuse_factor > 15) pointwise_conv_1d_latency_cl(data_tmp[15], res_tmp[15], weights, biases); + if (CONFIG_T::reuse_factor > 16) pointwise_conv_1d_latency_cl(data_tmp[16], res_tmp[16], weights, biases); + if (CONFIG_T::reuse_factor > 17) pointwise_conv_1d_latency_cl(data_tmp[17], res_tmp[17], weights, biases); + if (CONFIG_T::reuse_factor > 18) pointwise_conv_1d_latency_cl(data_tmp[18], res_tmp[18], weights, biases); + if (CONFIG_T::reuse_factor > 19) pointwise_conv_1d_latency_cl(data_tmp[19], res_tmp[19], weights, biases); + if (CONFIG_T::reuse_factor > 20) pointwise_conv_1d_latency_cl(data_tmp[20], res_tmp[20], weights, biases); + if (CONFIG_T::reuse_factor > 21) pointwise_conv_1d_latency_cl(data_tmp[21], res_tmp[21], weights, biases); + if (CONFIG_T::reuse_factor > 22) pointwise_conv_1d_latency_cl(data_tmp[22], res_tmp[22], weights, biases); + if (CONFIG_T::reuse_factor > 23) pointwise_conv_1d_latency_cl(data_tmp[23], res_tmp[23], weights, biases); + if (CONFIG_T::reuse_factor > 24) pointwise_conv_1d_latency_cl(data_tmp[24], res_tmp[24], weights, biases); + if (CONFIG_T::reuse_factor > 25) pointwise_conv_1d_latency_cl(data_tmp[25], res_tmp[25], weights, biases); + if (CONFIG_T::reuse_factor > 26) pointwise_conv_1d_latency_cl(data_tmp[26], res_tmp[26], weights, biases); + if (CONFIG_T::reuse_factor > 27) pointwise_conv_1d_latency_cl(data_tmp[27], res_tmp[27], weights, biases); + if (CONFIG_T::reuse_factor > 28) pointwise_conv_1d_latency_cl(data_tmp[28], res_tmp[28], weights, biases); + if (CONFIG_T::reuse_factor > 29) pointwise_conv_1d_latency_cl(data_tmp[29], res_tmp[29], weights, biases); + if (CONFIG_T::reuse_factor > 30) pointwise_conv_1d_latency_cl(data_tmp[30], res_tmp[30], weights, biases); + if (CONFIG_T::reuse_factor > 31) pointwise_conv_1d_latency_cl(data_tmp[31], res_tmp[31], weights, biases); + if (CONFIG_T::reuse_factor > 32) pointwise_conv_1d_latency_cl(data_tmp[32], res_tmp[32], weights, biases); + if (CONFIG_T::reuse_factor > 33) pointwise_conv_1d_latency_cl(data_tmp[33], res_tmp[33], weights, biases); + if (CONFIG_T::reuse_factor > 34) pointwise_conv_1d_latency_cl(data_tmp[34], res_tmp[34], weights, biases); + if (CONFIG_T::reuse_factor > 35) pointwise_conv_1d_latency_cl(data_tmp[35], 
res_tmp[35], weights, biases);
+    if (CONFIG_T::reuse_factor > 36) pointwise_conv_1d_latency_cl(data_tmp[36], res_tmp[36], weights, biases);
+    if (CONFIG_T::reuse_factor > 37) pointwise_conv_1d_latency_cl(data_tmp[37], res_tmp[37], weights, biases);
+    if (CONFIG_T::reuse_factor > 38) pointwise_conv_1d_latency_cl(data_tmp[38], res_tmp[38], weights, biases);
+    if (CONFIG_T::reuse_factor > 39) pointwise_conv_1d_latency_cl(data_tmp[39], res_tmp[39], weights, biases);
+    if (CONFIG_T::reuse_factor > 40) pointwise_conv_1d_latency_cl(data_tmp[40], res_tmp[40], weights, biases);
+    if (CONFIG_T::reuse_factor > 41) pointwise_conv_1d_latency_cl(data_tmp[41], res_tmp[41], weights, biases);
+    if (CONFIG_T::reuse_factor > 42) pointwise_conv_1d_latency_cl(data_tmp[42], res_tmp[42], weights, biases);
+    if (CONFIG_T::reuse_factor > 43) pointwise_conv_1d_latency_cl(data_tmp[43], res_tmp[43], weights, biases);
+    if (CONFIG_T::reuse_factor > 44) pointwise_conv_1d_latency_cl(data_tmp[44], res_tmp[44], weights, biases);
+    if (CONFIG_T::reuse_factor > 45) pointwise_conv_1d_latency_cl(data_tmp[45], res_tmp[45], weights, biases);
+    if (CONFIG_T::reuse_factor > 46) pointwise_conv_1d_latency_cl(data_tmp[46], res_tmp[46], weights, biases);
+    if (CONFIG_T::reuse_factor > 47) pointwise_conv_1d_latency_cl(data_tmp[47], res_tmp[47], weights, biases);
+    if (CONFIG_T::reuse_factor > 48) pointwise_conv_1d_latency_cl(data_tmp[48], res_tmp[48], weights, biases);
+    if (CONFIG_T::reuse_factor > 49) pointwise_conv_1d_latency_cl(data_tmp[49], res_tmp[49], weights, biases);
+    if (CONFIG_T::reuse_factor > 50) pointwise_conv_1d_latency_cl(data_tmp[50], res_tmp[50], weights, biases);
+    if (CONFIG_T::reuse_factor > 51) pointwise_conv_1d_latency_cl(data_tmp[51], res_tmp[51], weights, biases);
+    if (CONFIG_T::reuse_factor > 52) pointwise_conv_1d_latency_cl(data_tmp[52], res_tmp[52], weights, biases);
+    if (CONFIG_T::reuse_factor > 53) pointwise_conv_1d_latency_cl(data_tmp[53], res_tmp[53], weights, biases);
+    if (CONFIG_T::reuse_factor > 54) pointwise_conv_1d_latency_cl(data_tmp[54], res_tmp[54], weights, biases);
+    if (CONFIG_T::reuse_factor > 55) pointwise_conv_1d_latency_cl(data_tmp[55], res_tmp[55], weights, biases);
+    if (CONFIG_T::reuse_factor > 56) pointwise_conv_1d_latency_cl(data_tmp[56], res_tmp[56], weights, biases);
+    if (CONFIG_T::reuse_factor > 57) pointwise_conv_1d_latency_cl(data_tmp[57], res_tmp[57], weights, biases);
+    if (CONFIG_T::reuse_factor > 58) pointwise_conv_1d_latency_cl(data_tmp[58], res_tmp[58], weights, biases);
+    if (CONFIG_T::reuse_factor > 59) pointwise_conv_1d_latency_cl(data_tmp[59], res_tmp[59], weights, biases);
+    if (CONFIG_T::reuse_factor > 60) pointwise_conv_1d_latency_cl(data_tmp[60], res_tmp[60], weights, biases);
+    if (CONFIG_T::reuse_factor > 61) pointwise_conv_1d_latency_cl(data_tmp[61], res_tmp[61], weights, biases);
+    if (CONFIG_T::reuse_factor > 62) pointwise_conv_1d_latency_cl(data_tmp[62], res_tmp[62], weights, biases);
+    if (CONFIG_T::reuse_factor > 63) pointwise_conv_1d_latency_cl(data_tmp[63], res_tmp[63], weights, biases);
+    if (CONFIG_T::reuse_factor > 64) pointwise_conv_1d_latency_cl(data_tmp[64], res_tmp[64], weights, biases);
+    if (CONFIG_T::reuse_factor > 65) pointwise_conv_1d_latency_cl(data_tmp[65], res_tmp[65], weights, biases);
+    if (CONFIG_T::reuse_factor > 66) pointwise_conv_1d_latency_cl(data_tmp[66], res_tmp[66], weights, biases);
+    if (CONFIG_T::reuse_factor > 67) pointwise_conv_1d_latency_cl(data_tmp[67], res_tmp[67], weights, biases);
+    if (CONFIG_T::reuse_factor > 68)
pointwise_conv_1d_latency_cl(data_tmp[68], res_tmp[68], weights, biases); + if (CONFIG_T::reuse_factor > 69) pointwise_conv_1d_latency_cl(data_tmp[69], res_tmp[69], weights, biases); + if (CONFIG_T::reuse_factor > 70) pointwise_conv_1d_latency_cl(data_tmp[70], res_tmp[70], weights, biases); + if (CONFIG_T::reuse_factor > 71) pointwise_conv_1d_latency_cl(data_tmp[71], res_tmp[71], weights, biases); + if (CONFIG_T::reuse_factor > 72) pointwise_conv_1d_latency_cl(data_tmp[72], res_tmp[72], weights, biases); + if (CONFIG_T::reuse_factor > 73) pointwise_conv_1d_latency_cl(data_tmp[73], res_tmp[73], weights, biases); + if (CONFIG_T::reuse_factor > 74) pointwise_conv_1d_latency_cl(data_tmp[74], res_tmp[74], weights, biases); + if (CONFIG_T::reuse_factor > 75) pointwise_conv_1d_latency_cl(data_tmp[75], res_tmp[75], weights, biases); + if (CONFIG_T::reuse_factor > 76) pointwise_conv_1d_latency_cl(data_tmp[76], res_tmp[76], weights, biases); + if (CONFIG_T::reuse_factor > 77) pointwise_conv_1d_latency_cl(data_tmp[77], res_tmp[77], weights, biases); + if (CONFIG_T::reuse_factor > 78) pointwise_conv_1d_latency_cl(data_tmp[78], res_tmp[78], weights, biases); + if (CONFIG_T::reuse_factor > 79) pointwise_conv_1d_latency_cl(data_tmp[79], res_tmp[79], weights, biases); + if (CONFIG_T::reuse_factor > 80) pointwise_conv_1d_latency_cl(data_tmp[80], res_tmp[80], weights, biases); + if (CONFIG_T::reuse_factor > 81) pointwise_conv_1d_latency_cl(data_tmp[81], res_tmp[81], weights, biases); + if (CONFIG_T::reuse_factor > 82) pointwise_conv_1d_latency_cl(data_tmp[82], res_tmp[82], weights, biases); + if (CONFIG_T::reuse_factor > 83) pointwise_conv_1d_latency_cl(data_tmp[83], res_tmp[83], weights, biases); + if (CONFIG_T::reuse_factor > 84) pointwise_conv_1d_latency_cl(data_tmp[84], res_tmp[84], weights, biases); + if (CONFIG_T::reuse_factor > 85) pointwise_conv_1d_latency_cl(data_tmp[85], res_tmp[85], weights, biases); + if (CONFIG_T::reuse_factor > 86) pointwise_conv_1d_latency_cl(data_tmp[86], res_tmp[86], weights, biases); + if (CONFIG_T::reuse_factor > 87) pointwise_conv_1d_latency_cl(data_tmp[87], res_tmp[87], weights, biases); + if (CONFIG_T::reuse_factor > 88) pointwise_conv_1d_latency_cl(data_tmp[88], res_tmp[88], weights, biases); + if (CONFIG_T::reuse_factor > 89) pointwise_conv_1d_latency_cl(data_tmp[89], res_tmp[89], weights, biases); + if (CONFIG_T::reuse_factor > 90) pointwise_conv_1d_latency_cl(data_tmp[90], res_tmp[90], weights, biases); + if (CONFIG_T::reuse_factor > 91) pointwise_conv_1d_latency_cl(data_tmp[91], res_tmp[91], weights, biases); + if (CONFIG_T::reuse_factor > 92) pointwise_conv_1d_latency_cl(data_tmp[92], res_tmp[92], weights, biases); + if (CONFIG_T::reuse_factor > 93) pointwise_conv_1d_latency_cl(data_tmp[93], res_tmp[93], weights, biases); + if (CONFIG_T::reuse_factor > 94) pointwise_conv_1d_latency_cl(data_tmp[94], res_tmp[94], weights, biases); + if (CONFIG_T::reuse_factor > 95) pointwise_conv_1d_latency_cl(data_tmp[95], res_tmp[95], weights, biases); + if (CONFIG_T::reuse_factor > 96) pointwise_conv_1d_latency_cl(data_tmp[96], res_tmp[96], weights, biases); + if (CONFIG_T::reuse_factor > 97) pointwise_conv_1d_latency_cl(data_tmp[97], res_tmp[97], weights, biases); + if (CONFIG_T::reuse_factor > 98) pointwise_conv_1d_latency_cl(data_tmp[98], res_tmp[98], weights, biases); + if (CONFIG_T::reuse_factor > 99) pointwise_conv_1d_latency_cl(data_tmp[99], res_tmp[99], weights, biases); + if (CONFIG_T::reuse_factor > 100) pointwise_conv_1d_latency_cl(data_tmp[100], res_tmp[100], 
weights, biases); + if (CONFIG_T::reuse_factor > 101) pointwise_conv_1d_latency_cl(data_tmp[101], res_tmp[101], weights, biases); + if (CONFIG_T::reuse_factor > 102) pointwise_conv_1d_latency_cl(data_tmp[102], res_tmp[102], weights, biases); + if (CONFIG_T::reuse_factor > 103) pointwise_conv_1d_latency_cl(data_tmp[103], res_tmp[103], weights, biases); + if (CONFIG_T::reuse_factor > 104) pointwise_conv_1d_latency_cl(data_tmp[104], res_tmp[104], weights, biases); + if (CONFIG_T::reuse_factor > 105) pointwise_conv_1d_latency_cl(data_tmp[105], res_tmp[105], weights, biases); + if (CONFIG_T::reuse_factor > 106) pointwise_conv_1d_latency_cl(data_tmp[106], res_tmp[106], weights, biases); + if (CONFIG_T::reuse_factor > 107) pointwise_conv_1d_latency_cl(data_tmp[107], res_tmp[107], weights, biases); + if (CONFIG_T::reuse_factor > 108) pointwise_conv_1d_latency_cl(data_tmp[108], res_tmp[108], weights, biases); + if (CONFIG_T::reuse_factor > 109) pointwise_conv_1d_latency_cl(data_tmp[109], res_tmp[109], weights, biases); + if (CONFIG_T::reuse_factor > 110) pointwise_conv_1d_latency_cl(data_tmp[110], res_tmp[110], weights, biases); + if (CONFIG_T::reuse_factor > 111) pointwise_conv_1d_latency_cl(data_tmp[111], res_tmp[111], weights, biases); + if (CONFIG_T::reuse_factor > 112) pointwise_conv_1d_latency_cl(data_tmp[112], res_tmp[112], weights, biases); + if (CONFIG_T::reuse_factor > 113) pointwise_conv_1d_latency_cl(data_tmp[113], res_tmp[113], weights, biases); + if (CONFIG_T::reuse_factor > 114) pointwise_conv_1d_latency_cl(data_tmp[114], res_tmp[114], weights, biases); + if (CONFIG_T::reuse_factor > 115) pointwise_conv_1d_latency_cl(data_tmp[115], res_tmp[115], weights, biases); + if (CONFIG_T::reuse_factor > 116) pointwise_conv_1d_latency_cl(data_tmp[116], res_tmp[116], weights, biases); + if (CONFIG_T::reuse_factor > 117) pointwise_conv_1d_latency_cl(data_tmp[117], res_tmp[117], weights, biases); + if (CONFIG_T::reuse_factor > 118) pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); + if (CONFIG_T::reuse_factor > 119) pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); + + for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + for(int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor; ii++) { + #pragma HLS UNROLL + res[jj*CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; + } + } +} + } // namespace nnet #endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h index 7bd47442f6..b763938cb3 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h @@ -8,8 +8,6 @@ namespace nnet { -enum class conv_implementation { linebuffer = 0, encoded = 1 }; - // ************************************************* // Encoded Implementation (Vlad's) // ************************************************* From 6849e0b4d0a1b352cac1d61870273882dc112705 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 22 Dec 2022 16:21:25 -0600 Subject: [PATCH 009/272] add pointwise --- hls4ml/backends/vivado/vivado_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 1d4c96d982..4dab5f5c18 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -72,7 +72,7 @@ def _register_layer_attributes(self): for layer in cnn_layers: attrs 
= self.attribute_map.get(layer, []) # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) - attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer')) + attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded', 'Pointwise'], default='LineBuffer')) self.attribute_map[layer] = attrs def _register_flows(self): From 0244b666652e2667c8df72c134f9abd94c731685 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 25 Mar 2023 18:29:44 -0700 Subject: [PATCH 010/272] latency --- hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index c2990ea97a..e2dee3485a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -66,7 +66,7 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } else { // Use standard unrolled implementation - conv_1d_resource_cl(data, res, weights, biases); + conv_1d_latency_cl(data, res, weights, biases); } } else { conv_1d_resource_cl(data, res, weights, biases); From 3ae7752e70dc43d0687b39a90d7c4d0fd6f9b797 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 25 Mar 2023 18:56:58 -0700 Subject: [PATCH 011/272] unroll --- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 8549ae9add..4179c1dde8 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -104,6 +104,7 @@ void pointwise_conv_1d_latency_cl( // Parallel mode #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization @@ -114,6 +115,7 @@ void pointwise_conv_1d_latency_cl( ConvOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { ConvFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { ConvChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; int index_weight = cc*CONFIG_T::n_filt + ff; int index_data = (ii*CONFIG_T::stride_width-CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; @@ -132,6 +134,7 @@ void pointwise_conv_1d_latency_cl( // Initialize accumulator with input biases for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL acc[ii][ff]=biases[ff]; } } @@ -152,6 +155,7 @@ void pointwise_conv_1d_latency_cl( // Cast to "res_t" type for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); } } @@ -169,7 +173,9 @@ template void pointwise_conv_1d_la res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width*CONFIG_T::n_filt/CONFIG_T::reuse_factor]; #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0 + RFInputLoop: for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + InnerInputLoop: for(int ii = 0; ii < CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor; ii++) { #pragma HLS 
UNROLL data_tmp[jj][ii] = data[jj*CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor+ii]; @@ -297,7 +303,9 @@ template void pointwise_conv_1d_la if (CONFIG_T::reuse_factor > 118) pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); if (CONFIG_T::reuse_factor > 119) pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); + RFOutputLoop: for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + InnerOutputLoop: for(int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL res[jj*CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; From 23126b70ca5496bcc7da993d95a8d939920bd8bc Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 26 Mar 2023 17:19:08 -0700 Subject: [PATCH 012/272] add hls unroll --- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 4179c1dde8..c5b520c703 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -175,6 +175,7 @@ template void pointwise_conv_1d_la RFInputLoop: for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + #pragma HLS UNROLL InnerInputLoop: for(int ii = 0; ii < CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL @@ -305,6 +306,7 @@ template void pointwise_conv_1d_la RFOutputLoop: for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + #pragma HLS UNROLL InnerOutputLoop: for(int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL From 6aff9e996df95955d010013c2163a723ab8a8170 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 8 Jun 2023 08:15:11 -0700 Subject: [PATCH 013/272] fix pragma from walkie --- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index c5b520c703..c423c7a228 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -108,8 +108,8 @@ void pointwise_conv_1d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - //const int multiplier_limit = compute_multiplier_limit(weights); - //#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + int multiplier_limit = ceil ( (float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor )* CONFIG_T::n_filt * CONFIG_T::n_chan ) / float(CONFIG_T::reuse_factor) ); + #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit // Convolve, saving all multiplication results to accumulate later ConvOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { From 7f1c318dea6767d5b0e4996786c356d48bfa4560 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 14 Jun 2023 18:46:37 +0000 Subject: [PATCH 014/272] [pre-commit.ci] auto fixes from pre-commit hooks --- hls4ml/backends/vivado/vivado_backend.py | 4 +- .../templates/vivado/nnet_utils/nnet_common.h | 2 +- .../templates/vivado/nnet_utils/nnet_conv1d.h | 8 +- .../vivado/nnet_utils/nnet_conv1d_latency.h | 488 +++++++++++------- 4 files changed, 311 
insertions(+), 191 deletions(-) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 4dab5f5c18..1eb58f0952 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -72,7 +72,9 @@ def _register_layer_attributes(self): for layer in cnn_layers: attrs = self.attribute_map.get(layer, []) # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) - attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded', 'Pointwise'], default='LineBuffer')) + attrs.append( + ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded', 'Pointwise'], default='LineBuffer') + ) self.attribute_map[layer] = attrs def _register_flows(self): diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index b6582e1406..e942a1dc89 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -24,7 +24,7 @@ namespace nnet { // Common type definitions enum io_type { io_parallel = 0, io_stream }; enum strategy { latency, resource }; -enum class conv_implementation { linebuffer=0, encoded=1, pointwise=2}; +enum class conv_implementation { linebuffer = 0, encoded = 1, pointwise = 2 }; /* --- * Balanced tree reduce implementation. diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index e2dee3485a..0f2e89ac8f 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -54,17 +54,15 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], #pragma HLS INLINE region if (CONFIG_T::strategy == nnet::latency) { - if (CONFIG_T::implementation == conv_implementation::pointwise){ + if (CONFIG_T::implementation == conv_implementation::pointwise) { // Use pointwise unrolled implementation if (CONFIG_T::reuse_factor > 1 && CONFIG_T::reuse_factor <= 120) { pointwise_conv_1d_latency_cl_split_by_rf(data, res, weights, biases); - } - else { + } else { assert(CONFIG_T::reuse_factor == 1); pointwise_conv_1d_latency_cl(data, res, weights, biases); } - } - else { + } else { // Use standard unrolled implementation conv_1d_latency_cl(data, res, weights, biases); } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index c423c7a228..aabc869823 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -84,17 +84,15 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } -template -void pointwise_conv_1d_latency_cl( - data_T data[CONFIG_T::in_width * CONFIG_T::n_chan/CONFIG_T::reuse_factor], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) -{ +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { assert(CONFIG_T::filt_width == 1); - typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * 
CONFIG_T::n_chan/CONFIG_T::reuse_factor]; - typename CONFIG_T::accum_t acc[CONFIG_T::out_width/CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 @@ -108,209 +106,331 @@ void pointwise_conv_1d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - int multiplier_limit = ceil ( (float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor )* CONFIG_T::n_filt * CONFIG_T::n_chan ) / float(CONFIG_T::reuse_factor) ); - #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit - - // Convolve, saving all multiplication results to accumulate later - ConvOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { - ConvFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { - ConvChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { - #pragma HLS UNROLL - int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; - int index_weight = cc*CONFIG_T::n_filt + ff; - int index_data = (ii*CONFIG_T::stride_width-CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; - - if((ii*CONFIG_T::stride_width) < CONFIG_T::pad_left || (ii*CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)){ + int multiplier_limit = + ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / + float(CONFIG_T::reuse_factor)); +#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { mult[index_mult] = 0; - } - else { + } else { mult[index_mult] = data[index_data] * weights[index_weight]; } - }//end channel loop - }//end filter loop - }//end output loop - + } // end channel loop + } // end filter loop + } // end output loop // Initialize accumulator with input biases - for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { - for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { #pragma HLS UNROLL - acc[ii][ff]=biases[ff]; + acc[ii][ff] = biases[ff]; } } - - // Accumulate multiplication result - AccumOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { - AccumFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { - //Do "dot product" sum within filter and sum over channels - AccumChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { - int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / 
CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; acc[ii][ff] += mult[index_mult]; - }//end channel loop - }//end filter loop - }//end output loop - + } // end channel loop + } // end filter loop + } // end output loop // Cast to "res_t" type - for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { - for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { #pragma HLS UNROLL res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); } } } -template void pointwise_conv_1d_latency_cl_split_by_rf( - data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) -{ +template +void pointwise_conv_1d_latency_cl_split_by_rf(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor]; + data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0 - res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width*CONFIG_T::n_filt/CONFIG_T::reuse_factor]; + res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor]; #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0 - - RFInputLoop: - for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { - #pragma HLS UNROLL - InnerInputLoop: - for(int ii = 0; ii < CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor; ii++) { + +RFInputLoop: + for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + #pragma HLS UNROLL + InnerInputLoop: + for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL - data_tmp[jj][ii] = data[jj*CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor+ii]; + data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii]; } } pointwise_conv_1d_latency_cl(data_tmp[0], res_tmp[0], weights, biases); pointwise_conv_1d_latency_cl(data_tmp[1], res_tmp[1], weights, biases); - if (CONFIG_T::reuse_factor > 2) pointwise_conv_1d_latency_cl(data_tmp[2], res_tmp[2], weights, biases); - if (CONFIG_T::reuse_factor > 3) pointwise_conv_1d_latency_cl(data_tmp[3], res_tmp[3], weights, biases); - if (CONFIG_T::reuse_factor > 4) pointwise_conv_1d_latency_cl(data_tmp[4], res_tmp[4], weights, biases); - if (CONFIG_T::reuse_factor > 5) pointwise_conv_1d_latency_cl(data_tmp[5], res_tmp[5], weights, biases); - if (CONFIG_T::reuse_factor > 6) pointwise_conv_1d_latency_cl(data_tmp[6], res_tmp[6], weights, biases); - if (CONFIG_T::reuse_factor > 7) pointwise_conv_1d_latency_cl(data_tmp[7], res_tmp[7], weights, biases); - if (CONFIG_T::reuse_factor > 8) pointwise_conv_1d_latency_cl(data_tmp[8], res_tmp[8], weights, biases); - if (CONFIG_T::reuse_factor > 9) pointwise_conv_1d_latency_cl(data_tmp[9], 
-    if (CONFIG_T::reuse_factor > 10) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[10], res_tmp[10], weights, biases);
-    if (CONFIG_T::reuse_factor > 11) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[11], res_tmp[11], weights, biases);
-    if (CONFIG_T::reuse_factor > 12) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[12], res_tmp[12], weights, biases);
-    if (CONFIG_T::reuse_factor > 13) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[13], res_tmp[13], weights, biases);
-    if (CONFIG_T::reuse_factor > 14) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[14], res_tmp[14], weights, biases);
-    if (CONFIG_T::reuse_factor > 15) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[15], res_tmp[15], weights, biases);
-    if (CONFIG_T::reuse_factor > 16) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[16], res_tmp[16], weights, biases);
-    if (CONFIG_T::reuse_factor > 17) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[17], res_tmp[17], weights, biases);
-    if (CONFIG_T::reuse_factor > 18) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[18], res_tmp[18], weights, biases);
-    if (CONFIG_T::reuse_factor > 19) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[19], res_tmp[19], weights, biases);
-    if (CONFIG_T::reuse_factor > 20) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[20], res_tmp[20], weights, biases);
-    if (CONFIG_T::reuse_factor > 21) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[21], res_tmp[21], weights, biases);
-    if (CONFIG_T::reuse_factor > 22) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[22], res_tmp[22], weights, biases);
-    if (CONFIG_T::reuse_factor > 23) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[23], res_tmp[23], weights, biases);
-    if (CONFIG_T::reuse_factor > 24) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[24], res_tmp[24], weights, biases);
-    if (CONFIG_T::reuse_factor > 25) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[25], res_tmp[25], weights, biases);
-    if (CONFIG_T::reuse_factor > 26) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[26], res_tmp[26], weights, biases);
-    if (CONFIG_T::reuse_factor > 27) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[27], res_tmp[27], weights, biases);
-    if (CONFIG_T::reuse_factor > 28) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[28], res_tmp[28], weights, biases);
-    if (CONFIG_T::reuse_factor > 29) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[29], res_tmp[29], weights, biases);
-    if (CONFIG_T::reuse_factor > 30) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[30], res_tmp[30], weights, biases);
-    if (CONFIG_T::reuse_factor > 31) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[31], res_tmp[31], weights, biases);
-    if (CONFIG_T::reuse_factor > 32) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[32], res_tmp[32], weights, biases);
-    if (CONFIG_T::reuse_factor > 33) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[33], res_tmp[33], weights, biases);
-    if (CONFIG_T::reuse_factor > 34) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[34], res_tmp[34], weights, biases);
-    if (CONFIG_T::reuse_factor > 35) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[35], res_tmp[35], weights, biases);
-    if (CONFIG_T::reuse_factor > 36) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[36], res_tmp[36], weights, biases);
-    if (CONFIG_T::reuse_factor > 37) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[37], res_tmp[37], weights, biases);
-    if (CONFIG_T::reuse_factor > 38) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[38], res_tmp[38], weights, biases);
-    if (CONFIG_T::reuse_factor > 39) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[39], res_tmp[39], weights, biases);
-    if (CONFIG_T::reuse_factor > 40) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[40], res_tmp[40], weights, biases);
-    if (CONFIG_T::reuse_factor > 41) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[41], res_tmp[41], weights, biases);
-    if (CONFIG_T::reuse_factor > 42) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[42], res_tmp[42], weights, biases);
-    if (CONFIG_T::reuse_factor > 43) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[43], res_tmp[43], weights, biases);
-    if (CONFIG_T::reuse_factor > 44) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[44], res_tmp[44], weights, biases);
-    if (CONFIG_T::reuse_factor > 45) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[45], res_tmp[45], weights, biases);
-    if (CONFIG_T::reuse_factor > 46) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[46], res_tmp[45], weights, biases);
-    if (CONFIG_T::reuse_factor > 47) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[47], res_tmp[47], weights, biases);
-    if (CONFIG_T::reuse_factor > 48) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[48], res_tmp[48], weights, biases);
-    if (CONFIG_T::reuse_factor > 49) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[49], res_tmp[49], weights, biases);
-    if (CONFIG_T::reuse_factor > 50) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[50], res_tmp[50], weights, biases);
-    if (CONFIG_T::reuse_factor > 51) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[51], res_tmp[51], weights, biases);
-    if (CONFIG_T::reuse_factor > 52) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[52], res_tmp[52], weights, biases);
-    if (CONFIG_T::reuse_factor > 53) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[53], res_tmp[53], weights, biases);
-    if (CONFIG_T::reuse_factor > 54) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[54], res_tmp[54], weights, biases);
-    if (CONFIG_T::reuse_factor > 55) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[55], res_tmp[55], weights, biases);
-    if (CONFIG_T::reuse_factor > 56) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[56], res_tmp[55], weights, biases);
-    if (CONFIG_T::reuse_factor > 57) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[57], res_tmp[57], weights, biases);
-    if (CONFIG_T::reuse_factor > 58) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[58], res_tmp[58], weights, biases);
-    if (CONFIG_T::reuse_factor > 59) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[59], res_tmp[59], weights, biases);
-    if (CONFIG_T::reuse_factor > 60) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[60], res_tmp[60], weights, biases);
-    if (CONFIG_T::reuse_factor > 61) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[61], res_tmp[61], weights, biases);
-    if (CONFIG_T::reuse_factor > 62) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[62], res_tmp[62], weights, biases);
-    if (CONFIG_T::reuse_factor > 63) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[63], res_tmp[63], weights, biases);
-    if (CONFIG_T::reuse_factor > 64) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[64], res_tmp[64], weights, biases);
-    if (CONFIG_T::reuse_factor > 65) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[65], res_tmp[65], weights, biases);
-    if (CONFIG_T::reuse_factor > 66) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[66], res_tmp[66], weights, biases);
-    if (CONFIG_T::reuse_factor > 67) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[67], res_tmp[67], weights, biases);
-    if (CONFIG_T::reuse_factor > 68) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[68], res_tmp[68], weights, biases);
-    if (CONFIG_T::reuse_factor > 69) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[69], res_tmp[69], weights, biases);
-    if (CONFIG_T::reuse_factor > 70) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[70], res_tmp[70], weights, biases);
-    if (CONFIG_T::reuse_factor > 71) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[71], res_tmp[71], weights, biases);
-    if (CONFIG_T::reuse_factor > 72) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[72], res_tmp[72], weights, biases);
-    if (CONFIG_T::reuse_factor > 73) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[73], res_tmp[73], weights, biases);
-    if (CONFIG_T::reuse_factor > 74) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[74], res_tmp[74], weights, biases);
-    if (CONFIG_T::reuse_factor > 75) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[75], res_tmp[75], weights, biases);
-    if (CONFIG_T::reuse_factor > 76) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[76], res_tmp[76], weights, biases);
-    if (CONFIG_T::reuse_factor > 77) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[77], res_tmp[77], weights, biases);
-    if (CONFIG_T::reuse_factor > 78) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[78], res_tmp[78], weights, biases);
-    if (CONFIG_T::reuse_factor > 79) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[79], res_tmp[79], weights, biases);
-    if (CONFIG_T::reuse_factor > 80) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[80], res_tmp[80], weights, biases);
-    if (CONFIG_T::reuse_factor > 81) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[81], res_tmp[81], weights, biases);
-    if (CONFIG_T::reuse_factor > 82) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[82], res_tmp[82], weights, biases);
-    if (CONFIG_T::reuse_factor > 83) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[83], res_tmp[83], weights, biases);
-    if (CONFIG_T::reuse_factor > 84) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[84], res_tmp[84], weights, biases);
-    if (CONFIG_T::reuse_factor > 85) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[85], res_tmp[85], weights, biases);
-    if (CONFIG_T::reuse_factor > 86) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[86], res_tmp[86], weights, biases);
-    if (CONFIG_T::reuse_factor > 87) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[87], res_tmp[87], weights, biases);
-    if (CONFIG_T::reuse_factor > 88) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[88], res_tmp[88], weights, biases);
-    if (CONFIG_T::reuse_factor > 89) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[89], res_tmp[89], weights, biases);
-    if (CONFIG_T::reuse_factor > 90) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[90], res_tmp[90], weights, biases);
-    if (CONFIG_T::reuse_factor > 91) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[91], res_tmp[91], weights, biases);
-    if (CONFIG_T::reuse_factor > 92) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[92], res_tmp[92], weights, biases);
-    if (CONFIG_T::reuse_factor > 93) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[93], res_tmp[93], weights, biases);
-    if (CONFIG_T::reuse_factor > 94) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[94], res_tmp[94], weights, biases);
-    if (CONFIG_T::reuse_factor > 95) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[95], res_tmp[95], weights, biases);
-    if (CONFIG_T::reuse_factor > 96) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[96], res_tmp[96], weights, biases);
-    if (CONFIG_T::reuse_factor > 97) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[97], res_tmp[97], weights, biases);
-    if (CONFIG_T::reuse_factor > 98) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[98], res_tmp[98], weights, biases);
-    if (CONFIG_T::reuse_factor > 99) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[99], res_tmp[99], weights, biases);
-    if (CONFIG_T::reuse_factor > 100) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[100], res_tmp[100], weights, biases);
-    if (CONFIG_T::reuse_factor > 101) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[101], res_tmp[101], weights, biases);
-    if (CONFIG_T::reuse_factor > 102) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[102], res_tmp[102], weights, biases);
-    if (CONFIG_T::reuse_factor > 103) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[103], res_tmp[103], weights, biases);
-    if (CONFIG_T::reuse_factor > 104) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[104], res_tmp[104], weights, biases);
-    if (CONFIG_T::reuse_factor > 105) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[105], res_tmp[105], weights, biases);
-    if (CONFIG_T::reuse_factor > 106) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[106], res_tmp[106], weights, biases);
-    if (CONFIG_T::reuse_factor > 107) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[107], res_tmp[107], weights, biases);
-    if (CONFIG_T::reuse_factor > 108) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[108], res_tmp[108], weights, biases);
-    if (CONFIG_T::reuse_factor > 109) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[109], res_tmp[109], weights, biases);
-    if (CONFIG_T::reuse_factor > 110) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[110], res_tmp[110], weights, biases);
-    if (CONFIG_T::reuse_factor > 111) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[111], res_tmp[111], weights, biases);
-    if (CONFIG_T::reuse_factor > 112) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[112], res_tmp[112], weights, biases);
-    if (CONFIG_T::reuse_factor > 113) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[113], res_tmp[113], weights, biases);
-    if (CONFIG_T::reuse_factor > 114) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[114], res_tmp[114], weights, biases);
-    if (CONFIG_T::reuse_factor > 115) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[115], res_tmp[115], weights, biases);
-    if (CONFIG_T::reuse_factor > 116) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[116], res_tmp[116], weights, biases);
-    if (CONFIG_T::reuse_factor > 117) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[117], res_tmp[117], weights, biases);
-    if (CONFIG_T::reuse_factor > 118) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[118], res_tmp[118], weights, biases);
-    if (CONFIG_T::reuse_factor > 119) pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[119], res_tmp[119], weights, biases);
-
-    RFOutputLoop:
-    for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {
-        #pragma HLS UNROLL
-        InnerOutputLoop:
-        for(int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor; ii++) {
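+    // Editorial note (added in review; not in the original commit): this cascade
+    // dispatches one latency-strategy pointwise kernel per reuse-factor slice,
+    // so only reuse factors up to 120 are supported by this hand-unrolled code.
+    // Relative to the removed lines above, two off-by-one result indices are
+    // also corrected below: res_tmp[45] -> res_tmp[46] and res_tmp[55] -> res_tmp[56].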
+    if (CONFIG_T::reuse_factor > 2)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[2], res_tmp[2], weights, biases);
+    if (CONFIG_T::reuse_factor > 3)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[3], res_tmp[3], weights, biases);
+    if (CONFIG_T::reuse_factor > 4)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[4], res_tmp[4], weights, biases);
+    if (CONFIG_T::reuse_factor > 5)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[5], res_tmp[5], weights, biases);
+    if (CONFIG_T::reuse_factor > 6)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[6], res_tmp[6], weights, biases);
+    if (CONFIG_T::reuse_factor > 7)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[7], res_tmp[7], weights, biases);
+    if (CONFIG_T::reuse_factor > 8)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[8], res_tmp[8], weights, biases);
+    if (CONFIG_T::reuse_factor > 9)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[9], res_tmp[9], weights, biases);
+    if (CONFIG_T::reuse_factor > 10)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[10], res_tmp[10], weights, biases);
+    if (CONFIG_T::reuse_factor > 11)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[11], res_tmp[11], weights, biases);
+    if (CONFIG_T::reuse_factor > 12)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[12], res_tmp[12], weights, biases);
+    if (CONFIG_T::reuse_factor > 13)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[13], res_tmp[13], weights, biases);
+    if (CONFIG_T::reuse_factor > 14)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[14], res_tmp[14], weights, biases);
+    if (CONFIG_T::reuse_factor > 15)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[15], res_tmp[15], weights, biases);
+    if (CONFIG_T::reuse_factor > 16)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[16], res_tmp[16], weights, biases);
+    if (CONFIG_T::reuse_factor > 17)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[17], res_tmp[17], weights, biases);
+    if (CONFIG_T::reuse_factor > 18)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[18], res_tmp[18], weights, biases);
+    if (CONFIG_T::reuse_factor > 19)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[19], res_tmp[19], weights, biases);
+    if (CONFIG_T::reuse_factor > 20)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[20], res_tmp[20], weights, biases);
+    if (CONFIG_T::reuse_factor > 21)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[21], res_tmp[21], weights, biases);
+    if (CONFIG_T::reuse_factor > 22)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[22], res_tmp[22], weights, biases);
+    if (CONFIG_T::reuse_factor > 23)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[23], res_tmp[23], weights, biases);
+    if (CONFIG_T::reuse_factor > 24)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[24], res_tmp[24], weights, biases);
+    if (CONFIG_T::reuse_factor > 25)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[25], res_tmp[25], weights, biases);
+    if (CONFIG_T::reuse_factor > 26)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[26], res_tmp[26], weights, biases);
+    if (CONFIG_T::reuse_factor > 27)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[27], res_tmp[27], weights, biases);
+    if (CONFIG_T::reuse_factor > 28)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[28], res_tmp[28], weights, biases);
+    if (CONFIG_T::reuse_factor > 29)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[29], res_tmp[29], weights, biases);
+    if (CONFIG_T::reuse_factor > 30)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[30], res_tmp[30], weights, biases);
+    if (CONFIG_T::reuse_factor > 31)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[31], res_tmp[31], weights, biases);
+    if (CONFIG_T::reuse_factor > 32)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[32], res_tmp[32], weights, biases);
+    if (CONFIG_T::reuse_factor > 33)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[33], res_tmp[33], weights, biases);
+    if (CONFIG_T::reuse_factor > 34)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[34], res_tmp[34], weights, biases);
+    if (CONFIG_T::reuse_factor > 35)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[35], res_tmp[35], weights, biases);
+    if (CONFIG_T::reuse_factor > 36)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[36], res_tmp[36], weights, biases);
+    if (CONFIG_T::reuse_factor > 37)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[37], res_tmp[37], weights, biases);
+    if (CONFIG_T::reuse_factor > 38)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[38], res_tmp[38], weights, biases);
+    if (CONFIG_T::reuse_factor > 39)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[39], res_tmp[39], weights, biases);
+    if (CONFIG_T::reuse_factor > 40)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[40], res_tmp[40], weights, biases);
+    if (CONFIG_T::reuse_factor > 41)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[41], res_tmp[41], weights, biases);
+    if (CONFIG_T::reuse_factor > 42)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[42], res_tmp[42], weights, biases);
+    if (CONFIG_T::reuse_factor > 43)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[43], res_tmp[43], weights, biases);
+    if (CONFIG_T::reuse_factor > 44)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[44], res_tmp[44], weights, biases);
+    if (CONFIG_T::reuse_factor > 45)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[45], res_tmp[45], weights, biases);
+    if (CONFIG_T::reuse_factor > 46)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[46], res_tmp[46], weights, biases);
+    if (CONFIG_T::reuse_factor > 47)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[47], res_tmp[47], weights, biases);
+    if (CONFIG_T::reuse_factor > 48)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[48], res_tmp[48], weights, biases);
+    if (CONFIG_T::reuse_factor > 49)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[49], res_tmp[49], weights, biases);
+    if (CONFIG_T::reuse_factor > 50)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[50], res_tmp[50], weights, biases);
+    if (CONFIG_T::reuse_factor > 51)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[51], res_tmp[51], weights, biases);
+    if (CONFIG_T::reuse_factor > 52)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[52], res_tmp[52], weights, biases);
+    if (CONFIG_T::reuse_factor > 53)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[53], res_tmp[53], weights, biases);
+    if (CONFIG_T::reuse_factor > 54)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[54], res_tmp[54], weights, biases);
+    if (CONFIG_T::reuse_factor > 55)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[55], res_tmp[55], weights, biases);
+    if (CONFIG_T::reuse_factor > 56)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[56], res_tmp[56], weights, biases);
+    if (CONFIG_T::reuse_factor > 57)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[57], res_tmp[57], weights, biases);
+    if (CONFIG_T::reuse_factor > 58)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[58], res_tmp[58], weights, biases);
+    if (CONFIG_T::reuse_factor > 59)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[59], res_tmp[59], weights, biases);
+    if (CONFIG_T::reuse_factor > 60)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[60], res_tmp[60], weights, biases);
+    if (CONFIG_T::reuse_factor > 61)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[61], res_tmp[61], weights, biases);
+    if (CONFIG_T::reuse_factor > 62)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[62], res_tmp[62], weights, biases);
+    if (CONFIG_T::reuse_factor > 63)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[63], res_tmp[63], weights, biases);
+    if (CONFIG_T::reuse_factor > 64)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[64], res_tmp[64], weights, biases);
+    if (CONFIG_T::reuse_factor > 65)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[65], res_tmp[65], weights, biases);
+    if (CONFIG_T::reuse_factor > 66)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[66], res_tmp[66], weights, biases);
+    if (CONFIG_T::reuse_factor > 67)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[67], res_tmp[67], weights, biases);
+    if (CONFIG_T::reuse_factor > 68)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[68], res_tmp[68], weights, biases);
+    if (CONFIG_T::reuse_factor > 69)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[69], res_tmp[69], weights, biases);
+    if (CONFIG_T::reuse_factor > 70)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[70], res_tmp[70], weights, biases);
+    if (CONFIG_T::reuse_factor > 71)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[71], res_tmp[71], weights, biases);
+    if (CONFIG_T::reuse_factor > 72)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[72], res_tmp[72], weights, biases);
+    if (CONFIG_T::reuse_factor > 73)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[73], res_tmp[73], weights, biases);
+    if (CONFIG_T::reuse_factor > 74)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[74], res_tmp[74], weights, biases);
+    if (CONFIG_T::reuse_factor > 75)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[75], res_tmp[75], weights, biases);
+    if (CONFIG_T::reuse_factor > 76)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[76], res_tmp[76], weights, biases);
+    if (CONFIG_T::reuse_factor > 77)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[77], res_tmp[77], weights, biases);
+    if (CONFIG_T::reuse_factor > 78)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[78], res_tmp[78], weights, biases);
+    if (CONFIG_T::reuse_factor > 79)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[79], res_tmp[79], weights, biases);
+    if (CONFIG_T::reuse_factor > 80)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[80], res_tmp[80], weights, biases);
+    if (CONFIG_T::reuse_factor > 81)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[81], res_tmp[81], weights, biases);
+    if (CONFIG_T::reuse_factor > 82)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[82], res_tmp[82], weights, biases);
+    if (CONFIG_T::reuse_factor > 83)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[83], res_tmp[83], weights, biases);
+    if (CONFIG_T::reuse_factor > 84)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[84], res_tmp[84], weights, biases);
+    if (CONFIG_T::reuse_factor > 85)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[85], res_tmp[85], weights, biases);
+    if (CONFIG_T::reuse_factor > 86)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[86], res_tmp[86], weights, biases);
+    if (CONFIG_T::reuse_factor > 87)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[87], res_tmp[87], weights, biases);
+    if (CONFIG_T::reuse_factor > 88)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[88], res_tmp[88], weights, biases);
+    if (CONFIG_T::reuse_factor > 89)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[89], res_tmp[89], weights, biases);
+    if (CONFIG_T::reuse_factor > 90)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[90], res_tmp[90], weights, biases);
+    if (CONFIG_T::reuse_factor > 91)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[91], res_tmp[91], weights, biases);
+    if (CONFIG_T::reuse_factor > 92)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[92], res_tmp[92], weights, biases);
+    if (CONFIG_T::reuse_factor > 93)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[93], res_tmp[93], weights, biases);
+    if (CONFIG_T::reuse_factor > 94)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[94], res_tmp[94], weights, biases);
+    if (CONFIG_T::reuse_factor > 95)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[95], res_tmp[95], weights, biases);
+    if (CONFIG_T::reuse_factor > 96)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[96], res_tmp[96], weights, biases);
+    if (CONFIG_T::reuse_factor > 97)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[97], res_tmp[97], weights, biases);
+    if (CONFIG_T::reuse_factor > 98)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[98], res_tmp[98], weights, biases);
+    if (CONFIG_T::reuse_factor > 99)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[99], res_tmp[99], weights, biases);
+    if (CONFIG_T::reuse_factor > 100)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[100], res_tmp[100], weights, biases);
+    if (CONFIG_T::reuse_factor > 101)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[101], res_tmp[101], weights, biases);
+    if (CONFIG_T::reuse_factor > 102)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[102], res_tmp[102], weights, biases);
+    if (CONFIG_T::reuse_factor > 103)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[103], res_tmp[103], weights, biases);
+    if (CONFIG_T::reuse_factor > 104)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[104], res_tmp[104], weights, biases);
+    if (CONFIG_T::reuse_factor > 105)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[105], res_tmp[105], weights, biases);
+    if (CONFIG_T::reuse_factor > 106)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[106], res_tmp[106], weights, biases);
+    if (CONFIG_T::reuse_factor > 107)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[107], res_tmp[107], weights, biases);
+    if (CONFIG_T::reuse_factor > 108)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[108], res_tmp[108], weights, biases);
+    if (CONFIG_T::reuse_factor > 109)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[109], res_tmp[109], weights, biases);
+    if (CONFIG_T::reuse_factor > 110)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[110], res_tmp[110], weights, biases);
+    if (CONFIG_T::reuse_factor > 111)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[111], res_tmp[111], weights, biases);
+    if (CONFIG_T::reuse_factor > 112)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[112], res_tmp[112], weights, biases);
+    if (CONFIG_T::reuse_factor > 113)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[113], res_tmp[113], weights, biases);
+    if (CONFIG_T::reuse_factor > 114)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[114], res_tmp[114], weights, biases);
+    if (CONFIG_T::reuse_factor > 115)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[115], res_tmp[115], weights, biases);
+    if (CONFIG_T::reuse_factor > 116)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[116], res_tmp[116], weights, biases);
+    if (CONFIG_T::reuse_factor > 117)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[117], res_tmp[117], weights, biases);
+    if (CONFIG_T::reuse_factor > 118)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[118], res_tmp[118], weights, biases);
+    if (CONFIG_T::reuse_factor > 119)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[119], res_tmp[119], weights, biases);
+
+RFOutputLoop:
+    for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {
+        #pragma HLS UNROLL
+    InnerOutputLoop:
+        for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {
             #pragma HLS UNROLL
-            res[jj*CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];
+            res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];
         }
     }
 }

From 69aecc6dc187a6e9a1ecdd2e7449629f1a88e87b Mon Sep 17 00:00:00 2001
From: Javier Duarte
Date: Wed, 14 Jun 2023 17:27:20 -0700
Subject: [PATCH 015/272] add test

---
 hls4ml/backends/vivado/vivado_backend.py |  1 -
 test/pytest/test_pointwiseconv.py        | 37 ++++++++++++------------
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py
index 1eb58f0952..1a99d90a8e 100644
--- a/hls4ml/backends/vivado/vivado_backend.py
+++ b/hls4ml/backends/vivado/vivado_backend.py
@@ -71,7 +71,6 @@ def _register_layer_attributes(self):
 
         for layer in cnn_layers:
             attrs = self.attribute_map.get(layer, [])
-            # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer'))
             attrs.append(
                 ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded', 'Pointwise'], default='LineBuffer')
             )
diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py
index 28314fe130..080106955e 100644
--- a/test/pytest/test_pointwiseconv.py
+++ b/test/pytest/test_pointwiseconv.py
@@ -21,20 +21,22 @@
 @pytest.mark.parametrize('padds', padds_options)
 @pytest.mark.parametrize('strides', strides1d_options)
 @pytest.mark.parametrize(
-    'backend, io_type, strategy',
+    'backend, io_type, strategy, conv_implementation',
     [
-        ('Quartus', 'io_parallel', 'resource'),
-        ('Vivado', 'io_parallel', 'resource'),
-        ('Vitis', 'io_parallel', 'resource'),
-        ('Vivado', 'io_parallel', 'latency'),
-        ('Vitis', 'io_parallel', 'latency'),
-        ('Vivado', 'io_stream', 'latency'),
-        ('Vivado', 'io_stream', 'resource'),
-        ('Vitis', 'io_stream', 'latency'),
-        ('Vitis', 'io_stream', 'resource'),
+        ('Quartus', 'io_parallel', 'resource', 'LineBuffer'),
+        ('Vivado', 'io_parallel', 'resource', 'LineBuffer'),
+        ('Vitis', 'io_parallel', 'resource', 'LineBuffer'),
+        ('Vivado', 'io_parallel', 'latency', 'LineBuffer'),
+        ('Vitis', 'io_parallel', 'latency', 'LineBuffer'),
+        ('Vivado', 'io_parallel', 'latency', 'Pointwise'),
+        ('Vitis', 'io_parallel', 'latency', 'Pointwise'),
+        ('Vivado', 'io_stream', 'latency', 'LineBuffer'),
+        ('Vivado', 'io_stream', 'resource', 'LineBuffer'),
+        ('Vitis', 'io_stream', 'latency', 'LineBuffer'),
+        ('Vitis', 'io_stream', 'resource', 'LineBuffer'),
     ],
 )
-def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy):
+def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_implementation):
     model = tf.keras.models.Sequential()
     input_shape = (28, 3)
     model.add(
@@ -47,6
+49,7 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): kernel_initializer='normal', use_bias=False, data_format=chans, + name='pointwise1d' ) ) model.compile(optimizer='adam', loss='mse') @@ -55,14 +58,13 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): keras_prediction = model.predict(X_input) default_precision = 'ac_fixed<32,16,true>' if backend == 'Quartus' else 'ap_fixed<32,16>' - config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision) + config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision, granularity='name') config['Model']['Strategy'] = strategy + config['LayerName']['pointwise1d']['ConvImplementation'] = conv_implementation output_dir = str( test_root_path - / 'hls4mlprj_pointwise1d_{}_strides_{}_{}_padding_{}_{}_{}'.format( - chans, strides[0], padds, backend, io_type, strategy - ) + / f'hls4mlprj_pointwise1d_{chans}_strides_{strides[0]}_{padds}_padding_{backend}_{io_type}_{strategy}_{conv_implementation}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend @@ -100,6 +102,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): kernel_initializer='normal', use_bias=False, data_format=chans, + name='pointwise2d' ) ) @@ -114,9 +117,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') output_dir = str( test_root_path - / 'hls4mlprj_pointwise2d_{}_strides_{}_{}_padding_{}_{}_{}'.format( - chans, stride_cfg, padds, backend, io_type, strategy - ) + / f'hls4mlprj_pointwise2d_{chans}_strides_{stride_cfg}_{padds}_padding_{backend}_{io_type}_{strategy}' ) hls_model = hls4ml.converters.convert_from_keras_model( From 4febceded10000b3b1b6b4254c9b9c230a9f475c Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 14 Jun 2023 17:48:44 -0700 Subject: [PATCH 016/272] pre-commit --- test/pytest/test_pointwiseconv.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 080106955e..0cb75b7a87 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -49,7 +49,7 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv kernel_initializer='normal', use_bias=False, data_format=chans, - name='pointwise1d' + name='pointwise1d', ) ) model.compile(optimizer='adam', loss='mse') @@ -102,7 +102,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): kernel_initializer='normal', use_bias=False, data_format=chans, - name='pointwise2d' + name='pointwise2d', ) ) @@ -116,8 +116,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): config['Model']['Strategy'] = strategy stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') output_dir = str( - test_root_path - / f'hls4mlprj_pointwise2d_{chans}_strides_{stride_cfg}_{padds}_padding_{backend}_{io_type}_{strategy}' + test_root_path / f'hls4mlprj_pointwise2d_{chans}_strides_{stride_cfg}_{padds}_padding_{backend}_{io_type}_{strategy}' ) hls_model = hls4ml.converters.convert_from_keras_model( From 56797e73ecb1a830c28128387536308fd3f50beb Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 14 Jun 2023 17:53:37 -0700 Subject: [PATCH 017/272] pre-commit --- test/pytest/test_pointwiseconv.py | 8 
++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 0cb75b7a87..cbe2036712 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -21,7 +21,7 @@ @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides1d_options) @pytest.mark.parametrize( - 'backend, io_type, strategy, conv_implementation', + 'backend, io_type, strategy, conv_impl', [ ('Quartus', 'io_parallel', 'resource', 'LineBuffer'), ('Vivado', 'io_parallel', 'resource', 'LineBuffer'), @@ -36,7 +36,7 @@ ('Vitis', 'io_stream', 'resource', 'LineBuffer'), ], ) -def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_implementation): +def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_impl): model = tf.keras.models.Sequential() input_shape = (28, 3) model.add( @@ -60,11 +60,11 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv default_precision = 'ac_fixed<32,16,true>' if backend == 'Quartus' else 'ap_fixed<32,16>' config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision, granularity='name') config['Model']['Strategy'] = strategy - config['LayerName']['pointwise1d']['ConvImplementation'] = conv_implementation + config['LayerName']['pointwise1d']['ConvImplementation'] = conv_impl output_dir = str( test_root_path - / f'hls4mlprj_pointwise1d_{chans}_strides_{strides[0]}_{padds}_padding_{backend}_{io_type}_{strategy}_{conv_implementation}' + / f'hls4mlprj_pointwise1d_{chans}_strides_{strides[0]}_{padds}_padding_{backend}_{io_type}_{strategy}_{conv_impl}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend From 0765ec44135debde756548b0932f5ccce12da8b5 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 11 Jul 2023 11:00:56 -0500 Subject: [PATCH 018/272] Add needed layer types for QONNX --- hls4ml/model/layers.py | 136 +++++++++++++++++++++++- hls4ml/model/optimizer/passes/qkeras.py | 30 +----- 2 files changed, 132 insertions(+), 34 deletions(-) diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index d9da2cc741..6a23a9b934 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -333,7 +333,7 @@ class Input(Layer): def initialize(self): shape = self.attributes['input_shape'] if shape[0] is None: - shape = shape[1:] + raise RuntimeError(f"Unexpectedly have a None in {shape=} of Input layer") dims = [f'N_INPUT_{i}_{self.index}' for i in range(1, len(shape) + 1)] if self.index == 1: default_type_name = 'input_t' @@ -344,6 +344,41 @@ def initialize(self): self.add_output_variable(shape, dims, var_name=self.name, type_name=type_name, precision=precision) +class Constant(Layer): + _expected_attributes = [ + Attribute('value', value_type=np.ndarray), + ] + + def initialize(self): + value = self.attributes['value'] + self.value = value # note, this is unquantized; Only here for easier access + shape = value.shape + if not shape: + shape = (1,) + self.value = np.array([self.value]) + dims = [f'{self.name}_{i}' for i in range(len(shape))] + self.add_output_variable(shape, dims, var_name=self.name, precision=self.get_attr("precision")) + + +class Quant(Layer): # The QONNX quantization layer + """ + This is a QONNX quantization layer. Optimizations should convert it + before HLS is produced. 
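+
+    (Editorial note, added in review: in QONNX the quantization scale, zero point,
+    and bit width arrive as additional inputs of the node rather than attributes,
+    which is why only 'narrow', 'rounding_mode', and 'signed' are listed below.)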
+ """ + + _expected_attributes = [ + Attribute('narrow', value_type=bool), + Attribute('rounding_mode', value_type=str), + Attribute('signed', value_type=bool), + ] + + def initialize(self): + inp = self.get_input_variable(self.inputs[0]) + shape = inp.shape + dims = inp.dim_names + self.add_output_variable(shape, dims) + + class Reshape(Layer): _expected_attributes = [ Attribute('target_shape', value_type=typing.Sequence), @@ -351,19 +386,20 @@ class Reshape(Layer): def initialize(self): input_shape = self.get_input_variable(self.inputs[0]).shape - target_shape = self.get_attr('target_shape') + target_shape = self.get_attr('target_shape') # this should not have a batch dimension if target_shape is None: # need to get it from the input shape_node = self.get_input_node(self.inputs[1]) # for QONNX, remove batch dimension + # (onnx cleaning should have removed reshape dimension) if shape_node: target_shape = shape_node.value[1:] else: raise RuntimeError("Reshape for ONNX requires the target shape to be a second input.") - # remove Nones -- is this ever triggered? + # nones should not exist here if target_shape[0] is None: - target_shape = target_shape[1:] + raise RuntimeError(f"Unexpectedly have a None in {target_shape=}") # take care of -1 shapes shape = self._infer_output_shape(input_shape, target_shape) @@ -395,7 +431,7 @@ class Dense(Layer): ] def initialize(self): - shape = self.get_input_variable().shape[:] + shape = list(self.get_input_variable().shape) shape[-1] = self.attributes['n_out'] if len(shape) > 1: dims = [f'N_LAYER_{i}_{self.index}' for i in range(1, len(shape) + 1)] @@ -406,6 +442,27 @@ def initialize(self): self.add_bias(quantizer=self.get_attr('bias_quantizer')) +class Conv(Layer): + """ + This is for the ONNX Conv node. Currently, it is only supported as an intermediate + form that gets converted to an explicit ConvXD. + + Note: these are always channels-last. 
+ """ + + def initialize(self): + # use negative indexing because it is not clear if batch dimension is always stripped + if self.attributes['n_dim'] == 1: + # this is 1D convolution + shape = [self.attributes['out_width'], self.attributes['n_filt']] + dims = [f'N_OUTPUTS_{self.index}', f'N_FILT_{self.index}'] + else: + shape = [self.attributes['out_height'], self.attributes['out_width'], self.attributes['n_filt']] + dims = [f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}', f'N_FILT_{self.index}'] + + self.add_output_variable(shape, dims) + + class Conv1D(Layer): _expected_attributes = [ Attribute('in_width'), @@ -811,6 +868,19 @@ def initialize(self): super().initialize() +class BatchNormOnnx(Layer): + ''' + A transient layer formed from ONNX BatchNormalization that gets converted to + BatchNormalization after the scale and bias are determined + ''' + + def initialize(self): + inp = self.get_input_variable() + shape = inp.shape + dims = inp.dim_names + self.add_output_variable(shape, dims) + + class BatchNormalization(Layer): _expected_attributes = [ Attribute('n_in'), @@ -841,6 +911,31 @@ def initialize(self): self.add_weights_variable(name='bias', var_name='b{index}', data=bias) +class ApplyAlpha(BatchNormalization): + '''A custom layer to scale the output of a QDense layer which used 'alpha != 1' + Inference computation uses BatchNormalization methods''' + + def initialize(self): + inp = self.get_input_variable() + shape = inp.shape + dims = inp.dim_names + self.add_output_variable(shape, dims) + + scale = self.get_attr('scale_data') + scale_quantizer = self.get_attr('scale_quantizer') + bias = self.get_attr('bias_data') + bias_quantizer = self.get_attr('bias_quantizer') + + self.add_weights(scale, quantizer=scale_quantizer) + self.add_bias(bias, quantizer=bias_quantizer) + + def add_weights(self, scale, quantizer=None): + self.add_weights_variable(name='scale', var_name='s{index}', data=scale, quantizer=quantizer) + + def add_bias(self, bias, quantizer=None): + self.add_weights_variable(name='bias', var_name='b{index}', data=bias, quantizer=quantizer) + + class Merge(Layer): def initialize(self): assert len(self.inputs) == 2 @@ -855,6 +950,31 @@ def initialize(self): self.add_output_variable(shape, dims) +class MatMul(Layer): + """ + This is a matrix multiply. Currently, it is only supported as an intermediate + form that gets converted to a Dense layer. 
+ """ + + def initialize(self): + assert len(self.inputs) == 2 + inp1 = self.get_input_variable(self.inputs[0]) + inp2 = self.get_input_variable(self.inputs[1]) + if len(inp2.shape) == 1: + # mat vec multiply + assert inp1.shape[-1] == inp2.shape[0] + shape = tuple(inp1.shape[:-1]) + (inp2.shape[0],) + else: + assert inp1.shape[-1] == inp2.shape[-2] + shape = tuple(inp1.shape[:-1]) + (inp2.shape[-1],) + if len(shape) > 1: + dims = [f'N_LAYER_{i}_{self.index}' for i in range(1, len(shape) + 1)] + else: + dims = [f'N_LAYER_{self.index}'] + + self.add_output_variable(shape, dims) + + class Dot(Merge): def initialize(self): assert len(self.inputs) == 2 @@ -1293,6 +1413,7 @@ def initialize(self): layer_map = { 'Input': Input, 'InputLayer': Input, + 'Constant': Constant, 'Activation': Activation, 'QActivation': Activation, 'LeakyReLU': ParametrizedActivation, @@ -1307,6 +1428,7 @@ def initialize(self): 'BinaryDense': Dense, 'TernaryDense': Dense, 'QDense': Dense, + 'Conv': Conv, 'Conv1D': Conv1D, 'QConv1D': Conv1D, 'Conv2D': Conv2D, @@ -1329,6 +1451,7 @@ def initialize(self): 'ZeroPadding1D': ZeroPadding1D, 'ZeroPadding2D': ZeroPadding2D, 'Merge': Merge, + 'MatMul': MatMul, 'Dot': Dot, 'Concatenate': Concatenate, 'Resize': Resize, @@ -1341,6 +1464,9 @@ def initialize(self): 'GRU': GRU, 'GarNet': GarNet, 'GarNetStack': GarNetStack, + 'Quant': Quant, + 'ApplyAlpha': ApplyAlpha, + 'BatchNormOnnx': BatchNormOnnx, 'LayerGroup': LayerGroup, # TensorFlow-specific layers: 'BiasAdd': BiasAdd, diff --git a/hls4ml/model/optimizer/passes/qkeras.py b/hls4ml/model/optimizer/passes/qkeras.py index cdbb56ec46..2d2b6b0f77 100644 --- a/hls4ml/model/optimizer/passes/qkeras.py +++ b/hls4ml/model/optimizer/passes/qkeras.py @@ -1,7 +1,7 @@ import numpy as np import tensorflow as tf -from hls4ml.model.layers import BatchNormalization, register_layer +from hls4ml.model.layers import ApplyAlpha, BatchNormalization from hls4ml.model.optimizer import ConfigurableOptimizerPass, OptimizerPass, register_pass from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType, QKerasPO2Quantizer @@ -76,35 +76,7 @@ def precision_string_modify(self, pstr): return pstr -class ApplyAlpha(BatchNormalization): - '''A custom layer to scale the output of a QDense layer which used 'alpha != 1' - Inference computation uses BatchNormalization methods''' - - def initialize(self): - inp = self.get_input_variable() - shape = inp.shape - dims = inp.dim_names - self.add_output_variable(shape, dims) - - scale = self.get_attr('scale_data') - scale_quantizer = self.get_attr('scale_quantizer') - bias = self.get_attr('bias_data') - bias_quantizer = self.get_attr('bias_quantizer') - - self.add_weights(scale, quantizer=scale_quantizer) - self.add_bias(bias, quantizer=bias_quantizer) - - def add_weights(self, scale, quantizer=None): - self.add_weights_variable(name='scale', var_name='s{index}', data=scale, quantizer=quantizer) - - def add_bias(self, bias, quantizer=None): - self.add_weights_variable(name='bias', var_name='b{index}', data=bias, quantizer=quantizer) - - def register_qkeras(): - # Register the layer types to the layer map - register_layer('ApplyAlpha', ApplyAlpha) - # Register the optimization passes register_pass('output_rounding_saturation_mode', OutputRoundingSaturationMode) register_pass('qkeras_factorize_alpha', QKerasFactorizeAlpha) From ff788eae9a541e88c74e0876d405a487537632cc Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 11 Jul 2023 19:32:13 -0500 Subject: [PATCH 019/272] add qonnx pytest --- 
test/pytest/test_qonnx.py | 189 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100755 test/pytest/test_qonnx.py diff --git a/test/pytest/test_qonnx.py b/test/pytest/test_qonnx.py new file mode 100755 index 0000000000..be567d81f9 --- /dev/null +++ b/test/pytest/test_qonnx.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python +import os +import urllib +from pathlib import Path + +import numpy as np +import pytest +import qonnx.core.onnx_exec as oxe +import qonnx.util.cleanup +import qonnx.util.to_channels_last + +# To conveniently run QONNX inference +from qonnx.core.modelwrapper import ModelWrapper + +import hls4ml + +test_root_path = Path(__file__).parent + + +def test_tfc_2w2a(): + # download test model + dl_dir = test_root_path + dl_file = str(dl_dir / "qonnx-tfc-2w2a.onnx") + tfc_w2a2_qonnx_url = ( + "https://raw.githubusercontent.com/fastmachinelearning/" + "QONNX_model_zoo/main/models/MNIST/Brevitas_FINN_TFC/TFC/TFC_2W2A.onnx" + ) + urllib.request.urlretrieve(tfc_w2a2_qonnx_url, dl_file) + assert os.path.isfile(dl_file) + out_file = str(dl_dir / "qonnx-tfc-2w2a-clean.onnx") + + # cleanup + qonnx.util.cleanup.cleanup(dl_file, out_file=out_file) + model = ModelWrapper(out_file) + + # Execute QONNX model inference + # TODO make the test bigger + ishape = (1, 1, 28, 28) + np.random.seed(0) + X = np.random.uniform(low=-1, high=+1, size=np.product(ishape)).reshape(ishape).astype(np.float32) + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + + # Convert QONNX model, compile, and run inference + config = hls4ml.utils.config_from_onnx_model(model) + # Some hand-derived config + # TODO should be auto-derived by QuantizeDenseOutput pass after some adaptation + config['LayerName'] = {} + config['LayerName']['global_in'] = {'Precision': 'ap_fixed<16,2>'} + hls_model = hls4ml.converters.convert_from_onnx_model( + model, output_dir=str(test_root_path / 'hls4mlprj_qonnx_tfc-2w2a'), part='xcu250-figd2104-2L-e', hls_config=config + ) + hls_model.compile() + y_hls4ml = hls_model.predict(X) + + np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) + + +def test_tfc_2w2a_quartus(): + # download test model + dl_dir = test_root_path + dl_file = str(dl_dir / "qonnx-tfc-2w2a.onnx") + tfc_w2a2_qonnx_url = ( + "https://raw.githubusercontent.com/fastmachinelearning/" + "QONNX_model_zoo/main/models/MNIST/Brevitas_FINN_TFC/TFC/TFC_2W2A.onnx" + ) + urllib.request.urlretrieve(tfc_w2a2_qonnx_url, dl_file) + assert os.path.isfile(dl_file) + out_file = str(dl_dir / "qonnx-tfc-2w2a-clean.onnx") + + # cleanup + qonnx.util.cleanup.cleanup(dl_file, out_file=out_file) + model = ModelWrapper(out_file) + + # Execute QONNX model inference + # TODO make the test bigger + ishape = (1, 1, 28, 28) + np.random.seed(0) + X = np.random.uniform(low=-1, high=+1, size=np.product(ishape)).reshape(ishape).astype(np.float32) + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + + # Convert QONNX model, compile, and run inference + config = hls4ml.utils.config_from_onnx_model(model) + # Some hand-derived config + # TODO should be auto-derived by QuantizeDenseOutput pass after some adaptation + config['LayerName'] = {} + config['LayerName']['global_in'] = {'Precision': 'ac_fixed<16,2>'} + hls_model = hls4ml.converters.convert_from_onnx_model( + model, + output_dir=str(test_root_path / 'hls4mlprj_qonnx_tfc-2w2a-quartus'), + part='Arria10', + backend='Quartus', + 
hls_config=config, + ) + hls_model.compile() + y_hls4ml = hls_model.predict(X) + + np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) + + +def test_cnv_2w2a(): + # download test model + dl_dir = test_root_path + dl_file = str(dl_dir / "qonnx-cnv-2w2a.onnx") + cnv_w2a2_qonnx_url = ( + "https://raw.githubusercontent.com/fastmachinelearning/" + "QONNX_model_zoo/main/models/CIFAR10/Brevitas_FINN_CNV/CNV_2W2A.onnx" + ) + urllib.request.urlretrieve(cnv_w2a2_qonnx_url, dl_file) + assert os.path.isfile(dl_file) + out_clean = str(dl_dir / "qonnx-cnv-2w2a-clean.onnx") + out_chanlast = str(dl_dir / "qonnx-cnv-2w2a-clean-channels-last.onnx") + out_file = str(dl_dir / "qonnx-cnv-2w2a-clean-channels-last-clean.onnx") + + # cleanup + qonnx.util.cleanup.cleanup(dl_file, out_file=out_clean) + qonnx.util.to_channels_last.to_channels_last(out_clean, make_input_channels_last=True, out_file=out_chanlast) + qonnx.util.cleanup.cleanup(out_chanlast, out_file=out_file) + model = ModelWrapper(out_file) + + # Execute QONNX model inference + # TODO make the test bigger + ishape = (1, 32, 32, 3) + np.random.seed(1) + X = np.random.uniform(low=-1, high=+1, size=np.product(ishape)).reshape(ishape).astype(np.float32) + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + + # Convert QONNX model, compile, and run inference + config = hls4ml.utils.config_from_onnx_model(model) + config['Model']['Precision'] = 'ap_fixed<32,16>' + # Some hand-derived config + # TODO should be auto-derived by QuantizeDenseOutput pass after some adaptation + + hls_model = hls4ml.converters.convert_from_onnx_model( + model, + output_dir=str(test_root_path / 'hls4mlprj_qonnx_cnv-2w2a'), + part='xcu250-figd2104-2L-e', + io_type='io_stream', + hls_config=config, + ) + hls_model.compile() + y_hls4ml = hls_model.predict(X) + + np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) + + +@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +def test_jet_tagging(backend): + # download test model + dl_dir = test_root_path + dl_file = dl_dir / "qkeras_jettagging.onnx" + jet_tagging_qonnx_url = ( + "https://raw.githubusercontent.com/fastmachinelearning/" + "QONNX_model_zoo/main/models/JetTagging/QKeras_hls4ml_3layer/qkeras_jettagging.onnx" + ) + urllib.request.urlretrieve(jet_tagging_qonnx_url, dl_file) + assert os.path.isfile(dl_file) + out_file = dl_dir / "qkeras_jettagging-clean.onnx" + + # cleanup + qonnx.util.cleanup.cleanup(dl_file, out_file=out_file) + model = ModelWrapper(out_file) + + # Execute QONNX model inference + # TODO make the test bigger + ishape = (1, 16) + np.random.seed(0) + X = np.random.uniform(low=-1, high=+1, size=np.product(ishape)).reshape(ishape).astype(np.float32) + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + + # Convert QONNX model, compile, and run inference + config = hls4ml.utils.config_from_onnx_model(model) + # Some hand-derived config + # TODO should be auto-derived by QuantizeDenseOutput pass after some adaptation + + hls_model = hls4ml.converters.convert_from_onnx_model( + model, output_dir=str(test_root_path / f'hls4mlprj_qonnx_jettag_{backend}'), backend=backend, hls_config=config + ) + hls_model.compile() + y_hls4ml = hls_model.predict(X) + + np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) + + +if __name__ == '__main__': + test_tfc_2w2a() From cda7208675c85ffadbcde4ce873521bf9187d7c1 Mon Sep 17 
00:00:00 2001 From: Jovan Mitrevski Date: Tue, 11 Jul 2023 19:41:59 -0500 Subject: [PATCH 020/272] first migration of onnx parsing --- hls4ml/converters/onnx/convolution.py | 127 +++++++------- hls4ml/converters/onnx/core.py | 103 ++++++------ hls4ml/converters/onnx/merge.py | 24 +-- hls4ml/converters/onnx/pooling.py | 84 ++++------ hls4ml/converters/onnx/reshape.py | 37 ++--- hls4ml/converters/onnx_to_hls.py | 227 ++++++++++---------------- 6 files changed, 258 insertions(+), 344 deletions(-) diff --git a/hls4ml/converters/onnx/convolution.py b/hls4ml/converters/onnx/convolution.py index 39b2232169..85dc0ca804 100644 --- a/hls4ml/converters/onnx/convolution.py +++ b/hls4ml/converters/onnx/convolution.py @@ -1,85 +1,72 @@ -from hls4ml.converters.onnx_to_hls import ( - compute_pads_1d, - compute_pads_2d, - get_onnx_attribute, - get_onnx_input_name, - onnx_handler, -) -from hls4ml.converters.utils import compute_padding_1d, compute_padding_2d +import numpy as np + +from hls4ml.converters.onnx_to_hls import get_onnx_attribute, onnx_handler @onnx_handler('Conv') -def parse_conv_layer(reader, node, inputs_map, input_shapes, graph, config): +def parse_conv_layer(node, input_names, input_shapes, graph): layer = {} layer['name'] = node.name - layer['data_format'] = 'channels_first' # ONNX's default is channel first - layer['inputs'] = get_onnx_input_name(node, graph) - reader.add_input(layer['name'], node.input) + if node.domain != 'qonnx.custom_op.channels_last': + raise RuntimeError("Please convert the model to channels-last format with qonnx-to-channels-last") + layer['data_format'] = 'channels_last' # QONNX needs to be channels-last. + layer['inputs'] = input_names + layer['outputs'] = node.output strides = get_onnx_attribute(node, 'strides') kernel_shape = get_onnx_attribute(node, 'kernel_shape') - - if len(input_shapes[0]) == 3: # Conv1D - layer['class_name'] = 'Conv1D' - - layer['in_width'] = input_shapes[0][2] - layer['n_chan'] = input_shapes[0][1] - layer['filt_width'] = kernel_shape[0] - layer['n_filt'] = reader.get_weights_data(layer['name'], 'kernel').shape[2] - layer['stride_width'] = strides[0] - pads = compute_pads_1d(node, layer) - + # Note: currently don't have support for auto_pad. 
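+    # Editorial note (added in review): the output sizes computed below follow the
+    # ONNX convention,
+    #   out = floor((in + pad_begin + pad_end - ((kernel - 1) * dilation + 1)) / stride) + 1,
+    # which for non-negative sizes is equivalent to the
+    # ceil((full - eff_kernel + 1) / stride) expressions used here.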
+    pads = get_onnx_attribute(node, 'pads')
+    dilations = get_onnx_attribute(node, 'dilations')
+    if dilations is None:
+        dilations = [1] * len(kernel_shape)
+
+    if get_onnx_attribute(node, 'group') != 1:
+        raise ValueError("Only 1 group supported currently")
+
+    layer['in_width'] = input_shapes[0][-2]
+    layer['n_chan'] = input_shapes[0][-1]
+    layer['n_filt'] = input_shapes[1][0]
+
+    layer['n_dim'] = len(input_shapes[0]) - 2  # 2 comes from channels and batch dimensions
+    if layer['n_dim'] not in (1, 2):
+        raise ValueError("Only 1D and 2D convolutions are supported")
+    layer['class_name'] = 'Conv'
+
+    # set some values needed later
+    if layer['n_dim'] == 1:
+        # this is 1D convolution
+        full_width = layer['in_width'] + pads[0] + pads[1]
+        eff_kernel_width = kernel_shape[0] * dilations[0]
+        layer['out_width'] = int(np.ceil((full_width - eff_kernel_width + 1) / strides[0]))
+        # for compatibility interpret some variables
         layer['pad_left'] = pads[0]
         layer['pad_right'] = pads[1]
-
-    if all(x == 0 for x in pads):  # No padding, i.e., 'VALID' padding
-        layer['padding'] = 'valid'
-    else:
-        layer['padding'] = 'same'
-
-    (layer['out_width'], _, _) = compute_padding_1d(
-        layer['padding'], layer['in_width'], layer['stride_width'], layer['filt_width']
-    )
-
-    output_shape = [input_shapes[0][0], layer['n_filt'], layer['out_width']]
-
-    elif len(input_shapes[0]) == 4:  # Conv2D
-        layer['class_name'] = 'Conv2D'
-
-        layer['in_height'] = input_shapes[0][2]
-        layer['in_width'] = input_shapes[0][3]
-        layer['n_chan'] = input_shapes[0][1]
-
+        layer['filt_width'] = kernel_shape[0]
+        layer['stride_width'] = strides[0]
+        layer['dilation_width'] = dilations[0]
+    else:
+        # 2d
+        layer['in_height'] = input_shapes[0][-3]
+        full_height = layer['in_height'] + pads[0] + pads[2]
+        eff_kernel_height = kernel_shape[0] * dilations[0]
+        out_height = int(np.ceil((full_height - eff_kernel_height + 1) / strides[0]))
+        layer['out_height'] = out_height
+
+        full_width = input_shapes[0][-2] + pads[1] + pads[3]
+        eff_kernel_width = kernel_shape[1] * dilations[1]
+        out_width = int(np.ceil((full_width - eff_kernel_width + 1) / strides[1]))
+        layer['out_width'] = out_width
+        # for compatibility interpret some variables
+        layer['pad_top'] = pads[0]
+        layer['pad_left'] = pads[1]
+        layer['pad_bottom'] = pads[2]
+        layer['pad_right'] = pads[3]
         layer['filt_height'] = kernel_shape[0]
         layer['filt_width'] = kernel_shape[1]
-
-        layer['n_filt'] = next(
-            (x.type.tensor_type.shape.dim[1].dim_value for x in graph.value_info if x.name == node.output[0]), None
-        )
         layer['stride_height'] = strides[0]
         layer['stride_width'] = strides[1]
-        pads = compute_pads_2d(node, layer)
-
-        layer['pad_top'] = pads[0]
-        layer['pad_bottom'] = pads[2]
-        layer['pad_left'] = pads[1]
-        layer['pad_right'] = pads[3]
-
-        if all(x == 0 for x in pads):  # No padding, i.e., 'VALID' padding in Keras/Tensorflow
-            layer['padding'] = 'valid'
-        else:  # Only 'valid' and 'same' padding are available in Keras
-            layer['padding'] = 'same'
-
-        (layer['out_height'], layer['out_width'], _, _, _, _) = compute_padding_2d(
-            layer['padding'],
-            layer['in_height'],
-            layer['in_width'],
-            layer['stride_height'],
-            layer['stride_width'],
-            layer['filt_height'],
-            layer['filt_width'],
-        )
-
-        output_shape = [input_shapes[0][0], layer['n_filt'], layer['out_height'], layer['out_width']]
+        layer['stride_height'] = strides[0]
+        layer['stride_width'] = strides[1]
+        layer['dilation_height'] = dilations[0]
+        layer['dilation_width'] = dilations[1]
 
-    return layer, output_shape
+    return layer
 
diff --git a/hls4ml/converters/onnx/core.py b/hls4ml/converters/onnx/core.py
index
940b860870..c6aaa6009c 100644 --- a/hls4ml/converters/onnx/core.py +++ b/hls4ml/converters/onnx/core.py @@ -1,28 +1,20 @@ -from hls4ml.converters.onnx_to_hls import get_onnx_attribute, get_onnx_input_name, onnx_handler +import numpy as np +from hls4ml.converters.onnx_to_hls import get_onnx_attribute, onnx_handler -@onnx_handler(*['Gemm', 'MatMul']) -def parse_gemm_layer(reader, node, inputs_map, input_shapes, graph, config): + +@onnx_handler('MatMul') +def parse_matmul_layer(node, input_names, input_shapes, graph): layer = {} - layer['class_name'] = 'Dense' + layer['class_name'] = 'MatMul' layer['name'] = node.name - layer['inputs'] = get_onnx_input_name(node, graph) - - tran_weight = get_onnx_attribute(node, 'transB', 0) - reader.add_input(layer['name'], node.input, tran_weight) - - weights_shape = reader.get_weights_data(layer['name'], 'kernel').shape - layer['n_in'] = weights_shape[0] - layer['n_out'] = weights_shape[1] - - output_shape = input_shapes[0][:] - output_shape[-1] = layer['n_out'] + layer['inputs'] = input_names + layer['outputs'] = list(node.output) - return layer, output_shape + return layer -# ------------------Global paras for activations # TODO: repair HardSigmoid support # https://github.com/fastmachinelearning/hls4ml/issues/409 activation_layers = [ @@ -37,7 +29,7 @@ def parse_gemm_layer(reader, node, inputs_map, input_shapes, graph, config): 'Softmax', 'Softsign', 'Softplus', - 'Clip', + # 'Clip', ] activation_map = { @@ -53,70 +45,89 @@ def parse_gemm_layer(reader, node, inputs_map, input_shapes, graph, config): 'Softmax': 'Softmax', 'Softsign': 'Activation', 'Softplus': 'Activation', - 'Clip': 'Clip', + # 'Clip': 'Clip', } # --------- @onnx_handler(*activation_layers) -def parse_activation_layer(reader, node, inputs_map, input_shapes, graph, config): +def parse_activation_layer(node, input_names, input_shapes, graph): layer = {} layer['name'] = node.name layer['class_name'] = activation_map[node.op_type] layer['activation'] = node.op_type.lower() - layer['inputs'] = get_onnx_input_name(node, graph) + layer['inputs'] = input_names + layer['outputs'] = list(node.output) if layer['class_name'] != 'Activation': if layer['class_name'] == 'Softmax': layer['activation'] = 'softmax' + layer['axis'] = get_onnx_attribute(node, 'axis', -1) elif layer['class_name'] in ['ELU', 'LeakyReLU', 'ThresholdedReLU']: layer['activation'] = layer['class_name'] layer['activ_param'] = get_onnx_attribute(node, 'alpha', 0.01) - elif layer['class_name'] == 'Clip': - clip_min_node = [x for x in graph.initializer if x.name in node.input] - clip_min = clip_min_node[0].float_data[0] + # # Don't yet support Clip + # elif layer['class_name'] == 'Clip': + # clip_min_node = [x for x in graph.initializer if x.name in input_names] + # clip_min = clip_min_node[0].float_data[0] - # Check if it's relu or not - if clip_min == 0.0: - layer['class_name'] = 'Activation' - layer['activation'] = 'ReLU' - else: - raise Exception('Clip with min != 0 is not supported yet!') + # # Check if it's relu or not + # if clip_min == 0.0: + # layer['class_name'] = 'Activation' + # layer['activation'] = 'ReLU' + # else: + # raise Exception('Clip with min != 0 is not supported yet!') else: layer['activation'] = layer['class_name'] layer['class_name'] = 'Activation' - return layer, [shape for shape in input_shapes[0]] + return layer @onnx_handler('BatchNormalization') -def parse_batchnorm_layer(reader, node, inputs_map, input_shapes, graph, config): +def parse_batchnorm_layer(node, input_names, input_shapes, graph): layer = {} 
- layer['class_name'] = 'BatchNormalization' - layer['data_format'] = 'channels_first' + layer['class_name'] = 'BatchNormOnnx' layer['name'] = node.name - layer['inputs'] = get_onnx_input_name(node, graph) + layer['inputs'] = input_names + layer['outputs'] = list(node.output) # Other attributes - layer['epsilon'] = get_onnx_attribute(node, 'epsilon') - layer['momentum'] = get_onnx_attribute(node, 'momentum') + layer['epsilon'] = get_onnx_attribute(node, 'epsilon', 1e-05) + # layer['momentum'] = get_onnx_attribute(node, 'momentum', 0.9) # not used - reader.add_input(layer['name'], node.input) - - in_size = 1 - for dim in input_shapes[0][1:]: - in_size *= dim - - layer['n_in'] = layer['n_out'] = in_size + layer['n_in'] = layer['n_out'] = np.prod(input_shapes[0][1:]) if len(input_shapes[0]) == 2: layer['n_filt'] = -1 elif len(input_shapes[0]) > 2: - layer['n_filt'] = input_shapes[0][1] # Always channel first for onnx + if node.domain != 'qonnx.custom_op.channels_last': + raise RuntimeError("Please convert the model to channels-last format with qonnx-to-channels-last") + layer['data_format'] = 'channels_last' # QONNX needs to be channels-last. + layer['n_filt'] = input_shapes[0][-1] + else: + raise RuntimeError(f"Unexpected input shape: {input_shapes[0]}") + + return layer + + +@onnx_handler('Quant') +def parse_quant_layer(node, input_names, input_shapes, graph): + layer = {} + + layer['class_name'] = 'Quant' + layer['name'] = node.name + layer['inputs'] = input_names + layer['outputs'] = list(node.output) + + # Other attributes + layer['narrow'] = bool(get_onnx_attribute(node, 'narrow')) + layer['rounding_mode'] = get_onnx_attribute(node, 'rounding_mode') + layer['signed'] = bool(get_onnx_attribute(node, 'signed')) - return layer, [shape for shape in input_shapes[0]] + return layer diff --git a/hls4ml/converters/onnx/merge.py b/hls4ml/converters/onnx/merge.py index 9ccd432d18..2309cc213f 100644 --- a/hls4ml/converters/onnx/merge.py +++ b/hls4ml/converters/onnx/merge.py @@ -1,16 +1,16 @@ -from hls4ml.converters.onnx_to_hls import get_onnx_attribute, get_onnx_input_name, onnx_handler +from hls4ml.converters.onnx_to_hls import get_onnx_attribute, onnx_handler -merge_layers = ['Add', 'Sub', 'Mul', 'Average', 'Max', 'Min', 'Concat', 'Sum'] +merge_layers = ['Add', 'Sub', 'Mul', 'Div', 'Average', 'Max', 'Min', 'Concat', 'Sum'] @onnx_handler(*merge_layers) -def parse_merge_layer(reader, node, inputs_map, input_shapes, graph, config): +def parse_merge_layer(node, input_names, input_shapes, graph): layer = {} layer['class_name'] = node.op_type layer['name'] = node.name layer['op'] = layer['class_name'].lower() - layer['inputs'] = get_onnx_input_name(node, graph) - output_shape = input_shapes[0] + layer['inputs'] = input_names + layer['outputs'] = list(node.output) if layer['class_name'] == 'Concat': rank = len(input_shapes[0][1:]) @@ -21,22 +21,10 @@ def parse_merge_layer(reader, node, inputs_map, input_shapes, graph, config): layer['op'] = layer['class_name'].lower() + f'{rank}d' layer['axis'] = get_onnx_attribute(node, 'axis') - # Calculate output shape - new_dim = sum( - [x.type.tensor_type.shape.dim[layer['axis']].dim_value for x in graph.value_info if x.name in node.input] - ) - output_shape[layer['axis']] = new_dim - - elif layer['class_name'] == 'Add': - # Check if the layer is an AddBias - for input in node.input: - if "bias" in input: - layer['class_name'] = 'BiasAdd' - reader.add_input(layer['name'], node.input) else: layer['class_name'] = 'Merge' if len(layer['inputs']) > 2: raise 
Exception('ERROR: Merging more than two tensors is not yet supported.') - return layer, output_shape + return layer diff --git a/hls4ml/converters/onnx/pooling.py b/hls4ml/converters/onnx/pooling.py index 67fa76c7c7..1f5c431004 100644 --- a/hls4ml/converters/onnx/pooling.py +++ b/hls4ml/converters/onnx/pooling.py @@ -1,26 +1,30 @@ -from hls4ml.converters.onnx_to_hls import ( - compute_pads_1d, - compute_pads_2d, - get_onnx_attribute, - get_onnx_input_name, - onnx_handler, -) -from hls4ml.converters.utils import compute_padding_1d, compute_padding_2d +import numpy as np + +from hls4ml.converters.onnx_to_hls import get_onnx_attribute, onnx_handler pool_operations = ['AveragePool', 'MaxPool'] @onnx_handler(*pool_operations) -def parse_pool_layer(reader, node, inputs_map, input_shapes, graph, config): +def parse_pool_layer(node, input_names, input_shapes, graph): layer = {} layer['name'] = node.name - layer['inputs'] = get_onnx_input_name(node, graph) + layer['inputs'] = input_names + layer['outputs'] = list(node.output) + if node.domain != 'qonnx.custom_op.channels_last': + raise RuntimeError("Please convert the model to channels-last format with qonnx-to-channels-last") layer['class_name'] = node.op_type - layer['data_format'] = 'channels_first' # Default ONNX + layer['data_format'] = 'channels_last' # Default QONNX info = layer['class_name'].replace('Pool', '') strides = get_onnx_attribute(node, 'strides') kernel_shape = get_onnx_attribute(node, 'kernel_shape') + pads = get_onnx_attribute(node, 'pads') + layer['pads'] = pads + dilations = get_onnx_attribute(node, 'dilations') + if dilations is None: + dilations = [1] * len(kernel_shape) + layer['dilations'] = dilations if len(input_shapes[0]) == 3: # 1D layer['class_name'] = info + 'Pooling1D' @@ -31,70 +35,50 @@ def parse_pool_layer(reader, node, inputs_map, input_shapes, graph, config): layer['pool_width'] = kernel_shape[0] layer['stride_width'] = strides[0] - # Padding - pads = compute_pads_1d(node, layer) - layer['pad_left'] = pads[0] - layer['pad_right'] = pads[1] - - if all(x == 0 for x in pads): # No padding, i.e., 'VALID' padding - layer['padding'] = 'valid' - else: - layer['padding'] = 'same' - - (layer['n_out'], _, _) = compute_padding_1d( - layer['padding'], layer['n_in'], layer['stride_width'], layer['pool_width'] + # formula from ONNX Operators.md documentation + layer['n_out'] = int( + np.floor((layer['n_in'] + np.sum(pads) - ((kernel_shape[0] - 1) * dilations[0] + 1)) / strides[0] + 1) ) - output_shape = [input_shapes[0][0], layer['n_filt'], layer['n_out']] - elif len(input_shapes[0]) == 4: # 2D layer['class_name'] = info + 'Pooling2D' - layer['n_filt'] = input_shapes[0][1] - layer['in_height'] = input_shapes[0][2] - layer['in_width'] = input_shapes[0][3] + layer['n_filt'] = input_shapes[0][3] + layer['in_height'] = input_shapes[0][1] + layer['in_width'] = input_shapes[0][2] layer['stride_height'] = strides[0] layer['stride_width'] = strides[1] layer['pool_height'] = layer['filt_height'] = kernel_shape[0] layer['pool_width'] = layer['filt_width'] = kernel_shape[1] - pads = compute_pads_2d(node, layer) layer['pad_top'] = pads[0] layer['pad_bottom'] = pads[2] layer['pad_left'] = pads[1] layer['pad_right'] = pads[3] - if all(x == 0 for x in pads): # No padding, i.e., 'VALID' padding in Keras/Tensorflow - layer['padding'] = 'valid' - else: # Only 'valid' and 'same' padding are available in Keras - layer['padding'] = 'same' - - (layer['out_height'], layer['out_width'], _, _, _, _) = compute_padding_2d( - layer['padding'], - 
layer['in_height'], - layer['in_width'], - layer['stride_height'], - layer['stride_width'], - layer['filt_height'], - layer['filt_width'], + # formula from ONNX Operators.md documentation + layer['out_height'] = int( + np.floor((layer['in_height'] + pads[0] + pads[2] - ((kernel_shape[0] - 1) * dilations[0] + 1)) / strides[0] + 1) + ) + layer['out_width'] = int( + np.floor((layer['in_width'] + pads[1] + pads[3] - ((kernel_shape[1] - 1) * dilations[1] + 1)) / strides[1] + 1) ) - output_shape = [input_shapes[0][0], layer['n_filt'], layer['out_height'], layer['out_width']] - - return layer, output_shape + return layer global_pooling_layers = ['GlobalMaxPool', 'GlobalAveragePool'] @onnx_handler(*global_pooling_layers) -def parse_global_pooling_layer(reader, node, inputs_map, input_shapes, graph, config): +def parse_global_pooling_layer(node, input_names, input_shapes, graph): layer = {} layer['name'] = node.name - layer['inputs'] = get_onnx_input_name(node, graph) + layer['inputs'] = input_names + layer['outputs'] = list(node.output) layer['class_name'] = node.op_type - layer['data_format'] = 'channels_first' + layer['data_format'] = 'channels_last' # default QONNX # Sonme default parameters for global pooling layer['n_out'] = 1 @@ -116,6 +100,4 @@ def parse_global_pooling_layer(reader, node, inputs_map, input_shapes, graph, co layer['in_height'] = input_shapes[0][2] layer['in_width'] = input_shapes[0][3] - output_shape = [input_shapes[0][0], layer['n_filt']] + [1] * (len(input_shapes[0]) - 2) - - return layer, output_shape + return layer diff --git a/hls4ml/converters/onnx/reshape.py b/hls4ml/converters/onnx/reshape.py index 5bbf58b079..9ef20f03d7 100644 --- a/hls4ml/converters/onnx/reshape.py +++ b/hls4ml/converters/onnx/reshape.py @@ -1,39 +1,38 @@ -import numpy as np - -from hls4ml.converters.onnx_to_hls import get_onnx_input_name, onnx_handler +from hls4ml.converters.onnx_to_hls import onnx_handler @onnx_handler('Transpose') -def parse_transpose_layer(reader, node, inputs_map, input_shapes, graph, config): +def parse_transpose_layer(node, input_names, input_shapes, graph): layer = {} layer['name'] = node.name layer['class_name'] = 'Transpose' - layer['inputs'] = get_onnx_input_name(node, graph) + layer['inputs'] = input_names + layer['outputs'] = list(node.output) perm = [list(i.ints) for i in node.attribute][0] # This will get something like [[a,b,c]][0] = [a,b,c] layer['perm'] = [x - 1 for x in perm[1:]] # Ignore the batch dimension in ONNX, and adjust the perm indexing - output_shape = [input_shapes[0][i] for i in perm] - - return layer, output_shape + return layer @onnx_handler('Reshape') -def parse_reshape_layer(reader, node, inputs_map, input_shapes, graph, config): +def parse_reshape_layer(node, input_names, input_shapes, graph): layer = {} layer['name'] = node.name layer['class_name'] = 'Reshape' - layer['inputs'] = get_onnx_input_name(node, graph) + layer['inputs'] = input_names + layer['outputs'] = list(node.output) - target_shape = list([x for x in graph.initializer if x.name == node.input[1]][0].int64_data)[1:] + return layer - if -1 in target_shape: # Need to infer shape for -1 - print("WARNING: Inferring -1 shape ... 
") - dummy_x = np.ones(input_shapes[0][1:]) - dummy_y = np.reshape(dummy_x, target_shape) - target_shape = list(dummy_y.shape) - layer['target_shape'] = target_shape - output_shape = input_shapes[0][:1] + layer['target_shape'] +@onnx_handler('Flatten') +def parse_flatten_layer(node, input_names, input_shapes, graph): + layer = {} + layer['name'] = node.name + layer['class_name'] = 'Reshape' + layer['inputs'] = input_names + layer['outputs'] = list(node.output) + layer['target_shape'] = [-1] # does not contain batch dimension - return layer, output_shape + return layer diff --git a/hls4ml/converters/onnx_to_hls.py b/hls4ml/converters/onnx_to_hls.py index 106daf62da..8f6c7461fb 100644 --- a/hls4ml/converters/onnx_to_hls.py +++ b/hls4ml/converters/onnx_to_hls.py @@ -1,78 +1,10 @@ -import numpy as np import onnx -from onnx import helper, numpy_helper, shape_inference +from onnx import helper, numpy_helper from hls4ml.model import ModelGraph -MAXMULT = 4096 - -class ONNXDataReader: - """ - ONNX data reader to be used for extracting relevant information during conversion. - """ - - def __init__(self, model): - self.model = model - self.input_map = {} - self.index_map = { - # Dense - 'kernel': 1, - 'bias': 2, - # BatchNormalization - 'gamma': 1, - 'beta': 2, - 'moving_mean': 3, - 'moving_variance': 4, - } - - def get_weights_data(self, layer_name, var_name): - """Extract weights data from ONNX model. - - Args: - layer_name (str): Layer's name in the ONNX model. - var_name (str): Variable to be extracted. - - Returns: - ndarray: Extracted weights data. - """ - # Get the node associated with the layer name - node = next(node for node in self.model.graph.node if node.name == layer_name) - - inputs = self.input_map[layer_name] - inp_idx = self.index_map[var_name] - - if inp_idx >= len(inputs['inputs']): - # Check if the layer is an AddBias layer - if (node.op_type == 'Add') and (var_name == 'bias'): - inp_idx = 1 - else: - # Input not found, likely a bias tensor is not available - return None - - tensor = next((x for x in self.model.graph.initializer if x.name == inputs['inputs'][inp_idx]), None) - - if tensor is not None: - data = numpy_helper.to_array(tensor) - - if inputs['transpose']: - if inputs['perm'] is not None and len(data.shape) == len(inputs['perm']): - data = data.transpose(inputs['perm']) - else: - data = data.transpose() - - # Check for transB in Gemm - if node.op_type == 'Gemm': - if not get_onnx_attribute(node, 'transB'): - data = data.transpose() - - return data - - def add_input(self, layer_name, inputs, transpose=True, perm=None): - self.input_map[layer_name] = {'inputs': inputs, 'transpose': transpose, 'perm': perm} - - -# ----------------------Helpers--------------------- # +# ----------------------Helpers--------------------- def sanitize_layer_name(layer): new_name = layer['name'] if new_name[0].isdigit(): @@ -99,9 +31,52 @@ def get_onnx_attribute(operation, name, default=None): return value -def get_input_shape(model, operation, input_idx=0): - value_info_idx = next((i for i, x in enumerate(model.graph.value_info) if x.name == operation.input[input_idx]), 0) - return [d.dim_value for d in model.graph.value_info[value_info_idx].type.tensor_type.shape.dim] +def get_global_input_shape(graph, inp): + """Return the global input shape of the graph with name inp + + Arguments: + graph: the onnx graph + inp (str): the global input name + + Returns: + list: The shape + + Raises: + StopIteration: If the global input name is not found + """ + inp_shape = 
next(x.type.tensor_type.shape.dim for x in graph.input if x.name == inp)
+    return list(x.dim_value for x in inp_shape)
+
+
+def get_input_shape(graph, node):
+    """Return the input shapes of the node in the model
+
+    Arguments:
+        graph: the onnx graph
+        node: the onnx node for which the input is desired
+
+    Returns:
+        list of lists: The shapes of all the inputs
+
+    Raises:
+        StopIteration: If an input name is not found in the graph
+    """
+    rv = []
+    for inp in node.input:
+        try:
+            value_info_idx = next((i for i, x in enumerate(graph.value_info) if x.name == inp))
+            dim = list(d.dim_value for d in graph.value_info[value_info_idx].type.tensor_type.shape.dim)
+        except StopIteration:
+            # The input is not in the value_info, so it is likely a global input
+            dim = get_global_input_shape(graph, inp)
+        if dim:
+            rv.append(dim)
+    return rv
+
+
+def get_constant_value(graph, constant_name):
+    tensor = next((x for x in graph.initializer if x.name == constant_name), None)
+    return numpy_helper.to_array(tensor)


 def compute_pads_1d(operation, layer):
@@ -155,7 +130,7 @@ def compute_pads_2d(operation, layer):
     return pads


-# ----------------------Layer handling--------------------- #
+# ----------------------Layer handling---------------------

 layer_handlers = {}

@@ -178,27 +153,6 @@ def decorator(function):
     return decorator


-# --->> A set of functions to address the naming convetion in ONNx's graph
-def get_onnx_input_name(node, graph):
-    """
-    In ONNX, when calling node.input, it returns the node input's index in the graph instead of the input's name.
-    However, the input's name is used for indexing in ModelGraph's graph. This function return the input node's name instead.
-    """
-
-    in_node = [in_node for in_node in graph.node if (in_node.output[0] in node.input)]
-
-    if in_node:
-        if in_node[0].op_type != 'Flatten':
-            input_node_name = [x.name for x in in_node]
-        else:  # IF it's a flatten
-            input_node_name = [x.name for x in graph.node if (x.output[0] in in_node[0].input)]
-
-        return input_node_name
-
-    else:  # If there is no input name it's actually the first layer
-        return [replace_char_inconsitency(node.input[0])]
-
-
 def get_out_layer_name(graph):
     """
     Get the output layer's name for the model.
@@ -226,18 +180,16 @@ def onnx_to_hls(config):

     # Extract model architecture
     print('Interpreting Model ...')
-    model = onnx.load(config['OnnxModel']) if isinstance(config['OnnxModel'], str) else config['OnnxModel']
+    onnx_model = onnx.load(config['OnnxModel']) if isinstance(config['OnnxModel'], str) else config['OnnxModel']

-    model = shape_inference.infer_shapes(model)
-    graph = model.graph
-
-    reader = ONNXDataReader(model)
+    # We don't infer the shapes because the qonnx package preprocessing does it.
# Obtain list of input/ouput layers
-    all_inputs = [x.name for x in model.graph.input]
-    all_initializers = [x.name for x in model.graph.initializer]
+    all_inputs = [x.name for x in onnx_model.graph.input]
+    all_initializers = [x.name for x in onnx_model.graph.initializer]
     input_layers = [x for x in all_inputs if x not in all_initializers]
-    output_layers = get_out_layer_name(graph)
+    constant_layers = all_initializers  # no need to copy it even though we change it
+    output_layers = get_out_layer_name(onnx_model.graph)

     print("Output layers: ", output_layers)

@@ -245,69 +197,64 @@
         input_layer = {}
         input_layer['name'] = replace_char_inconsitency(inp)
         input_layer['class_name'] = 'InputLayer'
-        inp_shape = next((x.type.tensor_type.shape.dim for x in model.graph.input if x.name == inp), None)
-        input_layer['input_shape'] = [x.dim_value for x in inp_shape]
-
-        if len(input_layer['input_shape']) > 1:
-            input_layer['input_shape'][0] = None  # Firt dim is batch
+        inp_shape = get_global_input_shape(onnx_model.graph, inp)
+        # We only support ONNX where the first dimension is the batch dimension.
+        # Remove the batch dimension in all subsequent use
+        input_layer['input_shape'] = inp_shape[1:]
+        print('Input shape:', input_layer['input_shape'])

         # Clean the layer name for specific models
         sanitize_layer_name(input_layer)
         input_layers[i] = input_layer['name']

         layer_list.append(input_layer)

+    for i, constant in enumerate(constant_layers):
+        constant_layer = {}
+        constant_layer['name'] = replace_char_inconsitency(constant)
+        constant_layer['class_name'] = 'Constant'
+        constant_layer['value'] = get_constant_value(onnx_model.graph, constant)
+
+        # Clean the layer name for specific models
+        sanitize_layer_name(constant_layer)
+        constant_layers[i] = constant_layer['name']
+
+        layer_list.append(constant_layer)
+
     # Defined supported layers and check for unsupported layer type
-    skip_layers = ['Dropout', 'Identity', 'Flatten']
+    skip_layers = ['Dropout', 'Identity']

     # Map inputs of skipped layers
     inputs_map = {}

     supported_layers = get_supported_onnx_layers() + skip_layers

-    # Get input shape
-    current_shape = [input_layer['input_shape']]
-    print('Input shape:', current_shape[0])
-
-    # Loop through layers
-    layer_counter = 0
-
-    # Output shape tracking
-    output_shape = None
-
     print('Topology:')
-    for node in graph.node:
+    for node in onnx_model.graph.node:
         if node.op_type not in supported_layers:
             raise Exception(f'ERROR: Unsupported operation type: {node.op_type}')

-        # If not the first layer then input shape is taken from last layer's output
-        if layer_counter != 0:
-            current_shape = [output_shape]
+        # Note that at this point, input shape still contains batch dimension
+        # in cases where it appears. That is not filtered out till later.
+ input_shapes = get_input_shape(onnx_model.graph, node) if node.op_type in skip_layers: - if node.op_type == 'Flatten': - output_shape = [current_shape[0][0], np.prod(current_shape[0][1:])] - - else: - # Currently supported skipped layers have only one input and output - # Skipped layers can follow each other (e.g., Dropout -> Flatten) - - # Mapping inputs - input_name = inputs_map.get(node.input[0], node.input[0]) - output_name = node.output[0] - inputs_map[output_name] = input_name + # Currently supported skipped layers have only one input and output + # Skipped layers can follow each other - output_shape = current_shape[0] + # Mapping inputs + input_name = inputs_map.get(node.input[0], node.input[0]) + output_name = node.output[0] + inputs_map[output_name] = input_name continue - if node.op_type in supported_layers: - layer_counter = layer_counter + 1 + input_names = [inputs_map.get(x, x) for x in node.input] # Process the layer - layer, output_shape = layer_handlers[node.op_type](reader, node, inputs_map, current_shape, graph, config) + layer = layer_handlers[node.op_type](node, input_names, input_shapes, onnx_model.graph) sanitize_layer_name(layer) - print('Layer name: {}, layer type: {}, current shape: {}'.format(layer['name'], layer['class_name'], current_shape)) + print(f"Layer name: {layer['name']}, layer type: {layer['class_name']}, current shape: {input_shapes}") layer_list.append(layer) ################# @@ -315,5 +262,5 @@ def onnx_to_hls(config): ################# print('Creating HLS model') - hls_model = ModelGraph(config, reader, layer_list, input_layers, output_layers) + hls_model = ModelGraph(config, layer_list, input_layers, output_layers) return hls_model From af47a0d4563d986db0b7412536983d77ed9cedca Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 12 Jul 2023 13:50:51 -0500 Subject: [PATCH 021/272] change tuples to lists --- hls4ml/model/layers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 6a23a9b934..320a1fde57 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -352,7 +352,7 @@ class Constant(Layer): def initialize(self): value = self.attributes['value'] self.value = value # note, this is unquantized; Only here for easier access - shape = value.shape + shape = list(value.shape) if not shape: shape = (1,) self.value = np.array([self.value]) @@ -963,10 +963,10 @@ def initialize(self): if len(inp2.shape) == 1: # mat vec multiply assert inp1.shape[-1] == inp2.shape[0] - shape = tuple(inp1.shape[:-1]) + (inp2.shape[0],) + shape = list(inp1.shape[:-1]) + [inp2.shape[0]] else: assert inp1.shape[-1] == inp2.shape[-2] - shape = tuple(inp1.shape[:-1]) + (inp2.shape[-1],) + shape = list(inp1.shape[:-1]) + [inp2.shape[-1]] if len(shape) > 1: dims = [f'N_LAYER_{i}_{self.index}' for i in range(1, len(shape) + 1)] else: From 8f8cc0b21e23f52c5d750cbbc2ea56104008c6d7 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 12 Jul 2023 18:26:28 -0500 Subject: [PATCH 022/272] snapshot of adding qonnx optimizers --- hls4ml/backends/fpga/fpga_backend.py | 8 +- hls4ml/model/layers.py | 4 +- hls4ml/model/optimizer/__init__.py | 8 + .../model/optimizer/passes/batchnorm_opt.py | 169 ++++++++ .../model/optimizer/passes/conv_to_convxd.py | 90 ++++ .../optimizer/passes/matmul_const_to_dense.py | 58 +++ hls4ml/model/optimizer/passes/merge_const.py | 192 +++++++++ hls4ml/model/optimizer/passes/move_scales.py | 301 ++++++++++++++ .../passes/propagate_conv_precision.py | 77 ++++ 
.../passes/propagate_dense_precision.py | 70 ++++ hls4ml/model/optimizer/passes/qkeras.py | 35 +- hls4ml/model/optimizer/passes/quant_opt.py | 387 ++++++++++++++++++ .../model/optimizer/passes/reshape_const.py | 27 ++ 13 files changed, 1389 insertions(+), 37 deletions(-) create mode 100644 hls4ml/model/optimizer/passes/batchnorm_opt.py create mode 100644 hls4ml/model/optimizer/passes/conv_to_convxd.py create mode 100644 hls4ml/model/optimizer/passes/matmul_const_to_dense.py create mode 100644 hls4ml/model/optimizer/passes/merge_const.py create mode 100644 hls4ml/model/optimizer/passes/move_scales.py create mode 100644 hls4ml/model/optimizer/passes/propagate_conv_precision.py create mode 100644 hls4ml/model/optimizer/passes/propagate_dense_precision.py create mode 100644 hls4ml/model/optimizer/passes/quant_opt.py create mode 100644 hls4ml/model/optimizer/passes/reshape_const.py diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 8cfaec8b3f..97e458f7fd 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -13,6 +13,8 @@ LSTM, Activation, BatchNormalization, + BatchNormOnnx, + Conv, Conv1D, Conv2D, Dense, @@ -22,8 +24,10 @@ GarNetStack, GlobalPooling1D, GlobalPooling2D, + MatMul, Pooling1D, Pooling2D, + Quant, SeparableConv1D, SeparableConv2D, SimpleRNN, @@ -63,6 +67,8 @@ def __init__(self, name): LSTM, GRU, Dot, + Conv, + MatMul, ] for layer in accum_layers: @@ -70,7 +76,7 @@ def __init__(self, name): attrs.append(TypeAttribute('accum')) self.attribute_map[layer] = attrs - rf_layers = accum_layers + [BatchNormalization, Activation, Embedding, GarNet, GarNetStack] + rf_layers = accum_layers + [BatchNormalization, Activation, Embedding, GarNet, GarNetStack, Quant, BatchNormOnnx] for layer in rf_layers: attrs = self.attribute_map.get(layer, []) diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 320a1fde57..bd465ff7b9 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -391,8 +391,8 @@ def initialize(self): # need to get it from the input shape_node = self.get_input_node(self.inputs[1]) # for QONNX, remove batch dimension - # (onnx cleaning should have removed reshape dimension) - if shape_node: + # (onnx cleaning should have removed reshapes not on data path) + if isinstance(shape_node, Constant): target_shape = shape_node.value[1:] else: raise RuntimeError("Reshape for ONNX requires the target shape to be a second input.") diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 2e9b197475..db65370e40 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -37,6 +37,14 @@ 'fuse_bias_add', 'remove_useless_transpose', 'expand_layer_group', + 'reshape_constant', + 'quant_constant_parameters', + 'quant_to_activation', + 'fuse_quant_with_constant', + 'quant_to_alpha_activation_alpha', + 'const_quant_to_const_alpha', + 'matmul_const_to_dense', + 'conv_to_conv_x_d', 'output_rounding_saturation_mode', 'qkeras_factorize_alpha', 'extract_ternary_threshold', diff --git a/hls4ml/model/optimizer/passes/batchnorm_opt.py b/hls4ml/model/optimizer/passes/batchnorm_opt.py new file mode 100644 index 0000000000..a7b0c27209 --- /dev/null +++ b/hls4ml/model/optimizer/passes/batchnorm_opt.py @@ -0,0 +1,169 @@ +import numpy as np + +from hls4ml.model.layers import BatchNormalization, BatchNormOnnx, Constant +from hls4ml.model.optimizer import OptimizerPass + +_base_attributes = ('Trace', 'reuse_factor', 'n_in', 'n_filt') + + +class 
BatchNormOnnxConstantParameters(OptimizerPass):
+    """Remove Constant from the BatchNormalization node parameters (but not input[0])"""
+
+    def match(self, node):
+        is_match = isinstance(node, BatchNormOnnx) and any(node.inputs[1:])
+
+        return is_match
+
+    def transform(self, model, node):
+        """
+        Remove Constant from the BatchNormalization node parameters (but not input[0])
+        """
+
+        if not (len(node.inputs) == 5 and all(node.inputs)):
+            raise ValueError(f"All {len(node.inputs)} BatchNormOnnx inputs need to be defined")
+
+        attributes = {k: node.attributes.get(k, None) for k in _base_attributes}
+
+        gamma_node = node.get_input_node(node.inputs[1])
+        if not isinstance(gamma_node, Constant):
+            raise TypeError("Only constant gammas supported")
+        gamma = gamma_node.value
+        attributes['gamma_data'] = gamma
+        node.inputs[1] = ''
+        model.remove_node(gamma_node, rewire=False)
+
+        beta_node = node.get_input_node(node.inputs[2])
+        if not isinstance(beta_node, Constant):
+            raise TypeError("Only constant betas supported")
+        beta = beta_node.value
+        attributes['beta_data'] = beta
+        node.inputs[2] = ''
+        model.remove_node(beta_node, rewire=False)
+
+        moving_mean_node = node.get_input_node(node.inputs[3])
+        if not isinstance(moving_mean_node, Constant):
+            raise TypeError("Only constant moving_means supported")
+        moving_mean = moving_mean_node.value
+        attributes['mean_data'] = moving_mean
+        node.inputs[3] = ''
+        model.remove_node(moving_mean_node, rewire=False)
+
+        moving_variance_node = node.get_input_node(node.inputs[4])
+        if not isinstance(moving_variance_node, Constant):
+            raise TypeError("Only constant moving_variances supported")
+        moving_variance = moving_variance_node.value
+        attributes['variance_data'] = moving_variance
+        node.inputs[4] = ''
+        model.remove_node(moving_variance_node, rewire=False)
+
+        # scale = gamma / np.sqrt(moving_variance + node.get_attr('epsilon'))
+        # bias = beta - gamma * moving_mean / np.sqrt(moving_variance + node.get_attr('epsilon'))
+        # attributes["scale_data"] = scale
+        # attributes["bias_data"] = bias
+
+        new_node = model.make_node(BatchNormalization, node.name, attributes, [node.inputs[0]], [x for x in node.outputs])
+
+        model.replace_node(node, new_node)
+
+        return True
+
+
+class ConstantBatchNormFusion(OptimizerPass):
+    """
+    Merge BatchNorm into Const (after parameters have already been merged in BatchNormalization)
+    """
+
+    def match(self, node):
+        is_match = (
+            isinstance(node, BatchNormalization)
+            and not any(node.inputs[1:])
+            and isinstance(node.get_input_node(node.inputs[0]), Constant)
+            and not node.get_input_node(node.inputs[0]).get_attr("quant_precision")
+        )
+        return is_match
+
+    def transform(self, model, node):
+        """
+        Remove the batch norm
+        """
+        const_node = node.get_input_node(node.inputs[0])
+
+        new_val = const_node.value * node.weights["scale"].data_unquantized + node.weights["bias"].data_unquantized
+        const_node.set_attr("value", new_val)
+        const_node.set_attr("quantizer", node.get_attr("quantizer"))  # None if not defined
+        const_node.set_attr("quant_precision", node.get_attr("quant_precision"))
+
+        # reinitialize (which also runs quantization if quantizer exists)
+        const_node.initialize()
+
+        # remove the batch norm node
+        model.remove_node(node, rewire=True)
+
+        return True
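For intuition, ConstantBatchNormFusion is just an elementwise affine update of the stored constant. A minimal standalone NumPy sketch, with made-up values standing in for const_node.value and the batch-norm weights:

    import numpy as np

    const_value = np.array([1.0, -2.0, 3.0])  # hypothetical Constant payload
    scale = np.array([0.5, 0.5, 0.5])         # batch-norm scale
    bias = np.array([0.1, 0.1, 0.1])          # batch-norm bias

    # what the pass writes back into the Constant node
    new_val = const_value * scale + bias      # -> [0.6, -0.9, 1.6]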
+class FuseConsecutiveBatchNormalization(OptimizerPass):
+    '''
+    OptimizerPass to merge consecutive BatchNormalization layers,
+    only if the earlier one does not have quantization specified
+    '''
+
+    def match(self, node):
+        prev_node = node.get_input_node(node.inputs[0])
+        basic_match = (
+            isinstance(node, BatchNormalization)
+            and isinstance(prev_node, BatchNormalization)
+            and not prev_node.get_attr("quant_precision")
+        )
+
+        # check for compatibility to merge
+        if basic_match:
+            s0 = prev_node.weights['scale'].data_unquantized
+            b0 = prev_node.weights['bias'].data_unquantized
+            s1 = node.weights['scale'].data_unquantized
+            b1 = node.weights['bias'].data_unquantized
+            scale_compatible = (
+                (prev_node.get_attr("scale_quantizer") is None and node.get_attr("scale_quantizer") is None)
+                or (s0 == np.ones_like(s0)).all()
+                or (s1 == np.ones_like(s1)).all()
+            )
+            bias_compatible = (
+                (prev_node.get_attr("bias_quantizer") is None and node.get_attr("bias_quantizer") is None)
+                or (b0 == np.zeros_like(b0)).all()
+                or (b1 == np.zeros_like(b1)).all()
+            )
+            return scale_compatible and bias_compatible
+        else:
+            return False
+
+    def transform(self, model, node):
+        prev_node = node.get_input_node(node.inputs[0])
+
+        s0 = prev_node.weights['scale'].data_unquantized
+        b0 = prev_node.weights['bias'].data_unquantized
+        s1 = node.weights['scale'].data_unquantized
+        b1 = node.weights['bias'].data_unquantized
+
+        s_quantizer = (
+            node.get_attr("scale_quantizer") if (s0 == np.ones_like(s0)).all() else prev_node.get_attr("scale_quantizer")
+        )
+        b_quantizer = (
+            node.get_attr("bias_quantizer") if (b0 == np.zeros_like(b0)).all() else prev_node.get_attr("bias_quantizer")
+        )
+
+        node.set_attr("scale_quantizer", s_quantizer)
+        node.set_attr("bias_quantizer", b_quantizer)
+        if s_quantizer:
+            node.set_attr("scale_precision", s_quantizer.hls_type)
+        if b_quantizer:
+            node.set_attr("bias_precision", b_quantizer.hls_type)
+
+        scale_new = s0 * s1
+        bias_new = s1 * b0 + b1
+
+        # call function so that quantizer would be called if needed
+        node.add_weights(scale_new, quantizer=s_quantizer)
+        node.add_bias(bias_new, quantizer=b_quantizer)
+
+        model.remove_node(prev_node, rewire=True)
+        return True
diff --git a/hls4ml/model/optimizer/passes/conv_to_convxd.py b/hls4ml/model/optimizer/passes/conv_to_convxd.py
new file mode 100644
index 0000000000..28f4d4c0bd
--- /dev/null
+++ b/hls4ml/model/optimizer/passes/conv_to_convxd.py
@@ -0,0 +1,90 @@
+import numpy as np
+
+from hls4ml.model.layers import Constant, Conv, Conv1D, Conv2D
+from hls4ml.model.optimizer import OptimizerPass
+from hls4ml.model.types import IntegerPrecisionType
+
+# these are attributes to copy
+_base_attributes = (
+    'Trace',
+    'reuse_factor',
+    'in_width',
+    'out_width',
+    'n_chan',
+    'n_filt',
+    'pad_left',
+    'pad_right',
+    'filt_width',
+    'stride_width',
+    'dilation_width',
+    'in_height',
+    'out_height',
+    'pad_top',
+    'pad_bottom',
+    'filt_height',
+    'stride_height',
+    'dilation_height',
+    'strategy',
+    'data_format',
+)
+
+
+class ConvToConvXD(OptimizerPass):
+    """Convert Conv with constant to a Conv1D or Conv2D layer"""
+
+    def match(self, node):
+        is_match = isinstance(node, Conv) and (
+            (len(node.inputs) == 2 and isinstance(node.get_input_node(node.inputs[1]), Constant))
+            or (
+                len(node.inputs) == 3
+                and isinstance(node.get_input_node(node.inputs[1]), Constant)
+                and isinstance(node.get_input_node(node.inputs[2]), Constant)
+            )
+        )
+
+        return is_match
+
+    def transform(self, model, node):
+        """Convert Conv with constant to a Conv1D or Conv2D layer"""
+
+        weight_node = node.get_input_node(node.inputs[1])
+        weight_precision = weight_node.get_attr("quant_precision")
+        bias_node = None
+        bias_precision = None
+        if len(node.inputs) == 3:
+            bias_node =
node.get_input_node(node.inputs[2]) + bias_precision = bias_node.get_attr("quant_precision") + + # creating the attributes + attributes = {k: node.attributes.get(k, None) for k in _base_attributes} + + # The ConvxD nodes expect the weight data to be in a different format, not (M, k1.., C) + if node.attributes['n_dim'] == 1: + newtype = Conv1D + attributes["weight_data"] = np.transpose(weight_node.value, (1, 2, 0)) + else: + newtype = Conv2D + attributes["weight_data"] = np.transpose(weight_node.value, (1, 2, 3, 0)) + attributes["weight_precision"] = weight_precision + attributes["weight_quantizer"] = weight_node.get_attr("quantizer") + + if bias_node: + attributes["bias_data"] = bias_node.value + attributes["bias_precision"] = bias_precision + attributes["bias_quantizer"] = bias_node.get_attr("quantizer") + else: + attributes["bias_data"] = np.zeros(attributes['n_filt']) + attributes["bias_precision"] = IntegerPrecisionType(1, False) + + # making new node + new_node = model.make_node( + newtype, f"{newtype.__name__}_{node.name}", attributes, [node.inputs[0]], [x for x in node.outputs] + ) + + # removing and replacing old nodes + model.remove_node(weight_node, rewire=False) + if bias_node: + model.remove_node(bias_node, rewire=False) + model.replace_node(node, new_node) + + return True diff --git a/hls4ml/model/optimizer/passes/matmul_const_to_dense.py b/hls4ml/model/optimizer/passes/matmul_const_to_dense.py new file mode 100644 index 0000000000..82c7b56313 --- /dev/null +++ b/hls4ml/model/optimizer/passes/matmul_const_to_dense.py @@ -0,0 +1,58 @@ +import numpy as np + +from hls4ml.model.layers import Constant, Dense, MatMul +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import IntegerPrecisionType + +_base_attributes = ('Trace', 'reuse_factor', 'weight', 'weight_t', 'bias', 'bias_t') + + +class MatmulConstToDense(OptimizerPass): + """ + Convert MatMul with constant to a dense layer. Note, this only supports the second input + being the constant. If needed, one could add transposes to make that be the case in + other yet to be written optimizers. 
+    """
+
+    def match(self, node):
+        is_match = (
+            isinstance(node, MatMul) and len(node.inputs) == 2 and isinstance(node.get_input_node(node.inputs[1]), Constant)
+        )
+        return is_match
+
+    def transform(self, model, node):
+        """Substitute Matmul + Constant for a single dense"""
+        # determining Constant layer input
+        const_node = node.get_input_node(node.inputs[1])
+        other_var = node.get_input_variable(node.inputs[0])
+
+        weight_precision = const_node.get_attr("quant_precision")
+        weight_quantizer = const_node.get_attr("quantizer")
+
+        in_shape = other_var.shape
+        n_in = np.prod(in_shape)
+        out_shape = list(in_shape[:-1]) + [const_node.value.shape[-1]]
+        n_out = np.prod(out_shape)
+
+        # creating the attributes
+        attributes = {k: node.attributes.get(k, None) for k in _base_attributes}
+        attributes.update(
+            {
+                "weight_data": const_node.value,
+                "weight_precision": weight_precision,
+                "weight_quantizer": weight_quantizer,
+                "bias_data": np.zeros(out_shape),
+                "bias_precision": IntegerPrecisionType(1, False),
+                "n_in": n_in,
+                "n_out": n_out,
+            }
+        )
+
+        # making new node
+        new_dense = model.make_node(Dense, f"Dense_{node.name}", attributes, [node.inputs[0]], [x for x in node.outputs])
+
+        # removing and replacing old nodes
+        model.remove_node(const_node, rewire=False)
+        model.replace_node(node, new_dense)
+
+        return True
diff --git a/hls4ml/model/optimizer/passes/merge_const.py b/hls4ml/model/optimizer/passes/merge_const.py
new file mode 100644
index 0000000000..4e339ccc3f
--- /dev/null
+++ b/hls4ml/model/optimizer/passes/merge_const.py
@@ -0,0 +1,192 @@
+import numpy as np
+
+from hls4ml.converters.onnx.quantizer import QuantNodeQuantizer
+from hls4ml.model.layers import BatchNormalization, Constant, Merge
+from hls4ml.model.optimizer import OptimizerPass
+
+_base_attributes = ('Trace', 'reuse_factor', 'n_in')
+
+# TODO This doesn't yet support quantization in the constants
+
+
+class MergeTwoConstants(OptimizerPass):
+    """Merge of two constants makes another constant"""
+
+    def match(self, node):
+        is_match = (
+            isinstance(node, Merge)
+            and isinstance(node.get_input_node(node.inputs[0]), Constant)
+            and isinstance(node.get_input_node(node.inputs[1]), Constant)
+        )
+
+        return is_match
+
+    def transform(self, model, node):
+        """
+        Merge of two constants makes another constant
+        """
+        const_node0 = node.get_input_node(node.inputs[0])
+        const_node1 = node.get_input_node(node.inputs[1])
+
+        val0 = const_node0.value
+        val1 = const_node1.value
+
+        op = node.attributes["op"]
+        if op in ('add', 'sum'):
+            new_val = val0 + val1
+        elif op == 'sub':
+            new_val = val0 - val1
+        elif op == 'mul':
+            new_val = val0 * val1
+        elif op == 'div':
+            new_val = val0 / val1
+        elif op == 'average':
+            new_val = np.mean(np.array([val0, val1]), axis=0)
+        elif op == 'max':
+            new_val = np.maximum(val0, val1)
+        elif op == 'min':
+            new_val = np.minimum(val0, val1)
+        else:
+            raise RuntimeError(f"Unexpected op_type: {op}")
+
+        quantizer = node.get_attr("quantizer")  # None if not defined
+        if quantizer:
+            const_node0.set_attr("quantizer", quantizer)
+        const_node0.set_attr("value", new_val)
+
+        quant_precision = node.get_attr("quant_precision")
+        if quant_precision:
+            const_node0.set_attr("quant_precision", quant_precision)
+
+        # reinitialize (which also runs quantization if quantizer exists)
+        const_node0.initialize()
+
+        model.remove_node(const_node1, rewire=False)
+
+        # remove the merge node
+        model.remove_node(node, rewire=True)
+
+        return True
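The op table in MergeTwoConstants reduces to plain NumPy on the two payloads; a small sketch with assumed inputs:

    import numpy as np

    val0, val1 = np.array([2.0, 4.0]), np.array([1.0, 4.0])

    folded = {
        'add': val0 + val1,                                  # [3., 8.]
        'sub': val0 - val1,                                  # [1., 0.]
        'mul': val0 * val1,                                  # [2., 16.]
        'average': np.mean(np.array([val0, val1]), axis=0),  # [1.5, 4.]
        'max': np.maximum(val0, val1),                       # [2., 4.]
    }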
+
+
+class MergeToBatchNormalization(OptimizerPass):
+    """Convert Add, Sub, or Mul Merges with a constant to BatchNormalization (Div is handled separately)"""
+
+    def match(self, node):
+        is_match = (
+            isinstance(node, Merge)
+            and node.attributes["op"] in ("add", "sum", "sub", "mul")  # Div is separate
+            and (
+                isinstance(node.get_input_node(node.inputs[0]), Constant)
+                != isinstance(node.get_input_node(node.inputs[1]), Constant)
+            )
+        )
+        # note: != for booleans is xor.
+        return is_match
+
+    def transform(self, model, node):
+        node1 = node.get_input_node(node.inputs[1])
+
+        node1const = isinstance(node1, Constant)
+        if node1const:
+            const_node = node1
+            input_node_idx = 0
+        else:
+            const_node = node.get_input_node(node.inputs[0])
+            input_node_idx = 1
+
+        input_shape = node.get_input_variable(node.inputs[input_node_idx]).shape
+        n_in = np.prod(input_shape)
+
+        scale_precision = None
+        scale_quantizer = None
+        bias_precision = None
+        bias_quantizer = None
+
+        op = node.attributes["op"]
+        if op in ('add', 'sum'):
+            scale = np.array(1)
+            bias = const_node.value
+            bias_precision = const_node.get_attr("quant_precision")
+            bias_quantizer = const_node.get_attr("quantizer")
+        elif op == 'sub':
+            if node1const:
+                scale = np.array(1)
+                bias = -const_node.value
+            else:
+                scale = np.array(-1)
+                bias = const_node.value
+            bias_precision = const_node.get_attr("quant_precision")
+            bias_quantizer = const_node.get_attr("quantizer")
+            if bias_precision and not bias_precision.signed:
+                # need to add a bit
+                bias_precision.signed = 1
+                bias_precision.width += 1
+                bias_precision.integer += 1
+                bias_quantizer = QuantNodeQuantizer(bias_precision)
+
+        elif op == 'mul':
+            scale = const_node.value
+            bias = np.array(0)
+            scale_precision = const_node.get_attr("quant_precision")
+            scale_quantizer = const_node.get_attr("quantizer")
+
+        attributes = {k: node.attributes.get(k, None) for k in _base_attributes}
+        attributes.update(
+            {
+                "scale_data": scale,
+                "bias_data": bias,
+                "n_in": n_in,
+                "n_out": n_in,
+                "n_filt": -1,
+                "scale_precision": scale_precision,
+                "scale_quantizer": scale_quantizer,
+                "bias_precision": bias_precision,
+                "bias_quantizer": bias_quantizer,
+            }
+        )
+
+        bn_layer = model.make_node(
+            BatchNormalization, f"bn_{node.name}", attributes, [node.inputs[input_node_idx]], [x for x in node.outputs]
+        )
+
+        model.remove_node(const_node, rewire=False)
+        model.replace_node(node, bn_layer)
+
+        return True
+
+
+class MergeToBatchNormalizationDiv(OptimizerPass):
+    """
+    Convert Div Merges with a constant to BatchNormalization
+
+    TODO: propagate precision
+    """
+
+    def match(self, node):
+        is_match = (
+            isinstance(node, Merge)
+            and node.attributes["op"] == 'div'
+            and isinstance(node.get_input_node(node.inputs[1]), Constant)
+        )  # only second can be const
+
+        return is_match
+
+    def transform(self, model, node):
+        input_shape = node.get_input_variable().shape
+        n_in = np.prod(input_shape)
+        const_node = node.get_input_node(node.inputs[1])
+        scale = 1 / const_node.value
+        bias = np.array(0)
+
+        attributes = {k: node.attributes.get(k, None) for k in _base_attributes}
+        attributes.update({"scale_data": scale, "bias_data": bias, "n_in": n_in, "n_out": n_in, "n_filt": -1})
+
+        bn_layer = model.make_node(
+            "BatchNormalization", f"bn_{node.name}", attributes, [node.inputs[0]], [x for x in node.outputs]
+        )
+
+        model.remove_node(const_node, rewire=False)
+        model.replace_node(node, bn_layer)
+
+        return True
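MergeToBatchNormalizationDiv works because dividing by a constant is the same affine map as scaling by its reciprocal; a one-line sanity check under assumed values:

    import numpy as np

    x = np.array([2.0, 8.0])
    c = np.array([4.0])  # the constant divisor
    assert np.allclose(x / c, (1 / c) * x + 0.0)  # scale = 1/c, bias = 0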
diff --git a/hls4ml/model/optimizer/passes/move_scales.py b/hls4ml/model/optimizer/passes/move_scales.py
new file mode 100644
index 0000000000..e97fd89947
--- /dev/null
+++ b/hls4ml/model/optimizer/passes/move_scales.py
@@ -0,0 +1,301 @@
+'''
+This file includes optimizations related to moving the ApplyAlphas across MatMul and Conv nodes.
+
+TODO: Check that biases are properly handled. (Attempt to do it via Merge)
+
+'''
+import numpy as np
+
+from hls4ml.model.layers import ApplyAlpha, Constant, Conv, MatMul, Merge
+from hls4ml.model.optimizer import OptimizerPass
+
+
+class ScaleDownMatMul(OptimizerPass):
+    '''Shift an ApplyAlpha below a MatMul'''
+
+    def match(self, node):
+        '''
+        Check to see if we have a MatMul with at least one input ApplyAlpha.
+        Note, if both inputs are ApplyAlphas, this optimization runs twice.
+        '''
+        is_match = (
+            isinstance(node, MatMul)
+            and len(node.inputs) == 2
+            and (
+                isinstance(node.get_input_node(node.inputs[0]), ApplyAlpha)
+                or isinstance(node.get_input_node(node.inputs[1]), ApplyAlpha)
+            )
+        )
+        return is_match
+
+    def transform(self, model, node):
+        # determine input with ApplyAlpha. If both, first propagate apply alpha associated with a constant
+        is_aa = [False, False]
+        from_const = [False, False]
+        inp = [node.get_input_node(node.inputs[0]), node.get_input_node(node.inputs[1])]
+        for i in range(2):
+            if isinstance(inp[i], ApplyAlpha):
+                is_aa[i] = True
+                from_const[i] = isinstance(inp[i].get_input_node(inp[i].inputs[0]), Constant)
+
+        # prefer alpha from constant
+        if from_const[0]:
+            alpha_idx = 0
+        elif from_const[1]:
+            alpha_idx = 1
+        elif is_aa[0]:
+            alpha_idx = 0
+        else:
+            alpha_idx = 1  # is_aa[1] must be true
+
+        apply_alpha = inp[alpha_idx]
+        other_idx = 0 if alpha_idx else 1
+
+        # Check if we can move
+        scale = apply_alpha.weights['scale'].data_unquantized
+        bias = apply_alpha.weights['bias'].data_unquantized
+
+        scale1d = np.ravel(scale)
+        if (scale1d[0] == scale).all():
+            # scalar scale
+            scale = np.array(scale1d[0])
+
+        bias1d = np.ravel(bias)
+        if (bias1d[0] == bias).all():
+            # scalar bias
+            bias = np.array(bias1d[0])
+
+        output = node.get_output_variable()
+
+        can_propagate = False
+        if not bias.shape and bias == 0:
+            # zero bias, propagate through, if possible
+            # (always possible if scale is scalar)
+            try:
+                np.broadcast_to(scale, output.shape)  # check size compatibility
+                newscale = scale
+                newbias = np.array(0)
+                can_propagate = True
+            except ValueError:
+                can_propagate = False
+
+        # if did not succeed in propagating, try again
+        if not can_propagate and isinstance(inp[other_idx], Constant):
+            # can handle nonzero bias in some cases if other value is a Constant
+            try:
+                np.broadcast_to(scale, output.shape)  # check size compatibility
+                newscale = scale
+                newbias = inp[other_idx].value * bias
+                np.broadcast_to(newbias, output.shape)
+                can_propagate = True
+            except ValueError:
+                can_propagate = False
+
+        if not can_propagate:
+            return False
+
+        model.remove_node(apply_alpha)
+
+        new_node = model.make_node('ApplyAlpha', apply_alpha.name, apply_alpha.attributes, [x for x in node.outputs])
+        new_node.add_weights(newscale)
+        new_node.add_bias(newbias)
+        model.insert_node(new_node)
+        return True
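The legality check in ScaleDownMatMul rests on a scalar scale with zero bias commuting with matrix multiplication; a quick standalone check with hypothetical shapes:

    import numpy as np

    rng = np.random.default_rng(0)
    x = rng.normal(size=(4, 3))  # activations
    w = rng.normal(size=(3, 2))  # constant weights
    s = 0.25                     # scalar scale, zero bias

    # scaling before or after the MatMul is equivalent, so the ApplyAlpha
    # can be pushed below the node
    assert np.allclose((s * x) @ w, s * (x @ w))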
+class ScaleDownAdd(OptimizerPass):
+    '''Shift an identical ApplyAlpha below a Merge (Add)'''
+
+    def match(self, node):
+        '''Check to see if we have an add with two ApplyAlphas with identical scale'''
+        is_match = isinstance(node, Merge) and len(node.inputs) == 2 and node.attributes["op"] == "add"
+        if is_match:
+            in0 = node.get_input_node(node.inputs[0])
+            in1 = node.get_input_node(node.inputs[1])
+            is_match = (
+                isinstance(in0, ApplyAlpha)
+                and isinstance(in1, ApplyAlpha)
+                and (in0.weights['scale'].data_unquantized == in1.weights['scale'].data_unquantized).all()
+            )
+        return is_match
+
+    def transform(self, model, node):
+        in0 = node.get_input_node(node.inputs[0])
+        in1 = node.get_input_node(node.inputs[1])
+
+        # Check if we can move
+        scale = in0.weights['scale'].data_unquantized
+        bias0 = in0.weights['bias'].data_unquantized
+        bias1 = in1.weights['bias'].data_unquantized
+        try:
+            bias = bias0 + bias1
+        except ValueError:
+            return False
+
+        model.remove_node(in0)
+        model.remove_node(in1)
+
+        new_node = model.make_node('ApplyAlpha', in0.name, in0.attributes, [x for x in node.outputs])
+        new_node.add_weights(scale)
+        new_node.add_bias(bias)
+        model.insert_node(new_node)
+        return True
+
+
+class ScaleDownConv(OptimizerPass):
+    '''Shift an ApplyAlpha on the data input below a Conv'''
+
+    def match(self, node):
+        '''Match a Conv whose data input comes from an ApplyAlpha'''
+        is_match = isinstance(node, Conv) and isinstance(node.get_input_node(node.inputs[0]), ApplyAlpha)
+
+        return is_match
+
+    def transform(self, model, node):
+        apply_alpha = node.get_input_node(node.inputs[0])
+
+        # Check if we can move
+        scale = apply_alpha.weights['scale'].data_unquantized
+        bias = apply_alpha.weights['bias'].data_unquantized
+
+        scale1d = np.ravel(scale)
+        if (scale1d[0] == scale).all():
+            # scalar scale
+            scale = np.array(scale1d[0])
+
+        bias1d = np.ravel(bias)
+        if (bias1d[0] == bias).all():
+            # scalar bias
+            bias = np.array(bias1d[0])
+
+        output = node.get_output_variable()
+
+        can_propagate = False
+        if not bias.shape and bias == 0:
+            # zero bias, propagate through, if possible
+            # (always possible if scale is scalar)
+            try:
+                np.broadcast_to(scale, output.shape)  # check broadcastable
+                newscale = scale
+                newbias = np.array(0)
+                can_propagate = True
+            except ValueError:
+                can_propagate = False
+
+        if not can_propagate:
+            return False
+
+        model.remove_node(apply_alpha)
+
+        new_node = model.make_node('ApplyAlpha', apply_alpha.name, apply_alpha.attributes, [x for x in node.outputs])
+        new_node.add_weights(newscale)
+        new_node.add_bias(newbias)
+        model.insert_node(new_node)
+        return True
+
+
+class ScaleDownWeightConv(OptimizerPass):
+    '''Shift an ApplyAlpha on the weight input below a Conv'''
+
+    def match(self, node):
+        '''Match a Conv whose weight input comes from an ApplyAlpha'''
+        is_match = (
+            isinstance(node, Conv) and len(node.inputs) > 1 and isinstance(node.get_input_node(node.inputs[1]), ApplyAlpha)
+        )
+
+        return is_match
+
+    def transform(self, model, node):
+        apply_alpha = node.get_input_node(node.inputs[1])
+
+        # Check if we can move
+        scale = apply_alpha.weights['scale'].data_unquantized
+        bias = apply_alpha.weights['bias'].data_unquantized
+
+        scale1d = np.ravel(scale)
+        if (scale1d[0] == scale).all():
+            # scalar scale
+            scale = np.array(scale1d[0])
+
+        bias1d = np.ravel(bias)
+        if (bias1d[0] == bias).all():
+            # scalar bias
+            bias = np.array(bias1d[0])
+
+        output = node.get_output_variable()
+
+        can_propagate = False
+        if not bias.shape and bias == 0:
+            # zero bias, propagate through, if possible
+            # (always possible if scale is scalar)
+            try:
+                np.broadcast_to(scale, output.shape)  # make sure broadcastable
+                newscale = scale
+                newbias = np.array(0)
+                can_propagate = True
+            except ValueError:
+                can_propagate = False
+
+        if not can_propagate:
+            return False
+
+        model.remove_node(apply_alpha)
+
+        new_node = model.make_node('ApplyAlpha', apply_alpha.name, apply_alpha.attributes, [x for x in node.outputs])
+        new_node.add_weights(newscale)
+        new_node.add_bias(newbias)
+        model.insert_node(new_node)
+        return True
+
+
+class ScaleDownBiasConv(OptimizerPass):
+    '''Shift an ApplyAlpha on the bias input below a Conv'''
+
+    def match(self, node):
+        '''Match a Conv whose bias input comes from an ApplyAlpha'''
+        is_match = (
+            isinstance(node, Conv) and len(node.inputs) > 2 and isinstance(node.get_input_node(node.inputs[2]), ApplyAlpha)
+        )
+
+        return is_match
+
+    def transform(self, model, node):
+        apply_alpha = node.get_input_node(node.inputs[2])
+
+        # Check if we can move
+        scale = apply_alpha.weights['scale'].data_unquantized
+        bias = apply_alpha.weights['bias'].data_unquantized
+
+        scale1d = np.ravel(scale)
+        if (scale1d[0] == scale).all():
+            # scalar scale
+            scale = np.array(scale1d[0])
+
+        bias1d = np.ravel(bias)
+        if (bias1d[0] == bias).all():
+            # scalar bias
+            bias = np.array(bias1d[0])
+
+        output = node.get_output_variable()
+
+        can_propagate = False
+        if not scale.shape and scale == 1:
+            # No scale, just additional bias
+            try:
+                np.broadcast_to(bias, output.shape)
+                newscale = np.array(1)
+                newbias = bias
+                can_propagate = True
+            except ValueError:
+                can_propagate = False
+
+        if not can_propagate:
+            return False
+
+        model.remove_node(apply_alpha)
+
+        new_node = model.make_node('ApplyAlpha', apply_alpha.name, apply_alpha.attributes, [x for x in node.outputs])
+        new_node.add_weights(newscale)
+        new_node.add_bias(newbias)
+        model.insert_node(new_node)
+        return True
diff --git a/hls4ml/model/optimizer/passes/propagate_conv_precision.py b/hls4ml/model/optimizer/passes/propagate_conv_precision.py
new file mode 100644
index 0000000000..17e357df88
--- /dev/null
+++ b/hls4ml/model/optimizer/passes/propagate_conv_precision.py
@@ -0,0 +1,77 @@
+import math  # prefer to use math.ceil for scalar values (returns int)
+
+import numpy as np
+
+from hls4ml.model.layers import Conv1D, Conv2D
+from hls4ml.model.optimizer import OptimizerPass
+from hls4ml.model.types import FixedPrecisionType, NamedType
+
+
+class PropagateConvPrecision(OptimizerPass):
+    """Propagate precision for conv nodes. Restrict it to only cases where
+    the precision is set by a quant node, since otherwise the values get huge.
+    """
+
+    def match(self, node):
+        is_match = isinstance(node, (Conv1D, Conv2D))
+        return is_match
+
+    def transform(self, model, node):
+        input_precision = node.get_input_node().get_attr("quant_precision")
+        weight_precision = node.get_attr("weight_precision")
+        if not input_precision or not weight_precision:
+            return False
+
+        bias_precision = node.get_attr("bias_precision")
+        num_feature_maps = node.weights['weight'].data_unquantized.shape[-1]
+        filt_width = node.get_attr('filt_width')
+        filt_height = node.get_attr('filt_height', 1)
+
+        accum_precision = _propagate_type_conv(
+            input_precision,
+            weight_precision,
+            bias_precision,
+            num_feature_maps=num_feature_maps,
+            filt_width=filt_width,
+            filt_height=filt_height,
+        )
+
+        accum_t = NamedType(f'layer{node.index}_accum_t', accum_precision)
+        node.set_attr('accum_t', accum_t)
+
+        if not node.get_attr("quant_precision"):
+            # output precision not explicitly set by quant node
+            node.update_output_precision(accum_precision)
+
+        return False
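To make the accumulator sizing in _propagate_type_conv concrete, a worked instance with assumed operand types (ap_fixed<8,3> inputs, ap_fixed<6,1> weights, a 3x3 kernel over 16 channels):

    import math

    in_width, in_integer = 8, 3
    w_width, w_integer = 6, 1
    n_acc = 3 * 3 * 16                             # filt_height * filt_width * num_feature_maps

    growth = math.ceil(math.log2(n_acc))           # 8 extra bits cover the 144-term sum
    acc_width = in_width + w_width + growth        # 22
    acc_integer = in_integer + w_integer + growth  # 12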
+
+
+def _propagate_type_conv(input_precision, weight_precision, bias_precision, num_feature_maps, filt_width, filt_height):
+    '''
+    Propagate the precision type across a multiply. Rounding modes are propagated from input_precision
+    '''
+
+    Nacc = filt_width * filt_height * num_feature_maps
+    bitwidth = weight_precision.width + input_precision.width + math.ceil(np.log2(Nacc))
+    integer = weight_precision.integer + input_precision.integer + math.ceil(np.log2(Nacc))
+    signed = weight_precision.signed or input_precision.signed
+
+    # Because we are calculating precision, there is no need to round or saturate
+    rounding_mode = None
+    saturation_mode = None
+
+    frac = bitwidth - integer
+
+    # correct for bias
+    if bias_precision:
+        integer = (
+            max(
+                integer + (bias_precision.signed and not signed),
+                bias_precision.integer + (signed and not bias_precision.signed),
+            )
+            + 1
+        )
+        bitwidth = integer + max(frac, bias_precision.width - bias_precision.integer)
+        signed = signed or bias_precision.signed
+
+    return FixedPrecisionType(bitwidth, integer, signed, rounding_mode, saturation_mode)
diff --git a/hls4ml/model/optimizer/passes/propagate_dense_precision.py b/hls4ml/model/optimizer/passes/propagate_dense_precision.py
new file mode 100644
index 0000000000..cc50bb7553
--- /dev/null
+++ b/hls4ml/model/optimizer/passes/propagate_dense_precision.py
@@ -0,0 +1,70 @@
+import math  # prefer to use math.ceil for scalar values (returns int)
+
+import numpy as np
+
+from hls4ml.model.layers import Dense
+from hls4ml.model.optimizer import OptimizerPass
+from hls4ml.model.types import FixedPrecisionType, NamedType
+
+
+class PropagateDensePrecision(OptimizerPass):
+    """
+    Propagate precision for Dense nodes. Restrict it to only cases where
+    the precision is set by a quant node, since otherwise the values get huge.
+    """
+
+    def match(self, node):
+        is_match = isinstance(node, Dense)
+        return is_match
+
+    def transform(self, model, node):
+        input_precision = node.get_input_node().get_attr("quant_precision")
+        weight_precision = node.get_attr("weight_precision")
+        if not input_precision or not weight_precision:
+            return False
+
+        bias_precision = node.get_attr("bias_precision")
+        input_variable = node.get_input_variable()
+        num_acc = input_variable.shape[-1]
+
+        accum_precision = _propagate_type_dense(input_precision, weight_precision, bias_precision, num_acc)
+
+        accum_t = NamedType(f'layer{node.index}_accum_t', accum_precision)
+        node.set_attr('accum_t', accum_t)
+
+        if not node.get_attr("quant_precision"):
+            # output precision not set by quant node
+            node.update_output_precision(accum_precision)
+
+        return False
+
+
+def _propagate_type_dense(input_precision, weight_precision, bias_precision, num_acc):
+    '''
+    Propagate the precision type across a multiply.
Rounding modes are propagated from input_precision
+    '''
+
+    # check to make sure none are None
+    bitwidth = weight_precision.width + input_precision.width + math.ceil(np.log2(num_acc))
+    integer = weight_precision.integer + input_precision.integer + math.ceil(np.log2(num_acc))
+    signed = weight_precision.signed or input_precision.signed
+
+    # Because we are calculating precision, there is no need to round or saturate
+    rounding_mode = None
+    saturation_mode = None
+
+    frac = bitwidth - integer
+
+    # correct for bias
+    if bias_precision:
+        integer = (
+            max(
+                integer + (bias_precision.signed and not signed),
+                bias_precision.integer + (signed and not bias_precision.signed),
+            )
+            + 1
+        )
+        bitwidth = integer + max(frac, bias_precision.width - bias_precision.integer)
+        signed = signed or bias_precision.signed
+
+    return FixedPrecisionType(bitwidth, integer, signed, rounding_mode, saturation_mode)
diff --git a/hls4ml/model/optimizer/passes/qkeras.py b/hls4ml/model/optimizer/passes/qkeras.py
index 2d2b6b0f77..7bed6cb1e7 100644
--- a/hls4ml/model/optimizer/passes/qkeras.py
+++ b/hls4ml/model/optimizer/passes/qkeras.py
@@ -1,7 +1,7 @@
 import numpy as np
 import tensorflow as tf

-from hls4ml.model.layers import ApplyAlpha, BatchNormalization
+from hls4ml.model.layers import ApplyAlpha
 from hls4ml.model.optimizer import ConfigurableOptimizerPass, OptimizerPass, register_pass
 from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType, QKerasPO2Quantizer

@@ -81,7 +81,6 @@ def register_qkeras():
     register_pass('output_rounding_saturation_mode', OutputRoundingSaturationMode)
     register_pass('qkeras_factorize_alpha', QKerasFactorizeAlpha)
     register_pass('extract_ternary_threshold', ExtractTernaryThreshold)
-    register_pass('fuse_consecutive_batch_normalization', FuseConsecutiveBatchNormalization)


 class QKerasFactorizeAlpha(OptimizerPass):
@@ -181,38 +180,6 @@ def transform(self, model, node):
         return True


-class FuseConsecutiveBatchNormalization(OptimizerPass):
-    '''OptimizerPass to merge consecutive BatchNormalization layers.
-    These may exist in a model after QKerasFactorizeAlpha layer.
-    Scale and Bias of each layer are combined into scale and bias of a single layer.
-    '''
-
-    def match(self, node):
-        return isinstance(node, BatchNormalization) and isinstance(node.get_input_node(), BatchNormalization)
-
-    def transform(self, model, node):
-        bn0 = node.get_input_node()
-        bn1 = node
-        bn0_map = bn0.get_output_use_map()
-        bn1_map = bn1.get_output_use_map()
-        if len(bn0_map[bn0.name]) > 1 or len(bn1_map[bn1.name]) > 1:
-            return False
-
-        s0 = bn0.weights['scale'].data
-        b0 = bn0.weights['bias'].data
-        s1 = bn1.weights['scale'].data
-        b1 = bn1.weights['bias'].data
-
-        s2 = s0 * s1
-        b2 = s1 * b0 + b1
-
-        bn0.weights['scale'].data = s2
-        bn0.weights['bias'].data = b2
-
-        model.remove_node(node, rewire=True)
-        return True
-
-
 class ExtractTernaryThreshold(OptimizerPass):
     '''The input value (threshold) at which the output of a a ternary activation
     changes is configurable. This pass extracts that threshold point, inserting
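For orientation before the next file: a QONNX-style Quant node performs a fake-quantization round trip, and with unit scale and zero offset it collapses to round-and-saturate, i.e. a linear activation with a fixed-point output type. A rough, simplified sketch (narrow-range and rounding-mode details omitted):

    import numpy as np

    def fake_quant(x, scale, zeropt, bitwidth, signed=True):
        # scale onto the integer grid, round, clip to the representable range, rescale
        q = np.round(x / scale + zeropt)
        lo, hi = (-(2 ** (bitwidth - 1)), 2 ** (bitwidth - 1) - 1) if signed else (0, 2**bitwidth - 1)
        return (np.clip(q, lo, hi) - zeropt) * scale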
It is always the first step.
+
+The next step differs between the case of (1) unitary scale and zero offset, or (2) nonunitary scale and/or
+nonzero offset. In the first case no scaling is required, so a Quant node effectively becomes a linear activation.
+For the common case when this is applied on a constant weight, the activation is immediately merged with the weight,
+quantizing the weights. In case 2, we need to explicitly scale and unscale, so the Quant node becomes 3 nodes, an
+ApplyAlpha node to apply a scale/shift, a Linear node to apply the quantization, and another ApplyAlpha to unscale/shift.
+We depend on optimization steps to move the unscaling ApplyAlpha down as needed. Again, when the Quant is applied to a
+Constant, the scaling and Linear nodes are immediately merged into the Constant. This is done because it simplifies some
+of the other optimizations.
+
+UPDATE: Case 1 is loosened to also include power of 2 scalar scales, not just unitary scale, if
+    _ALSO_MATCH_PO2 is set to true (the default)
+
+'''
+import math  # prefer to use math.ceil for scalar values
+
+import numpy as np
+
+from hls4ml.converters.onnx.quantizer import QuantNodeQuantizer
+from hls4ml.model.layers import Activation, ApplyAlpha, Constant, Quant
+from hls4ml.model.optimizer import OptimizerPass
+from hls4ml.model.types import FixedPrecisionType
+
+_ALSO_MATCH_PO2 = True
+
+_base_attributes = ('Trace', 'reuse_factor')
+
+
+class QuantConstantParameters(OptimizerPass):
+    """Remove Constant from the Quant node parameters (but not input[0])"""
+
+    def match(self, node):
+        is_match = isinstance(node, Quant) and (
+            (node.get_input_node(node.inputs[1]) and isinstance(node.get_input_node(node.inputs[1]), Constant))
+            or (node.get_input_node(node.inputs[2]) and isinstance(node.get_input_node(node.inputs[2]), Constant))
+            or (node.get_input_node(node.inputs[3]) and isinstance(node.get_input_node(node.inputs[3]), Constant))
+        )
+
+        return is_match
+
+    def transform(self, model, node):
+        """
+        Remove Constant from the Quant node parameters (but not input[0])
+        """
+        if node.get_input_node(node.inputs[1]):
+            scale_node = node.get_input_node(node.inputs[1])
+            if isinstance(scale_node, Constant):
+                node.set_attr('scale', scale_node.value)
+                node.inputs[1] = ''
+                model.remove_node(scale_node, rewire=False)
+
+        if node.get_input_node(node.inputs[2]):
+            zeropt_node = node.get_input_node(node.inputs[2])
+            if isinstance(zeropt_node, Constant):
+                node.set_attr('zeropt', zeropt_node.value)
+                node.inputs[2] = ''
+                model.remove_node(zeropt_node, rewire=False)
+
+        if node.get_input_node(node.inputs[3]):
+            bitwidth_node = node.get_input_node(node.inputs[3])
+            if isinstance(bitwidth_node, Constant):
+                if np.squeeze(bitwidth_node.value).shape:
+                    raise RuntimeError("Only scalar bitwidth values are supported by the Quant node")
+                node.set_attr('bitwidth', bitwidth_node.value)
+                node.inputs[3] = ''
+                model.remove_node(bitwidth_node, rewire=False)
+
+        return True
+
+
+class QuantToActivation(OptimizerPass):
+    '''
+    This is for the case when scale is 1 and zeropt is 0. It is a 1:1 transformation of
+    a Quant to an Activation.
+
+    As an optimization, this is not called when the input is constant.
+
+    UPDATE: this is also called when scale is scalar and power of 2, not just 1.
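+
+    For example, a scalar scale of 0.25 with zeropt 0 also matches: np.frexp(0.25)
+    returns a mantissa of 0.5 (exponent -1), so transform() below folds the
+    power-of-2 scale into the fixed-point exponent instead of emitting ApplyAlpha
+    nodes. (Worked example; the values are illustrative.)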
+ ''' + + def match(self, node): + # only matches after the other inputs are already folded + + is_match = ( + isinstance(node, Quant) + and not isinstance(node.get_input_node(node.inputs[0]), Constant) + and not node.get_input_node(node.inputs[1]) + and not node.get_input_node(node.inputs[2]) + and not node.get_input_node(node.inputs[3]) + ) + + # Only match if the scale is 1s and the zero-point is 0s + if is_match: # to make sure this is a quant node with inputs + scale = node.get_attr("scale") + bias = node.get_attr("zeropt") + is_match = is_match and (bias == np.zeros_like(bias)).all() + + # check if scale is ones-like or a power of two + scale_unit_or_po2 = (scale == np.ones_like(scale)).all() + if not scale_unit_or_po2 and _ALSO_MATCH_PO2: + sqscale = np.squeeze(scale) + if not sqscale.shape: + # not an array + mantissa, _ = np.frexp(sqscale) + scale_unit_or_po2 = mantissa == 0.5 + + is_match = is_match and scale_unit_or_po2 + + return is_match + + def transform(self, model, node): + ''' + Change quant node to Activation + ''' + input_shape = node.get_input_variable().shape + + n_in = np.prod(input_shape) + + rounding_mode = node.get_attr("rounding_mode") + narrow = node.get_attr("narrow") + signed = node.get_attr("signed") + bitwidth = node.get_attr("bitwidth") + integer = bitwidth + scale = node.get_attr("scale") + if _ALSO_MATCH_PO2 and not (scale == np.ones_like(scale)).all(): + _, exp = np.frexp(np.squeeze(scale)) + integer = bitwidth + exp - 1 + + precision, quantizer = _calculate_precision_quantizer(bitwidth, integer, signed, narrow, rounding_mode) + + attributes = {k: node.attributes.get(k, None) for k in _base_attributes} + attributes.update({'activation': 'linear', 'quant_precision': precision, 'quantizer': quantizer, 'n_in': n_in}) + + new_node = model.make_node(Activation, f'{node.name}_act', attributes, [node.inputs[0]], [x for x in node.outputs]) + new_node.get_output_variable().type.precision = precision + model.replace_node(node, new_node) + + return True + + +class FuseQuantWithConstant(OptimizerPass): + ''' + This is for the case when scale is 1 and zeropt is 0. It directly applies the quantization to a constant. + UPDATE: this is also called when scale is scalar and power of 2, not just 1. + ''' + + def match(self, node): + # only matches after the other inputs are already folded + is_match = ( + isinstance(node, Quant) + and isinstance(node.get_input_node(node.inputs[0]), Constant) + and not node.get_input_node(node.inputs[1]) + and not node.get_input_node(node.inputs[2]) + and not node.get_input_node(node.inputs[3]) + ) + + # Only match if the scale is 1s and the zero-point is 0s + if is_match: # to make sure this is a quant node with inputs + scale = node.get_attr("scale") + bias = node.get_attr("zeropt") + is_match = is_match and (bias == np.zeros_like(bias)).all() + + # check if scale is ones-like or a power of two + scale_unit_or_po2 = (scale == np.ones_like(scale)).all() + if not scale_unit_or_po2 and _ALSO_MATCH_PO2: + sqscale = np.squeeze(scale) + if not sqscale.shape: + # not an array + mantissa, _ = np.frexp(sqscale) + scale_unit_or_po2 = mantissa == 0.5 + + is_match = is_match and scale_unit_or_po2 + + return is_match + + def transform(self, model, node): + ''' + Fuse Quant with Constant. 
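+        The Constant keeps its data; it only gains the quant_precision and
+        quantizer attributes, and initialize() is re-run below so the stored
+        values are quantized in place.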
+        '''
+
+        rounding_mode = node.get_attr("rounding_mode")
+        narrow = node.get_attr("narrow")
+        signed = node.get_attr("signed")
+        bitwidth = node.get_attr("bitwidth")
+        integer = bitwidth
+        scale = node.get_attr("scale")
+        if _ALSO_MATCH_PO2 and not (scale == np.ones_like(scale)).all():
+            _, exp = np.frexp(np.squeeze(scale))
+            integer = bitwidth + exp - 1
+
+        precision, quantizer = _calculate_precision_quantizer(bitwidth, integer, signed, narrow, rounding_mode)
+
+        const_node = node.get_input_node(node.inputs[0])
+        const_node.set_attr("quant_precision", precision)
+        const_node.set_attr("quantizer", quantizer)
+
+        # reinitialize (which also runs quantization if quantizer exists)
+        const_node.initialize()
+
+        # remove the Quant node
+        model.remove_node(node, rewire=True)
+
+        return True
+
+
+class QuantToAlphaActivationAlpha(OptimizerPass):
+    '''
+    This is for the case when scale is not 1 or zeropt is not 0. It is a 1:3 transformation of
+    a Quant to an ApplyAlpha (to scale), an Activation, and an ApplyAlpha (to rescale).
+
+    As an optimization, this is not called when the input is constant.
+    '''
+
+    def match(self, node):
+        # only matches after the other inputs are already folded
+        is_match = (
+            isinstance(node, Quant)
+            and not isinstance(node.get_input_node(node.inputs[0]), Constant)
+            and not node.get_input_node(node.inputs[1])
+            and not node.get_input_node(node.inputs[2])
+            and not node.get_input_node(node.inputs[3])
+        )
+
+        if is_match:  # to make sure this is a quant node with inputs
+            scale = node.get_attr("scale")
+            bias = node.get_attr("zeropt")
+            is_match = is_match and ((scale != np.ones_like(scale)).any() or (bias != np.zeros_like(bias)).any())
+        return is_match
+
+    def transform(self, model, node):
+        '''
+        Change quant node to ApplyAlpha, Activation, ApplyAlpha
+        '''
+
+        # Do the Activation as in the simple case
+
+        input_shape = node.get_input_variable().shape
+
+        n_in = np.prod(input_shape)
+
+        rounding_mode = node.get_attr("rounding_mode")
+        narrow = node.get_attr("narrow")
+        signed = node.get_attr("signed")
+        bitwidth = node.get_attr("bitwidth")
+
+        precision, quantizer = _calculate_precision_quantizer(bitwidth, bitwidth, signed, narrow, rounding_mode)
+
+        attributes = {k: node.attributes.get(k, None) for k in _base_attributes}
+        attributes.update({'activation': 'linear', 'quant_precision': precision, 'quantizer': quantizer, 'n_in': n_in})
+
+        new_node = model.make_node(Activation, f'{node.name}_act', attributes, [node.inputs[0]], [x for x in node.outputs])
+        new_node.get_output_variable().type.precision = precision
+        model.replace_node(node, new_node)
+
+        # but now add the ApplyAlphas before and after
+
+        scale = node.get_attr("scale")
+        bias = node.get_attr("zeropt")
+
+        attributes_scale = {k: node.attributes.get(k, None) for k in _base_attributes}
+        attributes_scale.update({'n_in': n_in, 'n_out': n_in, 'n_filt': -1})
+
+        attributes_rescale = {k: node.attributes.get(k, None) for k in _base_attributes}
+        attributes_rescale.update({'n_in': n_in, 'n_out': n_in, 'n_filt': -1})
+
+        firstscale = 1 / scale
+        firstbias = bias
+        attributes_scale["scale_data"] = firstscale
+        attributes_scale["bias_data"] = firstbias
+
+        scale_node = model.make_node(ApplyAlpha, node.name + '_scale', attributes_scale, [node.inputs[0]])
+        model.insert_node(scale_node)
+
+        rescale = scale
+        rebias = -bias * scale
+        attributes_rescale["scale_data"] = rescale
+        attributes_rescale["bias_data"] = rebias
+
+        rescale_node = model.make_node(ApplyAlpha, node.name + '_rescale', attributes_rescale, [new_node.outputs[0]])
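+        # Net effect of the three nodes: scale * quantize(x / scale + zeropt) - scale * zeropt,
+        # i.e. a quantize/dequantize pair around the zero point; later passes are
+        # expected to push this unscaling ApplyAlpha further downstream.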
+        model.insert_node(rescale_node)
+
+        return True
+
+
+class ConstQuantToConstAlpha(OptimizerPass):
+    '''
+    This is for the case when scale is not 1 or zeropt is not 0. It is a 1:3 transformation of
+    a Quant to an ApplyAlpha (to scale), Activation, ApplyAlpha (to unscale), but a constant
+    input allows for optimization, so the ApplyAlpha (to scale) and Activation are
+    optimized away right away.
+    '''
+
+    def match(self, node):
+        # only matches after the other inputs are already folded
+        is_match = (
+            isinstance(node, Quant)
+            and isinstance(node.get_input_node(node.inputs[0]), Constant)
+            and not node.get_input_node(node.inputs[1])
+            and not node.get_input_node(node.inputs[2])
+            and not node.get_input_node(node.inputs[3])
+        )
+
+        if is_match:  # to make sure this is a quant node with inputs
+            scale = node.get_attr("scale")
+            bias = node.get_attr("zeropt")
+            is_match = is_match and ((scale != np.ones_like(scale)).any() or (bias != np.zeros_like(bias)).any())
+        return is_match
+
+    def transform(self, model, node):
+        '''
+        Change Constant + Quant node to Constant, ApplyAlpha
+        '''
+
+        # Do the Activation as in the simple case
+
+        input_shape = node.get_input_variable().shape
+
+        n_in = np.prod(input_shape)
+
+        rounding_mode = node.get_attr("rounding_mode")
+        narrow = node.get_attr("narrow")
+        signed = node.get_attr("signed")
+        bitwidth = node.get_attr("bitwidth")
+
+        precision, quantizer = _calculate_precision_quantizer(bitwidth, bitwidth, signed, narrow, rounding_mode)
+
+        const_node = node.get_input_node(node.inputs[0])
+
+        scale = node.get_attr("scale")
+        bias = node.get_attr("zeropt")
+
+        # calculate the new value
+        new_val = const_node.value / scale + bias
+        const_node.set_attr('value', new_val)
+        const_node.set_attr("quant_precision", precision)
+        const_node.set_attr("quantizer", quantizer)
+
+        # reinitialize (which also runs quantization if quantizer exists)
+        const_node.initialize()
+
+        attributes_rescale = {k: node.attributes.get(k, None) for k in _base_attributes}
+        attributes_rescale.update({'n_in': n_in, 'n_out': n_in, 'n_filt': -1})
+
+        rescale = scale
+        rebias = -bias * scale
+        attributes_rescale["scale_data"] = rescale
+        attributes_rescale["bias_data"] = rebias
+
+        rescale_node = model.make_node(
+            ApplyAlpha, node.name + '_rescale', attributes_rescale, [x for x in node.inputs], [x for x in node.outputs]
+        )
+        model.replace_node(node, rescale_node)
+
+        return True
+
+
+def _calculate_precision_quantizer(bitwidth, integer, signed, narrow, rounding_mode):
+    '''
+    A function to determine the precision and quantizer
+    '''
+    if rounding_mode == "ROUND":
+        bn_round = "AP_RND_CONV"
+    elif rounding_mode == "FLOOR":
+        bn_round = "AP_TRN"
+    else:
+        raise NotImplementedError(
+            f"Rounding mode {rounding_mode} not supported in Quant node. Only ROUND and FLOOR supported."
+        )
+
+    if narrow and not signed:
+        raise NotImplementedError("Narrow mode is only supported for signed numbers.")
+
+    if narrow:
+        bn_sat = "AP_SAT_SYM"
+    else:
+        bn_sat = "AP_SAT"
+
+    bitwidth = math.ceil(bitwidth)
+    integer = math.ceil(integer)
+
+    precision = FixedPrecisionType(bitwidth, integer, signed, bn_round, bn_sat)
+    quantizer = QuantNodeQuantizer(precision)
+    return (precision, quantizer)
diff --git a/hls4ml/model/optimizer/passes/reshape_const.py b/hls4ml/model/optimizer/passes/reshape_const.py
new file mode 100644
index 0000000000..0012b2761e
--- /dev/null
+++ b/hls4ml/model/optimizer/passes/reshape_const.py
@@ -0,0 +1,27 @@
+from hls4ml.model.layers import Constant, Reshape
+from hls4ml.model.optimizer import OptimizerPass
+
+
+class ReshapeConstant(OptimizerPass):
+    """
+    ONNX has the target shape come as an input, not a parameter. This removes
+    the Constant node feeding the new-shape input. (Non-constant inputs are not supported.)
+    The constant value was already used; this is just a cleanup optimization.
+    """
+
+    def match(self, node):
+        is_match = isinstance(node, Reshape) and len(node.inputs) > 1 and node.get_input_node(node.inputs[1])
+
+        return is_match
+
+    def transform(self, model, node):
+        """
+        Remove Constant from new shape input. Note: the shape node was already used in initialize
+        """
+        shape_node = node.get_input_node(node.inputs[1])
+        node.inputs[1] = ''
+        if not isinstance(shape_node, Constant):
+            raise RuntimeError("Nonconstant shape inputs are not currently supported")
+        model.remove_node(shape_node, rewire=False)
+
+        return True

From 5cea82d1bf0b6b82c3302bda6c7f482d603d8937 Mon Sep 17 00:00:00 2001
From: Jovan Mitrevski
Date: Thu, 13 Jul 2023 11:49:57 -0500
Subject: [PATCH 023/272] snapshot that runs qonnx test, but gets incorrect
 results

---
 hls4ml/backends/fpga/fpga_backend.py          | 12 +++-
 hls4ml/model/layers.py                        |  5 ++
 hls4ml/model/optimizer/__init__.py            | 10 +++
 .../model/optimizer/passes/batchnorm_opt.py   |  2 +-
 hls4ml/model/optimizer/passes/merge_const.py  | 16 ++---
 ...recision.py => propagate_acc_precision.py} | 53 ++++++++++----
 .../passes/propagate_dense_precision.py       | 70 -------------------
 7 files changed, 74 insertions(+), 94 deletions(-)
 rename hls4ml/model/optimizer/passes/{propagate_conv_precision.py => propagate_acc_precision.py} (59%)
 delete mode 100644 hls4ml/model/optimizer/passes/propagate_dense_precision.py

diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py
index 97e458f7fd..27620b1949 100644
--- a/hls4ml/backends/fpga/fpga_backend.py
+++ b/hls4ml/backends/fpga/fpga_backend.py
@@ -25,6 +25,7 @@
     GlobalPooling1D,
     GlobalPooling2D,
     MatMul,
+    Merge,
     Pooling1D,
     Pooling2D,
     Quant,
@@ -76,7 +77,16 @@ def __init__(self, name):
             attrs.append(TypeAttribute('accum'))
             self.attribute_map[layer] = attrs
 
-        rf_layers = accum_layers + [BatchNormalization, Activation, Embedding, GarNet, GarNetStack, Quant, BatchNormOnnx]
+        rf_layers = accum_layers + [
+            BatchNormalization,
+            Activation,
+            Embedding,
+            GarNet,
+            GarNetStack,
+            Quant,
+            BatchNormOnnx,
+            Merge,
+        ]
 
         for layer in rf_layers:
             attrs = self.attribute_map.get(layer, [])
diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py
index bd465ff7b9..1922dcec8c 100644
--- a/hls4ml/model/layers.py
+++ b/hls4ml/model/layers.py
@@ -249,6 +249,11 @@ def add_output_variable(
 
         self.set_attr(out_name, out)
 
+    def update_output_precision(self, precision, output_name=None):
+        if output_name is None:
+            output_name = self.outputs[0]
+        self.variables[output_name].type.precision = precision
+
     def add_weights(self, quantizer=None, compression=False):
         self.add_weights_variable(
             name='weight', var_name='w{index}', data='weight', quantizer=quantizer, compression=compression
diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py
index db65370e40..38844992db 100644
--- a/hls4ml/model/optimizer/__init__.py
+++ b/hls4ml/model/optimizer/__init__.py
@@ -43,6 +43,16 @@
         'fuse_quant_with_constant',
         'quant_to_alpha_activation_alpha',
         'const_quant_to_const_alpha',
+        'batch_norm_onnx_constant_parameters',
+        'constant_batch_norm_fusion',
+        'merge_two_constants',
+        'scale_down_add',
+        'scale_down_mat_mul',
+        'scale_down_weight_conv',
+        'scale_down_bias_conv',
+        'scale_down_conv',
+        'merge_to_apply_alpha',
+        'merge_to_apply_alpha_div',
         'matmul_const_to_dense',
         'conv_to_conv_x_d',
         'output_rounding_saturation_mode',
diff --git a/hls4ml/model/optimizer/passes/batchnorm_opt.py b/hls4ml/model/optimizer/passes/batchnorm_opt.py
index a7b0c27209..b9c651fd8f 100644
--- a/hls4ml/model/optimizer/passes/batchnorm_opt.py
+++ b/hls4ml/model/optimizer/passes/batchnorm_opt.py
@@ -3,7 +3,7 @@
 from hls4ml.model.layers import BatchNormalization, BatchNormOnnx, Constant
 from hls4ml.model.optimizer import OptimizerPass
 
-_base_attributes = ('Trace', 'reuse_factor', 'n_in', 'n_filt')
+_base_attributes = ('Trace', 'reuse_factor', 'epsilon', 'n_in', 'n_filt')
 
 
 class BatchNormOnnxConstantParameters(OptimizerPass):
diff --git a/hls4ml/model/optimizer/passes/merge_const.py b/hls4ml/model/optimizer/passes/merge_const.py
index 4e339ccc3f..da70eb55f3 100644
--- a/hls4ml/model/optimizer/passes/merge_const.py
+++ b/hls4ml/model/optimizer/passes/merge_const.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 from hls4ml.converters.onnx.quantizer import QuantNodeQuantizer
-from hls4ml.model.layers import BatchNormalization, Constant, Merge
+from hls4ml.model.layers import ApplyAlpha, Constant, Merge
 from hls4ml.model.optimizer import OptimizerPass
 
 _base_attributes = ('Trace', 'reuse_factor', 'n_in')
@@ -69,8 +69,8 @@ def transform(self, model, node):
         return True
 
 
-class MergeToBatchNormalization(OptimizerPass):
-    """Convert Add, Sub, Mul, or Div Merges with consant to BatchNormalization"""
+class MergeToApplyAlpha(OptimizerPass):
+    """Convert Add, Sub, Mul, or Div Merges with constant to ApplyAlpha"""
 
     def match(self, node):
         is_match = (
@@ -147,7 +147,7 @@ def transform(self, model, node):
         )
 
         bn_layer = model.make_node(
-            BatchNormalization, f"bn_{node.name}", attributes, [node.inputs[input_node_idx]], [x for x in node.outputs]
+            ApplyAlpha, f"bn_{node.name}", attributes, [node.inputs[input_node_idx]], [x for x in node.outputs]
         )
 
         model.remove_node(const_node, rewire=False)
@@ -156,9 +156,9 @@ def transform(self, model, node):
         return True
 
 
-class MergeToBatchNormalizationDiv(OptimizerPass):
+class MergeToApplyAlphaDiv(OptimizerPass):
     """
-    Convert Div Merges with consant to BatchNormalization
+    Convert Div Merges with constant to ApplyAlpha
 
     TODO: propagate precision
     """
@@ -182,9 +182,7 @@ def transform(self, model, node):
         attributes = {k: node.attributes.get(k, None) for k in _base_attributes}
         attributes.update({"scale_data": scale, "bias_data": bias, "n_in": n_in, "n_out": n_in, "n_filt": -1})
 
-        bn_layer = model.make_node(
-            "BatchNormalization", f"bn_{node.name}", attributes, [node.inputs[0]], [x for x in node.outputs]
-        )
+        bn_layer = model.make_node(ApplyAlpha, f"bn_{node.name}", attributes, [node.inputs[0]], [x for x in node.outputs])
 
         model.remove_node(const_node, rewire=False)
         model.replace_node(node, bn_layer)
diff --git a/hls4ml/model/optimizer/passes/propagate_conv_precision.py b/hls4ml/model/optimizer/passes/propagate_acc_precision.py
similarity index 59%
rename from hls4ml/model/optimizer/passes/propagate_conv_precision.py
rename to hls4ml/model/optimizer/passes/propagate_acc_precision.py
index 17e357df88..6c1facc23b 100644
--- a/hls4ml/model/optimizer/passes/propagate_conv_precision.py
+++ b/hls4ml/model/optimizer/passes/propagate_acc_precision.py
@@ -2,11 +2,43 @@
 
 import numpy as np
 
-from hls4ml.model.layers import Conv1D, Conv2D
+from hls4ml.model.layers import Conv1D, Conv2D, Dense
 from hls4ml.model.optimizer import OptimizerPass
 from hls4ml.model.types import FixedPrecisionType, NamedType
 
 
+class PropagateDensePrecision(OptimizerPass):
+    """
+    Propagate precision for Dense nodes. Restrict it to only cases where
+    the precision is set by a quant node, since otherwise the values get huge.
+    """
+
+    def match(self, node):
+        is_match = isinstance(node, Dense)
+        return is_match
+
+    def transform(self, model, node):
+        input_precision = node.get_input_node().get_attr("quant_precision")
+        weight_precision = node.get_attr("weight_precision")
+        if not input_precision or not weight_precision:
+            return False
+
+        bias_precision = node.get_attr("bias_precision")
+        input_variable = node.get_input_variable()
+        num_acc = input_variable.shape[-1]
+
+        accum_precision = _propagate_type_acc(input_precision, weight_precision, bias_precision, num_acc)
+
+        accum_t = NamedType(f'layer{node.index}_accum_t', accum_precision)
+        node.set_attr('accum_t', accum_t)
+
+        if not node.get_attr("quant_precision"):
+            # output precision not set by quant node
+            node.update_output_precision(accum_precision)
+
+        return False
+
+
 class PropagateConvPrecision(OptimizerPass):
    """Propagate precision for conv nodes. Restrict it to only cases where
     the precision is set by a quant node, since otherwise the values get huge.
@@ -27,14 +59,9 @@ def transform(self, model, node):
         filt_width = node.get_attr('filt_width')
         filt_height = node.get_attr('filt_height', 1)
 
-        accum_precision = _propagate_type_conv(
-            input_precision,
-            weight_precision,
-            bias_precision,
-            num_feature_maps=num_feature_maps,
-            filt_width=filt_width,
-            filt_height=filt_height,
-        )
+        num_acc = filt_width * filt_height * num_feature_maps
+
+        accum_precision = _propagate_type_acc(input_precision, weight_precision, bias_precision, num_acc)
 
         accum_t = NamedType(f'layer{node.index}_accum_t', accum_precision)
         node.set_attr('accum_t', accum_t)
@@ -46,14 +73,14 @@ def transform(self, model, node):
         return False
 
 
-def _propagate_type_conv(input_precision, weight_precision, bias_precision, num_feature_maps, filt_width, filt_height):
+def _propagate_type_acc(input_precision, weight_precision, bias_precision, num_acc):
     '''
     Propagate the precision type across a multiply.
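+
+    num_acc is the number of products summed: n_in for Dense, and
+    filt_height * filt_width * n_chan for Conv layers. For example (illustrative
+    values), ap_fixed<8,3> inputs times ap_fixed<6,2> weights with num_acc = 64
+    need width = 8 + 6 + ceil(log2(64)) = 20 and integer = 3 + 2 + 6 = 11,
+    before the bias correction below.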
 Rounding modes are propagated from input_precision
     '''
 
-    Nacc = filt_width * filt_height * num_feature_maps
-    bitwidth = weight_precision.width + input_precision.width + math.ceil(np.log2(Nacc))
-    integer = weight_precision.integer + input_precision.integer + math.ceil(np.log2(Nacc))
+    # check to make sure none are None
+    bitwidth = weight_precision.width + input_precision.width + math.ceil(np.log2(num_acc))
+    integer = weight_precision.integer + input_precision.integer + math.ceil(np.log2(num_acc))
     signed = weight_precision.signed or input_precision.signed
 
     # Because this is only a precision calculation, no rounding or saturation is needed
     rounding_mode = None
     saturation_mode = None
diff --git a/hls4ml/model/optimizer/passes/propagate_dense_precision.py b/hls4ml/model/optimizer/passes/propagate_dense_precision.py
deleted file mode 100644
index cc50bb7553..0000000000
--- a/hls4ml/model/optimizer/passes/propagate_dense_precision.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import math  # prefer to use math.ceil for scalar values (returns int)
-
-import numpy as np
-
-from hls4ml.model.layers import Dense
-from hls4ml.model.optimizer import OptimizerPass
-from hls4ml.model.types import FixedPrecisionType, NamedType
-
-
-class PropagateDensePrecision(OptimizerPass):
-    """
-    Propagate precision for Dense nodes. Restrict it to only cases where
-    the precision is set by a quant node, since otherwise the values get huge.
-    """
-
-    def match(self, node):
-        is_match = isinstance(node, Dense)
-        return is_match
-
-    def transform(self, model, node):
-        input_precision = node.get_input_node().get_attr("quant_precision")
-        weight_precision = node.get_attr("weight_precision")
-        if not input_precision or not weight_precision:
-            return False
-
-        bias_precision = node.get_attr("bias_precision")
-        input_variable = node.get_input_variable()
-        num_acc = input_variable.shape[-1]
-
-        accum_precision = _propagate_type_dense(input_precision, weight_precision, bias_precision, num_acc)
-
-        accum_t = NamedType(f'layer{node.index}_accum_t', accum_precision)
-        node.set_attr('accum_t', accum_t)
-
-        if not node.get_attr("quant_precision"):
-            # output precision not set by quant node
-            node.update_output_precision(accum_precision)
-
-        return False
-
-
-def _propagate_type_dense(input_precision, weight_precision, bias_precision, num_acc):
-    '''
-    Propagate the precision type across a multiply.
-    Rounding modes are propagated from input_precision
-    '''
-
-    # check to make sure none are None
-    bitwidth = weight_precision.width + input_precision.width + math.ceil(np.log2(num_acc))
-    integer = weight_precision.integer + input_precision.integer + math.ceil(np.log2(num_acc))
-    signed = weight_precision.signed or input_precision.signed
-
-    # Because this is only a precision calculation, no rounding or saturation is needed
-    rounding_mode = None
-    saturation_mode = None
-
-    frac = bitwidth - integer
-
-    # correct for bias
-    if bias_precision:
-        integer = (
-            max(
-                integer + (bias_precision.signed and not signed),
-                bias_precision.integer + (signed and not bias_precision.signed),
-            )
-            + 1
-        )
-        bitwidth = integer + max(frac, bias_precision.width - bias_precision.integer)
-        signed = signed or bias_precision.signed
-
-    return FixedPrecisionType(bitwidth, integer, signed, rounding_mode, saturation_mode)

From d5394d4e59046daa6069ca22c8e3aec9ad934db7 Mon Sep 17 00:00:00 2001
From: Jovan Mitrevski
Date: Thu, 13 Jul 2023 11:55:33 -0500
Subject: [PATCH 024/272] add quant node quantizer

---
 hls4ml/converters/onnx/quantizer.py | 97 +++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 hls4ml/converters/onnx/quantizer.py

diff --git a/hls4ml/converters/onnx/quantizer.py b/hls4ml/converters/onnx/quantizer.py
new file mode 100644
index 0000000000..7f69652c04
--- /dev/null
+++ b/hls4ml/converters/onnx/quantizer.py
@@ -0,0 +1,97 @@
+"""
+Quantizer for the Quant node, after scale and zeropoint have been extracted
+(unless the scale is a power of 2 and the special po2 handling is enabled)
+
+This is based on the sample implementation in finn-base
+"""
+
+import numpy as np
+
+from hls4ml.model.types import Quantizer, RoundingMode, SaturationMode
+
+
+class QuantNodeQuantizer(Quantizer):
+    """This implements a quantizer for a FixedPrecisionType with width==integer"""
+
+    def __init__(self, precision):
+        super().__init__(precision.width, precision)
+
+    def __call__(self, data):
+        """Apply the quantization on the data"""
+
+        scale = 2 ** (self.hls_type.width - self.hls_type.integer)
+
+        data = data * scale  # (not using *= to avoid modifying data)
+        # Clamping
+        min_int_val = self._min_int(self.hls_type.signed, self.hls_type.saturation_mode, self.bits)
+        max_int_val = self._max_int(self.hls_type.signed, self.bits)
+        data = np.where(data > max_int_val, max_int_val, data)
+        data = np.where(data < min_int_val, min_int_val, data)
+        # Rounding
+        rounding_fx = self._resolve_rounding_mode(self.hls_type.rounding_mode)
+        return rounding_fx(data) / scale
+
+    @staticmethod
+    def _min_int(signed: bool, saturation_mode: str, bit_width: int) -> int:
+        """Compute the minimum integer representable by a given number of bits.
+        Args:
+            signed (bool): Indicates whether the represented integer is signed or not.
+            saturation_mode (bool): Indicates the saturation mode used (AP_SAT_SYM or AP_SAT)
+            bit_width (int): Number of bits available for the representation.
+        Returns:
+            int: Minimum integer that can be represented according to
+            the input arguments.
+        Examples:
+            >>> min_int(signed=True, saturation_mode='AP_SAT_SYM', bit_width=8)
+            int(-127)
+            >>> min_int(signed=False, saturation_mode='AP_SAT_SYM', bit_width=8)
+            int(0)
+            >>> min_int(signed=True, saturation_mode='AP_SAT', bit_width=8)
+            int(-128)
+            >>> min_int(signed=False, saturation_mode='AP_SAT', bit_width=8)
+            int(0)
+        """
+        if saturation_mode not in (SaturationMode.SAT_SYM, SaturationMode.SAT):
+            raise ValueError(f"Saturation mode {saturation_mode} not supported.
Only AP_SAT_SYM, AP_SAT supported") + if signed and saturation_mode == SaturationMode.SAT_SYM: + value = -(2 ** (bit_width - 1)) + 1 + elif signed: + value = -(2 ** (bit_width - 1)) + else: + value = 0 + return value + + @staticmethod + def _max_int(signed: bool, bit_width: int) -> int: + """Compute the maximum integer representable by a given number of bits. + (Note, narrow and unsigned is not supported by the implementation, so saturation mode is not used) + Args: + signed (bool): Indicates whether the represented integer is signed or not. + bit_width (int): Number of bits available for the representation. + Returns: + Tensor: Maximum integer that can be represented according to + the input arguments. + Examples: + >>> max_int(signed=True, bit_width=8) + int(127) + >>> max_int(signed=False, bit_width=8) + int(255) + """ + if not signed: + value = (2**bit_width) - 1 + else: + value = (2 ** (bit_width - 1)) - 1 + return value + + @staticmethod + def _resolve_rounding_mode(mode): + """Resolve the rounding mode of Quant and Trunc ops + to the corresponding numpy functions.""" + if mode == RoundingMode.RND_CONV: + return np.round + # elif mode_string == "CEIL": # not supported + # return np.ceil + elif mode == RoundingMode.TRN: + return np.floor + else: + raise ValueError(f"Rounding mode {mode} not supported.") From 9817ed36f034f73030a6a5820a451f4199812641 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 13 Jul 2023 15:14:53 -0500 Subject: [PATCH 025/272] fix broadcasting when going from Merge to ApplyAlpha --- hls4ml/model/optimizer/passes/merge_const.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/hls4ml/model/optimizer/passes/merge_const.py b/hls4ml/model/optimizer/passes/merge_const.py index da70eb55f3..f38bfd841d 100644 --- a/hls4ml/model/optimizer/passes/merge_const.py +++ b/hls4ml/model/optimizer/passes/merge_const.py @@ -131,6 +131,12 @@ def transform(self, model, node): scale_precision = const_node.get_attr("quant_precision") scale_quantizer = const_node.get_attr("quantizer") + # because C++ doesn't do broadcasting, we may have to change the shapes of the scale and bias + if scale.shape != tuple(input_shape) and np.squeeze(scale).shape != tuple(input_shape): + scale = np.broadcast_to(scale, input_shape) + if bias.shape != tuple(input_shape) and np.squeeze(bias).shape != tuple(input_shape): + bias = np.broadcast_to(bias, input_shape) + attributes = {k: node.attributes.get(k, None) for k in _base_attributes} attributes.update( { @@ -179,6 +185,12 @@ def transform(self, model, node): scale = 1 / const_node.value bias = np.array(0) + # because C++ doesn't do broadcasting, we may have to change the shapes of the scale and bias + if scale.shape != tuple(input_shape) and np.squeeze(scale).shape != tuple(input_shape): + scale = np.broadcast_to(scale, input_shape) + if bias.shape != tuple(input_shape) and np.squeeze(bias).shape != tuple(input_shape): + bias = np.broadcast_to(bias, input_shape) + attributes = {k: node.attributes.get(k, None) for k in _base_attributes} attributes.update({"scale_data": scale, "bias_data": bias, "n_in": n_in, "n_out": n_in, "n_filt": -1}) From e494f435b55f396e2bf8d3c8c1350f5fa753fbb3 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 13 Jul 2023 15:49:56 -0500 Subject: [PATCH 026/272] update linear merging --- hls4ml/model/optimizer/__init__.py | 1 + hls4ml/model/optimizer/passes/linear.py | 42 +++++++++++++++++++++++++ hls4ml/model/optimizer/passes/nop.py | 14 --------- 3 files changed, 43 insertions(+), 14 deletions(-) create 
mode 100644 hls4ml/model/optimizer/passes/linear.py
 delete mode 100644 hls4ml/model/optimizer/passes/nop.py

diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py
index 38844992db..e41973b4e2 100644
--- a/hls4ml/model/optimizer/__init__.py
+++ b/hls4ml/model/optimizer/__init__.py
@@ -70,6 +70,7 @@
         'fuse_batch_normalization',
         'replace_multidimensional_dense_with_conv',
         'set_precision_concat',
+        'merge_linear_activation',
     ],
     requires=['convert'],
 )
diff --git a/hls4ml/model/optimizer/passes/linear.py b/hls4ml/model/optimizer/passes/linear.py
new file mode 100644
index 0000000000..72d6dade9f
--- /dev/null
+++ b/hls4ml/model/optimizer/passes/linear.py
@@ -0,0 +1,42 @@
+from hls4ml.model.layers import Activation, BatchNormalization, Conv1D, Conv2D, Dense
+from hls4ml.model.optimizer import OptimizerPass
+
+
+class EliminateLinearActivation(OptimizerPass):
+    def match(self, node):
+        cast = False
+        if isinstance(node, Activation):
+            cast = node.get_input_variable().type.precision != node.get_output_variable().type.precision
+        return isinstance(node, Activation) and node.get_attr('activation') == 'linear' and not cast
+
+    def transform(self, model, node):
+        model.remove_node(node)
+        return True
+
+
+# TODO: Migrate this to the auto precision check from the quant precision check
+class MergeLinearActivation(OptimizerPass):
+    '''
+    For many objects it's safe to change the output precision independently of the calculation.
+    '''
+
+    def match(self, node):
+        '''
+        Only match if the parent is safe and the precision is not explicitly set.
+        '''
+        if isinstance(node, Activation) and node.get_attr('activation') == 'linear':
+            parent = node.get_input_node(node.inputs[0])
+            safe_parent = isinstance(parent, (Dense, Conv1D, Conv2D, BatchNormalization))
+            parent_type_fixed = parent.get_attr("quant_precision")
+            return safe_parent and not parent_type_fixed
+        else:
+            return False
+
+    def transform(self, model, node):
+        prev_node = node.get_input_node(node.inputs[0])
+        quant_precision = node.get_attr("quant_precision")
+        prev_node.set_attr("quant_precision", quant_precision)
+        prev_node.set_attr("quantizer", node.get_attr("quantizer"))
+        prev_node.update_output_precision(quant_precision)
+        model.remove_node(node)
+        return True
diff --git a/hls4ml/model/optimizer/passes/nop.py b/hls4ml/model/optimizer/passes/nop.py
deleted file mode 100644
index 55fcf16e93..0000000000
--- a/hls4ml/model/optimizer/passes/nop.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from hls4ml.model.layers import Activation
-from hls4ml.model.optimizer import OptimizerPass
-
-
-class EliminateLinearActivation(OptimizerPass):
-    def match(self, node):
-        cast = False
-        if isinstance(node, Activation):
-            cast = node.get_input_variable().type.precision != node.get_output_variable().type.precision
-        return isinstance(node, Activation) and node.get_attr('activation') == 'linear' and not cast
-
-    def transform(self, model, node):
-        model.remove_node(node)
-        return True

From ffddb5e898a7689cf73cdaf50ca118c4104f3c35 Mon Sep 17 00:00:00 2001
From: Jovan Mitrevski
Date: Thu, 13 Jul 2023 16:25:03 -0500
Subject: [PATCH 027/272] update automatic setting of accumulators (QONNX-only
 for now)

---
 hls4ml/model/optimizer/__init__.py                       | 2 ++
 hls4ml/model/optimizer/passes/propagate_acc_precision.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py
index e41973b4e2..6af9698a51 100644
--- a/hls4ml/model/optimizer/__init__.py
+++ b/hls4ml/model/optimizer/__init__.py
@@ -69,6 +69,8 @@
         'fuse_consecutive_batch_normalization',
         'fuse_batch_normalization',
         'replace_multidimensional_dense_with_conv',
+        'propagate_dense_precision',
+        'propagate_conv_precision',
         'set_precision_concat',
         'merge_linear_activation',
     ],
     requires=['convert'],
 )
diff --git a/hls4ml/model/optimizer/passes/propagate_acc_precision.py b/hls4ml/model/optimizer/passes/propagate_acc_precision.py
index 6c1facc23b..375979de4e 100644
--- a/hls4ml/model/optimizer/passes/propagate_acc_precision.py
+++ b/hls4ml/model/optimizer/passes/propagate_acc_precision.py
@@ -6,6 +6,8 @@
 from hls4ml.model.optimizer import OptimizerPass
 from hls4ml.model.types import FixedPrecisionType, NamedType
 
+# TODO: Update these to use the new auto precision, not depending only on QONNX values
+
 
 class PropagateDensePrecision(OptimizerPass):
     """

From 57c89fb7da6cebdd8d8fe4e72ea6a31ea0c1a16a Mon Sep 17 00:00:00 2001
From: Jovan Mitrevski
Date: Thu, 13 Jul 2023 17:39:47 -0500
Subject: [PATCH 028/272] update qonnx tests

---
 test/pytest/test_qonnx.py | 144 ++++++++++++++------------------------
 1 file changed, 54 insertions(+), 90 deletions(-)
 mode change 100755 => 100644 test/pytest/test_qonnx.py

diff --git a/test/pytest/test_qonnx.py b/test/pytest/test_qonnx.py
old mode 100755
new mode 100644
index be567d81f9..535bffb0da
--- a/test/pytest/test_qonnx.py
+++ b/test/pytest/test_qonnx.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 import os
 import urllib
 from pathlib import Path
@@ -17,8 +16,11 @@
 test_root_path = Path(__file__).parent
 
 
-def test_tfc_2w2a():
-    # download test model
+@pytest.fixture(scope='module')
+def tfc_2w2a_model():
+    '''
+    Load the tiny fully-connected model
+    '''
     dl_dir = test_root_path
     dl_file = str(dl_dir / "qonnx-tfc-2w2a.onnx")
     tfc_w2a2_qonnx_url = (
@@ -32,50 +34,60 @@
     # cleanup
     qonnx.util.cleanup.cleanup(dl_file, out_file=out_file)
     model = ModelWrapper(out_file)
+    return model
 
-    # Execute QONNX model inference
-    # TODO make the test bigger
-    ishape = (1, 1, 28, 28)
-    np.random.seed(0)
-    X = np.random.uniform(low=-1, high=+1, size=np.product(ishape)).reshape(ishape).astype(np.float32)
-    idict = {model.graph.input[0].name: X}
-    y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name]
 
-    # Convert QONNX model, compile, and run inference
-    config = hls4ml.utils.config_from_onnx_model(model)
-    # Some hand-derived config
-    # TODO should be auto-derived by QuantizeDenseOutput pass after some adaptation
-    config['LayerName'] = {}
-    config['LayerName']['global_in'] = {'Precision': 'ap_fixed<16,2>'}
-    hls_model = hls4ml.converters.convert_from_onnx_model(
-        model, output_dir=str(test_root_path / 'hls4mlprj_qonnx_tfc-2w2a'), part='xcu250-figd2104-2L-e', hls_config=config
+@pytest.fixture(scope='module')
+def cnv_2w2a_model():
+    '''
+    Load the small convolution model
+    '''
+    dl_dir = test_root_path
+    dl_file = str(dl_dir / "qonnx-cnv-2w2a.onnx")
+    cnv_w2a2_qonnx_url = (
+        "https://raw.githubusercontent.com/fastmachinelearning/"
+        "QONNX_model_zoo/main/models/CIFAR10/Brevitas_FINN_CNV/CNV_2W2A.onnx"
     )
-    hls_model.compile()
-    y_hls4ml = hls_model.predict(X)
+    urllib.request.urlretrieve(cnv_w2a2_qonnx_url, dl_file)
+    assert os.path.isfile(dl_file)
+    out_clean = str(dl_dir / "qonnx-cnv-2w2a-clean.onnx")
+    out_chanlast = str(dl_dir / "qonnx-cnv-2w2a-clean-channels-last.onnx")
+    out_file = str(dl_dir / "qonnx-cnv-2w2a-clean-channels-last-clean.onnx")
 
-    np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1)
+    # cleanup
+    qonnx.util.cleanup.cleanup(dl_file, out_file=out_clean)
+
qonnx.util.to_channels_last.to_channels_last(out_clean, make_input_channels_last=True, out_file=out_chanlast) + qonnx.util.cleanup.cleanup(out_chanlast, out_file=out_file) + model = ModelWrapper(out_file) + return model -def test_tfc_2w2a_quartus(): - # download test model +@pytest.fixture(scope='module') +def jettagging_model(): + ''' + Load the 3 hidden layer QKeras example model trained on the jet tagging dataset + ''' dl_dir = test_root_path - dl_file = str(dl_dir / "qonnx-tfc-2w2a.onnx") - tfc_w2a2_qonnx_url = ( + dl_file = str(dl_dir / "qkeras_jettagging.onnx") + jet_tagging_qonnx_url = ( "https://raw.githubusercontent.com/fastmachinelearning/" - "QONNX_model_zoo/main/models/MNIST/Brevitas_FINN_TFC/TFC/TFC_2W2A.onnx" + "QONNX_model_zoo/main/models/JetTagging/QKeras_hls4ml_3layer/qkeras_jettagging.onnx" ) - urllib.request.urlretrieve(tfc_w2a2_qonnx_url, dl_file) + urllib.request.urlretrieve(jet_tagging_qonnx_url, dl_file) assert os.path.isfile(dl_file) - out_file = str(dl_dir / "qonnx-tfc-2w2a-clean.onnx") + out_file = str(dl_dir / "qkeras_jettagging-clean.onnx") # cleanup qonnx.util.cleanup.cleanup(dl_file, out_file=out_file) model = ModelWrapper(out_file) + return model + + +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +def test_tfc_2w2a(tfc_2w2a_model, backend): + model = tfc_2w2a_model - # Execute QONNX model inference - # TODO make the test bigger ishape = (1, 1, 28, 28) - np.random.seed(0) X = np.random.uniform(low=-1, high=+1, size=np.product(ishape)).reshape(ishape).astype(np.float32) idict = {model.graph.input[0].name: X} y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] @@ -83,15 +95,10 @@ def test_tfc_2w2a_quartus(): # Convert QONNX model, compile, and run inference config = hls4ml.utils.config_from_onnx_model(model) # Some hand-derived config - # TODO should be auto-derived by QuantizeDenseOutput pass after some adaptation config['LayerName'] = {} - config['LayerName']['global_in'] = {'Precision': 'ac_fixed<16,2>'} + config['LayerName']['global_in'] = {'Precision': 'ap_fixed<16,2>'} hls_model = hls4ml.converters.convert_from_onnx_model( - model, - output_dir=str(test_root_path / 'hls4mlprj_qonnx_tfc-2w2a-quartus'), - part='Arria10', - backend='Quartus', - hls_config=config, + model, output_dir=str(test_root_path / f'hls4mlprj_qonnx_tfc-2w2a_{backend}'), backend=backend, hls_config=config ) hls_model.compile() y_hls4ml = hls_model.predict(X) @@ -99,45 +106,22 @@ def test_tfc_2w2a_quartus(): np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) -def test_cnv_2w2a(): - # download test model - dl_dir = test_root_path - dl_file = str(dl_dir / "qonnx-cnv-2w2a.onnx") - cnv_w2a2_qonnx_url = ( - "https://raw.githubusercontent.com/fastmachinelearning/" - "QONNX_model_zoo/main/models/CIFAR10/Brevitas_FINN_CNV/CNV_2W2A.onnx" - ) - urllib.request.urlretrieve(cnv_w2a2_qonnx_url, dl_file) - assert os.path.isfile(dl_file) - out_clean = str(dl_dir / "qonnx-cnv-2w2a-clean.onnx") - out_chanlast = str(dl_dir / "qonnx-cnv-2w2a-clean-channels-last.onnx") - out_file = str(dl_dir / "qonnx-cnv-2w2a-clean-channels-last-clean.onnx") - - # cleanup - qonnx.util.cleanup.cleanup(dl_file, out_file=out_clean) - qonnx.util.to_channels_last.to_channels_last(out_clean, make_input_channels_last=True, out_file=out_chanlast) - qonnx.util.cleanup.cleanup(out_chanlast, out_file=out_file) - model = ModelWrapper(out_file) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +def test_cnv_2w2a(cnv_2w2a_model, backend): + model = 
cnv_2w2a_model - # Execute QONNX model inference - # TODO make the test bigger ishape = (1, 32, 32, 3) - np.random.seed(1) X = np.random.uniform(low=-1, high=+1, size=np.product(ishape)).reshape(ishape).astype(np.float32) idict = {model.graph.input[0].name: X} y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] # Convert QONNX model, compile, and run inference - config = hls4ml.utils.config_from_onnx_model(model) - config['Model']['Precision'] = 'ap_fixed<32,16>' - # Some hand-derived config - # TODO should be auto-derived by QuantizeDenseOutput pass after some adaptation - + config = hls4ml.utils.config_from_onnx_model(model, default_precision='fixed<32,16>') hls_model = hls4ml.converters.convert_from_onnx_model( model, - output_dir=str(test_root_path / 'hls4mlprj_qonnx_cnv-2w2a'), - part='xcu250-figd2104-2L-e', + output_dir=str(test_root_path / f'hls4mlprj_qonnx_cnv-2w2a_{backend}'), io_type='io_stream', + backend=backend, hls_config=config, ) hls_model.compile() @@ -146,35 +130,19 @@ def test_cnv_2w2a(): np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) -def test_jet_tagging(backend): - # download test model - dl_dir = test_root_path - dl_file = dl_dir / "qkeras_jettagging.onnx" - jet_tagging_qonnx_url = ( - "https://raw.githubusercontent.com/fastmachinelearning/" - "QONNX_model_zoo/main/models/JetTagging/QKeras_hls4ml_3layer/qkeras_jettagging.onnx" - ) - urllib.request.urlretrieve(jet_tagging_qonnx_url, dl_file) - assert os.path.isfile(dl_file) - out_file = dl_dir / "qkeras_jettagging-clean.onnx" - - # cleanup - qonnx.util.cleanup.cleanup(dl_file, out_file=out_file) - model = ModelWrapper(out_file) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +def test_jet_tagging(jettagging_model, backend): + model = jettagging_model # Execute QONNX model inference # TODO make the test bigger ishape = (1, 16) - np.random.seed(0) X = np.random.uniform(low=-1, high=+1, size=np.product(ishape)).reshape(ishape).astype(np.float32) idict = {model.graph.input[0].name: X} y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] # Convert QONNX model, compile, and run inference config = hls4ml.utils.config_from_onnx_model(model) - # Some hand-derived config - # TODO should be auto-derived by QuantizeDenseOutput pass after some adaptation hls_model = hls4ml.converters.convert_from_onnx_model( model, output_dir=str(test_root_path / f'hls4mlprj_qonnx_jettag_{backend}'), backend=backend, hls_config=config @@ -183,7 +151,3 @@ def test_jet_tagging(backend): y_hls4ml = hls_model.predict(X) np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) - - -if __name__ == '__main__': - test_tfc_2w2a() From 233905a0dac338e720a114ec2671aca1a2cd64f4 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 18 Jul 2023 11:32:37 -0500 Subject: [PATCH 029/272] remove batch dimension from flatten in Keras --- hls4ml/converters/keras/reshape.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hls4ml/converters/keras/reshape.py b/hls4ml/converters/keras/reshape.py index bd9d519a2a..1f6dc2a759 100644 --- a/hls4ml/converters/keras/reshape.py +++ b/hls4ml/converters/keras/reshape.py @@ -11,8 +11,8 @@ def parse_flatten_layer(keras_layer, input_names, input_shapes, data_reader): layer = parse_default_keras_layer(keras_layer, input_names) layer['class_name'] = 'Reshape' - layer['target_shape'] = [input_shapes[0][0], np.prod(input_shapes[0][1:])] - 
output_shape = layer['target_shape'] + layer['target_shape'] = [np.prod(input_shapes[0][1:])] # target shape has no batch dimension + output_shape = input_shapes[0][:1] + layer['target_shape'] return layer, output_shape From 6f119551c9586ada7cdb6e9c64c5956b1198023c Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 3 Aug 2023 17:15:26 -0500 Subject: [PATCH 030/272] fix optimizer that fuses consecutive batch norms --- hls4ml/model/optimizer/passes/batchnorm_opt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hls4ml/model/optimizer/passes/batchnorm_opt.py b/hls4ml/model/optimizer/passes/batchnorm_opt.py index b9c651fd8f..a74047676d 100644 --- a/hls4ml/model/optimizer/passes/batchnorm_opt.py +++ b/hls4ml/model/optimizer/passes/batchnorm_opt.py @@ -162,8 +162,8 @@ def transform(self, model, node): bias_new = s1 * b0 + b1 # call function so that quantizer would be called if needed - node.add_weights(scale_new, quantizer=s_quantizer) - node.add_bias(bias_new, quantizer=b_quantizer) + node.add_weights_variable(name='scale', var_name='s{index}', data=scale_new) + node.add_weights_variable(name='bias', var_name='b{index}', data=bias_new) model.remove_node(prev_node, rewire=True) return True From 6ee8f6fbc215723a5b8cc6470968f5ad3eade87d Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Thu, 10 Aug 2023 17:13:30 -0400 Subject: [PATCH 031/272] cleaner first try at GRU support --- hls4ml/converters/pytorch/recurrent.py | 69 ++++++++++++++++++++++++++ hls4ml/converters/pytorch_to_hls.py | 27 +++++++++- test/pytest/test_recurrent_pytorch.py | 46 +++++++++++++++++ 3 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 hls4ml/converters/pytorch/recurrent.py create mode 100644 test/pytest/test_recurrent_pytorch.py diff --git a/hls4ml/converters/pytorch/recurrent.py b/hls4ml/converters/pytorch/recurrent.py new file mode 100644 index 0000000000..c868924481 --- /dev/null +++ b/hls4ml/converters/pytorch/recurrent.py @@ -0,0 +1,69 @@ +import warnings + +from hls4ml.converters.pytorch_to_hls import get_weights_data, pytorch_handler + +rnn_layers = ['SimpleRNN', 'LSTM', 'GRU'] + + +@pytorch_handler(*rnn_layers) +def parse_rnn_layer(operation, layer_name, input_names, input_shapes, node, class_object, data_reader, config): + assert operation in rnn_layers or operation == "RNN" + + layer = {} + + layer["name"] = layer_name + + layer['inputs'] = [input_names[0]] + if len(input_names) > 1: + warnings.warn( + 'hls4ml disregards the initial value of the hidden state passed to the model, assuming that it is all zeros', + stacklevel=2, + ) + layer['class_name'] = operation + if operation == "RNN": + layer['class_name'] = 'SimpleRNN' + + layer['return_sequences'] = False # parameter does not exist in pytorch + layer['return_state'] = False # parameter does not exist in pytorch + + if layer['class_name'] == 'SimpleRNN': + layer['activation'] = class_object.nonlinearity # GRU and LSTM are hard-coded to use tanh in pytorch + else: + layer['activation'] = "tanh" # GRU and LSTM are hard-coded to use tanh in pytorch + + layer['recurrent_activation'] = layer['activation'] # pytorch does not seem to differentiate between the two + if layer['class_name'] == 'GRU': + layer['recurrent_activation'] = 'sigmoid' # seems to be hard-coded in pytorch? + + layer['time_major'] = not class_object.batch_first + # TODO Should we handle time_major? 
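+    # batch_first=False (the PyTorch default) means inputs are ordered
+    # (seq_len, batch, features), while hls4ml assumes (batch, seq_len, features),
+    # hence the check below.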
+    if layer['time_major']:
+        raise Exception('hls4ml only supports "batch-first == True"')
+
+    layer['n_timesteps'] = input_shapes[0][1]
+    layer['n_in'] = input_shapes[0][2]
+
+    layer['n_out'] = class_object.hidden_size
+
+    if class_object.num_layers > 1:
+        raise Exception('hls4ml does not support num_layers > 1')
+
+    if class_object.bidirectional:
+        raise Exception('hls4ml does not support bidirectional RNNs')
+
+    if class_object.dropout > 0:
+        raise Exception('hls4ml does not support RNNs with dropout')
+
+    (
+        layer['weight_data'],
+        layer['recurrent_weight_data'],
+        layer['bias_data'],
+        layer['recurrent_bias_data'],
+    ) = get_weights_data(data_reader, layer['name'], ['weight_ih_l0', 'weight_hh_l0', 'bias_ih_l0', 'bias_hh_l0'])
+
+    if layer['class_name'] == 'GRU':
+        layer['apply_reset_gate'] = 'after'  # Might be true for pytorch? It's not a free parameter
+
+    output_shape = [[input_shapes[0][0], layer['n_timesteps'], layer['n_out']], [1, layer['n_out']]]
+
+    return layer, output_shape
diff --git a/hls4ml/converters/pytorch_to_hls.py b/hls4ml/converters/pytorch_to_hls.py
index ddddbc04c7..b49287a2dd 100644
--- a/hls4ml/converters/pytorch_to_hls.py
+++ b/hls4ml/converters/pytorch_to_hls.py
@@ -28,6 +28,11 @@ def get_weights_data(self, layer_name, var_name):
             'beta': 'bias',
             'moving_mean': 'running_mean',
             'moving_variance': 'running_var',
+            # Recurrent layers
+            'weight_ih_l0': 'weight_ih_l0',
+            'weight_hh_l0': 'weight_hh_l0',
+            'bias_ih_l0': 'bias_ih_l0',
+            'bias_hh_l0': 'bias_hh_l0',
         }
 
         # Workaround for naming scheme in nn.Sequential,
@@ -46,6 +51,8 @@
         if layer_name.split('_')[-1].isdigit() and len(layer_name.split('_')) > 1:
             layer_name = '_'.join(layer_name.split('_')[:-1])
 
+        # print (self.state_dict)
+        print(layer_name + '.' + var_name)
         if layer_name + '.' + var_name in self.state_dict:
            data = self.state_dict[layer_name + '.' + var_name].numpy()
             return data
 
@@ -167,7 +174,7 @@ def pytorch_to_hls(config):
 
     # All supported layers
     supported_layers = get_supported_pytorch_layers() + skip_layers
-
+    print(supported_layers)
     input_layers = []
 
     # Output shape tracking
@@ -180,6 +187,8 @@
 
     n_inputs = 0
 
+    print(traced_model.graph)
+
    for node in traced_model.graph.nodes:
         # If part of a nn.Sequential, the node name will start with an "_" which messes up the parsing
         if node.name[0] == '_':
             node.name = 'layer' + node.name
@@ -220,7 +229,19 @@
 
             # parse info from class object
             input_names = [str(i) for i in node.args]
-            input_shapes = [output_shapes[str(i)] for i in node.args]
+            if pytorch_class in ["RNN", "GRU", "LSTM"]:
+                # we currently don't support the passing of the initial value of the hidden state to RNN models
+                input_names = [str(node.args[0])]
+                input_shapes = [output_shapes[str(node.args[0])]]
+            # if a 'getitem' is the input to a node, step back in the graph to find the real source of the input
+            elif "getitem" in node.args[0].name:
+                for tmp_node in traced_model.graph.nodes:
+                    if tmp_node.name == node.args[0]:
+                        input_names = [str(tmp_node.args[0])]
+                        input_shapes = [output_shapes[str(tmp_node.args[0])]]
+                        node.args = tmp_node.args[0]
+            else:
+                input_shapes = [output_shapes[str(i)] for i in node.args]
 
             # for Conv layers
             if 'Conv' in pytorch_class:
@@ -275,6 +296,8 @@
             operation = layer_name_map[operation]
 
         # only a limited number of functions are supported
+        if operation == "getitem":
+            continue
         if operation not in supported_layers:
             raise Exception(f'Unsupported function {operation}')
 
         if operation == 'PReLU' or operation == 'batch_norm' or operation == 'conv1d' or operation == 'conv2d':
diff --git a/test/pytest/test_recurrent_pytorch.py b/test/pytest/test_recurrent_pytorch.py
new file mode 100644
index 0000000000..48f0090236
--- /dev/null
+++ b/test/pytest/test_recurrent_pytorch.py
@@ -0,0 +1,46 @@
+from pathlib import Path
+
+import numpy as np
+import pytest
+import torch
+import torch.nn as nn
+
+from hls4ml.converters import convert_from_pytorch_model
+from hls4ml.utils.config import config_from_pytorch_model
+
+test_root_path = Path(__file__).parent
+
+
+class GRUNet(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.rnn = nn.GRU(10, 20, num_layers=1, batch_first=True)
+
+    def forward(self, x, h0):
+        output, hnn = self.rnn(x, h0)
+        return output
+
+
+@pytest.mark.parametrize('backend', ['Vivado', 'Quartus'])
+@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
+def test_gru(backend, io_type):
+    model = GRUNet()
+    model.eval()
+
+    X_input = torch.randn(1, 1, 10)
+    h0 = torch.zeros(1, 1, 20)
+
+    pytorch_prediction = model(torch.Tensor(X_input), torch.Tensor(h0)).detach().numpy()
+
+    config = config_from_pytorch_model(model, inputs_channel_last=True, transpose_outputs=False)
+    output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_linear_{backend}_{io_type}')
+
+    hls_model = convert_from_pytorch_model(
+        model, [(None, 10, 1), (None, 20, 1)], hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type
+    )
+
+    hls_model.compile()
+
+    hls_prediction = np.reshape(hls_model.predict([X_input.detach().numpy(), h0.detach().numpy()]), (1, 1, 20))
+
+    np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=0, atol=5e-2)

From 2d40d46d970e6ad1a5e60c05a4ebe9e0cb3f0865 Mon Sep 17 00:00:00 2001
From: Jan-Frederik Schulte
Date: Tue, 15 Aug 2023 14:00:24 -0400
Subject: [PATCH 032/272] fix RNN and LSTM

---
 hls4ml/converters/__init__.py                 |  3 +
 hls4ml/converters/pytorch/recurrent.py        | 13 ++-
 hls4ml/converters/pytorch_to_hls.py           | 32 ++-----
 .../passes/convert_to_channels_last.py        | 88 ++++++++++---------
 hls4ml/utils/config.py                        | 10 ++-
 test/pytest/test_merge_pytorch.py             |  2 +-
 test/pytest/test_pytorch_api.py               |  8 +-
 test/pytest/test_recurrent_pytorch.py         | 88 ++++++++++++++++++-
 .../pytest/test_sequential_parsing_pytorch.py | 46 ++++++++++
 9 files changed, 202 insertions(+), 88 deletions(-)
 create mode 100644 test/pytest/test_sequential_parsing_pytorch.py

diff --git a/hls4ml/converters/__init__.py b/hls4ml/converters/__init__.py
index 4bf139b673..195a4b9707 100644
--- a/hls4ml/converters/__init__.py
+++ b/hls4ml/converters/__init__.py
@@ -38,7 +38,9 @@
 model_types = ['keras', 'pytorch', 'onnx']
 for model_type in model_types:
+    print(model_type)
     for module in os.listdir(os.path.dirname(__file__) + f'/{model_type}'):
+        print(module)
         if module == '__init__.py' or module[-3:] != '.py':
             continue
         try:
@@ -49,6 +51,7 @@
             # and is defined in this module (i.e., not imported)
             if callable(func) and hasattr(func, 'handles') and func.__module__ == lib.__name__:
                 for layer in func.handles:
+                    print(layer)
                     if model_type == 'keras':
                         register_keras_layer_handler(layer, func)
                     elif model_type == 'pytorch':
diff --git a/hls4ml/converters/pytorch/recurrent.py b/hls4ml/converters/pytorch/recurrent.py
index c868924481..5055e3087f 100644
--- a/hls4ml/converters/pytorch/recurrent.py
+++ b/hls4ml/converters/pytorch/recurrent.py
@@ -2,12 +2,12 @@
 
 from hls4ml.converters.pytorch_to_hls import get_weights_data, pytorch_handler
 
-rnn_layers = ['SimpleRNN', 'LSTM', 'GRU']
+rnn_layers = ['RNN', 'LSTM', 'GRU']
 
 
 @pytorch_handler(*rnn_layers)
 def parse_rnn_layer(operation, layer_name, input_names, input_shapes, node, class_object, data_reader, config):
-    assert operation in rnn_layers or operation == "RNN"
+    assert operation in rnn_layers
 
     layer = {}
 
@@ -27,13 +27,12 @@ def parse_rnn_layer(operation, layer_name, input_names, input_shapes, node, clas
     layer['return_state'] = False  # parameter does not exist in pytorch
 
     if layer['class_name'] == 'SimpleRNN':
-        layer['activation'] = class_object.nonlinearity  # GRU and LSTM are hard-coded to use tanh in pytorch
+        layer['activation'] = class_object.nonlinearity  # Default is tanh, can also be ReLU in pytorch
     else:
         layer['activation'] = "tanh"  # GRU and LSTM are hard-coded to use tanh in pytorch
 
-    layer['recurrent_activation'] = layer['activation']  # pytorch does not seem to differentiate between the two
-    if layer['class_name'] == 'GRU':
-        layer['recurrent_activation'] = 'sigmoid'  # seems to be hard-coded in pytorch?
+    if layer['class_name'] == 'GRU' or layer['class_name'] == 'LSTM':
+        layer['recurrent_activation'] = 'sigmoid'  # GRU and LSTM gates are hard-coded to use sigmoid in pytorch
 
     layer['time_major'] = not class_object.batch_first
     # TODO Should we handle time_major?
@@ -42,7 +41,7 @@ def parse_rnn_layer(operation, layer_name, input_names, input_shapes, node, clas layer['n_timesteps'] = input_shapes[0][1] layer['n_in'] = input_shapes[0][2] - + print(layer['n_in']) layer['n_out'] = class_object.hidden_size if class_object.num_layers > 1: diff --git a/hls4ml/converters/pytorch_to_hls.py b/hls4ml/converters/pytorch_to_hls.py index b49287a2dd..69a026ba7e 100644 --- a/hls4ml/converters/pytorch_to_hls.py +++ b/hls4ml/converters/pytorch_to_hls.py @@ -18,33 +18,14 @@ def get_weights_data(self, layer_name, var_name): data = None # Parameter mapping from pytorch to keras - torch_paramap = { - # Conv - 'kernel': 'weight', - # Batchnorm - 'gamma': 'weight', - # Activiation - 'alpha': 'weight', - 'beta': 'bias', - 'moving_mean': 'running_mean', - 'moving_variance': 'running_var', - # Recurrent layers - 'weight_ih_l0': 'weight_ih_l0', - 'weight_hh_l0': 'weight_hh_l0', - 'bias_ih_l0': 'bias_ih_l0', - 'bias_hh_l0': 'bias_hh_l0', - } # Workaround for naming schme in nn.Sequential, # have to remove the prefix we previously had to add to make sure the tensors are found if 'layer_' in layer_name: layer_name = layer_name.split('layer_')[-1] - if var_name not in list(torch_paramap.keys()) + ['weight', 'bias']: - raise Exception('Pytorch parameter not yet supported!') - - elif var_name in list(torch_paramap.keys()): - var_name = torch_paramap[var_name] + elif '_' in layer_name: + layer_name = '.'.join(layer_name.split('_')) # if a layer is reused in the model, torch.FX will append a "_n" for the n-th use # have to snap that off to find the tensors @@ -52,12 +33,13 @@ def get_weights_data(self, layer_name, var_name): layer_name = '_'.join(layer_name.split('_')[:-1]) # print (self.state_dict) - print(layer_name + '.' + var_name) + # print (self.state_dict) if layer_name + '.' + var_name in self.state_dict: data = self.state_dict[layer_name + '.' 
+ var_name].numpy() return data else: + print("not found") return None @@ -174,7 +156,7 @@ def pytorch_to_hls(config): # All supported layers supported_layers = get_supported_pytorch_layers() + skip_layers - print(supported_layers) + print("supported layers:") input_layers = [] # Output shape tracking @@ -190,7 +172,7 @@ def pytorch_to_hls(config): print(traced_model.graph) for node in traced_model.graph.nodes: - # If part of a nn.Sequntial, the node name will start with an "_" which messes up the parsing + # If part of an unnamend nn.Sequntial, the node name will start with an "_" which messes up the parsing if node.name[0] == '_': node.name = 'layer' + node.name @@ -242,7 +224,7 @@ def pytorch_to_hls(config): node.args = tmp_node.args[0] else: input_shapes = [output_shapes[str(i)] for i in node.args] - + print(input_names) # for Conv layers if 'Conv' in pytorch_class: if not class_object.padding_mode == 'zeros': diff --git a/hls4ml/model/optimizer/passes/convert_to_channels_last.py b/hls4ml/model/optimizer/passes/convert_to_channels_last.py index cef4d947d1..8f700ecb16 100644 --- a/hls4ml/model/optimizer/passes/convert_to_channels_last.py +++ b/hls4ml/model/optimizer/passes/convert_to_channels_last.py @@ -16,14 +16,14 @@ def match(self, node): def transform(self, model, node): # If this parameter has not been set, this model does not need to be converted - if 'InputsChannelLast' not in model.config.config['HLSConfig']['Model']: + if 'ChannelsLastConversion' not in model.config.config['HLSConfig']['Model']: node.channels_last_converted = True return False outshape = node.get_output_variable().shape if isinstance(node, Input): # if inputs are not yet transposed into channels_last, add transpose layer - if not model.config.config['HLSConfig']['Model']['InputsChannelLast'] and len(outshape) > 1: + if model.config.config['HLSConfig']['Model']['ChannelsLastConversion'] == "full" and len(outshape) > 1: # Add transpose for input layer input = node.name if len(outshape) == 2: @@ -38,7 +38,7 @@ def transform(self, model, node): transpose_node.channels_last_converted = True model.insert_node(transpose_node) - else: + elif model.config.config['HLSConfig']['Model']['ChannelsLastConversion'] == "internal" and len(outshape) > 1: input_shape = node.get_output_variable().shape input_shape.append(input_shape.pop(0)) node.get_output_variable().shape = input_shape @@ -46,51 +46,53 @@ def transform(self, model, node): node.get_output_variable().dim_names = dim_names else: # Transpose weight tensors - tensors = ['weight', 'depthwise', 'pointwise', 'zero_bias', 'scale', 'recurrent_weight'] - for tensor in tensors: + if True: + # if not model.config.config['HLSConfig']['Model']['ChannelsLastConversion'] == "off": + tensors = ['weight', 'depthwise', 'pointwise', 'zero_bias', 'scale', 'recurrent_weight'] + for tensor in tensors: + try: + if len(node.get_weights(tensor).shape) == 2: + weights_channels_last = node.get_weights(tensor).data.transpose() + node.get_weights(tensor).data = weights_channels_last + elif len(node.get_weights(tensor).shape) == 3: + weights_channels_last = node.get_weights(tensor).data.transpose([2, 1, 0]) + node.get_weights(tensor).data = weights_channels_last + elif len(node.get_weights(tensor).shape) == 4: + weights_channels_last = node.get_weights(tensor).data.transpose([2, 3, 1, 0]) + node.get_weights(tensor).data = weights_channels_last + except KeyError: + pass try: - if len(node.get_weights(tensor).shape) == 2: - weights_channels_last = node.get_weights(tensor).data.transpose() - 
node.get_weights(tensor).data = weights_channels_last - elif len(node.get_weights(tensor).shape) == 3: - weights_channels_last = node.get_weights(tensor).data.transpose([2, 1, 0]) - node.get_weights(tensor).data = weights_channels_last - elif len(node.get_weights(tensor).shape) == 4: - weights_channels_last = node.get_weights(tensor).data.transpose([2, 3, 1, 0]) - node.get_weights(tensor).data = weights_channels_last - except KeyError: + node.set_attr('data_format', 'channels_last') + except AttributeError: pass - try: - node.set_attr('data_format', 'channels_last') - except AttributeError: - pass - # Adjust axis of operation - if isinstance(node, Concatenate): - old_axis = node.get_attr('axis') + # Adjust axis of operation + if isinstance(node, Concatenate): + old_axis = node.get_attr('axis') + if len(outshape) == 2: + if old_axis == -1 or old_axis == 2: + node.set_attr('axis', 1) + else: + node.set_attr('axis', 2) + elif len(outshape) == 3: + if old_axis == 3 or old_axis == -1: + node.set_attr('axis', 1) + elif old_axis == 2 or old_axis == -2: + node.set_attr('axis', 2) # Not required, but left for clarity + else: + node.set_attr('axis', 3) + + # Adjust output shape + outdims = node.get_output_variable().dim_names if len(outshape) == 2: - if old_axis == -1 or old_axis == 2: - node.set_attr('axis', 1) - else: - node.set_attr('axis', 2) + shape = [outshape[1], outshape[0]] + dims = [outdims[1], outdims[0]] + node.add_output_variable(shape, dims) elif len(outshape) == 3: - if old_axis == 3 or old_axis == -1: - node.set_attr('axis', 1) - elif old_axis == 2 or old_axis == -2: - node.set_attr('axis', 2) # Not required, but left for clarity - else: - node.set_attr('axis', 3) - - # Adjust output shape - outdims = node.get_output_variable().dim_names - if len(outshape) == 2: - shape = [outshape[1], outshape[0]] - dims = [outdims[1], outdims[0]] - node.add_output_variable(shape, dims) - elif len(outshape) == 3: - shape = [outshape[1], outshape[2], outshape[0]] - dims = [outdims[1], outdims[2], outdims[0]] - node.add_output_variable(shape, dims) + shape = [outshape[1], outshape[2], outshape[0]] + dims = [outdims[1], outdims[2], outdims[0]] + node.add_output_variable(shape, dims) # Add transpose for output layer if ( diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py index 3743659649..991ce16c52 100644 --- a/hls4ml/utils/config.py +++ b/hls4ml/utils/config.py @@ -247,7 +247,7 @@ def config_from_pytorch_model( backend=None, default_precision='ap_fixed<16,6>', default_reuse_factor=1, - inputs_channel_last=False, + channels_last_conversion='full', transpose_outputs=True, ): """Create an HLS conversion config given the PyTorch model. @@ -270,8 +270,10 @@ def config_from_pytorch_model( backend(str, optional): Name of the backend to use default_precision (str, optional): Default precision to use. Defaults to 'fixed<16,6>'. default_reuse_factor (int, optional): Default reuse factor. Defaults to 1. - inputs_channel_last (bool, optional): Set to 'True' if input to the model comes in format - 'channels_last'. Defaults to 'False'. If False, inputs will be transposed internally. + channels_last_conversion (string, optional): Configures the conversion of pytorch layers to + 'channels_last' dataformate. Can be set to 'full', 'internal', or 'off'. If 'full', both the inputs + and internal layers will be converted. If 'internal', only internal layers will be converted; this + assumes the inputs are converted by the user. If 'off', no conversion is performed. 
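For reference, the weight transposition performed by the channels-last pass above is a fixed axis permutation; a minimal sketch with a hypothetical Conv2d kernel (shapes chosen only for illustration):

import numpy as np

w = np.random.rand(64, 20, 5, 5)   # PyTorch Conv2d weight layout: (out_ch, in_ch, H, W)
w_cl = w.transpose([2, 3, 1, 0])   # channels-last layout: (H, W, in_ch, out_ch), as in the pass
assert w_cl.shape == (5, 5, 20, 64)
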
transpose_outputs (bool, optional): Set to 'False' if the output should not be transposed from channels_last into channels_first data format. Defaults to 'False'. If False, outputs needs to be transposed manually. @@ -288,7 +290,7 @@ def config_from_pytorch_model( model_config = {} model_config['Precision'] = default_precision model_config['ReuseFactor'] = default_reuse_factor - model_config['InputsChannelLast'] = inputs_channel_last + model_config['ChannelsLastConversion'] = channels_last_conversion model_config['TransposeOutputs'] = transpose_outputs model_config['Strategy'] = 'Latency' diff --git a/test/pytest/test_merge_pytorch.py b/test/pytest/test_merge_pytorch.py index 17aa4d075e..ac42a7bb42 100644 --- a/test/pytest/test_merge_pytorch.py +++ b/test/pytest/test_merge_pytorch.py @@ -43,7 +43,7 @@ def test_merge(merge_op, io_type, backend): batch_input_shape = (None,) + input_shape config = hls4ml.utils.config_from_pytorch_model( - model, default_precision='ap_fixed<32,16>', inputs_channel_last=True, transpose_outputs=False + model, default_precision='ap_fixed<32,16>', channels_last_conversion="internal", transpose_outputs=False ) output_dir = str(test_root_path / f'hls4mlprj_merge_pytorch_{merge_op}_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_pytorch_model( diff --git a/test/pytest/test_pytorch_api.py b/test/pytest/test_pytorch_api.py index ff2bae2a43..56bfb094e7 100644 --- a/test/pytest/test_pytorch_api.py +++ b/test/pytest/test_pytorch_api.py @@ -216,9 +216,9 @@ def test_conv1d(padds, backend, io_type): if io_type == 'io_stream': X_input = np.ascontiguousarray(X_input.transpose(0, 2, 1)) - config = config_from_pytorch_model(model, inputs_channel_last=True, transpose_outputs=False) + config = config_from_pytorch_model(model, channels_last_conversion="internal", transpose_outputs=False) else: - config = config_from_pytorch_model(model, inputs_channel_last=False, transpose_outputs=True) + config = config_from_pytorch_model(model, channels_last_conversion="full", transpose_outputs=True) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_conv1d_{padds}_{backend}_{io_type}') hls_model = convert_from_pytorch_model( @@ -327,9 +327,9 @@ def test_conv2d(padds, backend, io_type): if io_type == 'io_stream': X_input = np.ascontiguousarray(X_input.transpose(0, 2, 3, 1)) - config = config_from_pytorch_model(model, inputs_channel_last=True, transpose_outputs=False) + config = config_from_pytorch_model(model, channels_last_conversion="internal", transpose_outputs=False) else: - config = config_from_pytorch_model(model, inputs_channel_last=False, transpose_outputs=True) + config = config_from_pytorch_model(model, channels_last_conversion="full", transpose_outputs=True) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_conv2d_{padds}_{backend}_{io_type}') hls_model = convert_from_pytorch_model( diff --git a/test/pytest/test_recurrent_pytorch.py b/test/pytest/test_recurrent_pytorch.py index 48f0090236..feff906a6d 100644 --- a/test/pytest/test_recurrent_pytorch.py +++ b/test/pytest/test_recurrent_pytorch.py @@ -21,7 +21,7 @@ def forward(self, x, h0): return output -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_gru(backend, io_type): model = GRUNet() @@ -32,8 +32,8 @@ def test_gru(backend, io_type): pytorch_prediction = model(torch.Tensor(X_input), torch.Tensor(h0)).detach().numpy() - config = 
config_from_pytorch_model(model, inputs_channel_last=True, transpose_outputs=False) - output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_linear_{backend}_{io_type}') + config = config_from_pytorch_model(model, channels_last_conversion="internal", transpose_outputs=False) + output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_gru_{backend}_{io_type}') hls_model = convert_from_pytorch_model( model, [(None, 10, 1), (None, 20, 1)], hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type @@ -43,4 +43,84 @@ def test_gru(backend, io_type): hls_prediction = np.reshape(hls_model.predict([X_input.detach().numpy(), h0.detach().numpy()]), (1, 1, 20)) - np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=0, atol=5e-2) + np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=0, atol=1e-1) + + +class LSTM(nn.Module): + def __init__(self): + super().__init__() + self.rnn = nn.LSTM(10, 20, num_layers=1, batch_first=True) + + def forward(self, x, h0, c0): + output, (_, _) = self.rnn(x, (h0, c0)) + return output + + +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +def test_lstm(backend, io_type): + if not (backend == "Quartus" and io_type == "io_stream"): + model = LSTM() + model.eval() + + X_input = torch.randn(1, 1, 10) + h0 = torch.zeros(1, 1, 20) + c0 = torch.zeros(1, 1, 20) + + pytorch_prediction = model(torch.Tensor(X_input), torch.Tensor(h0), torch.tensor(c0)).detach().numpy() + + config = config_from_pytorch_model(model, channels_last_conversion="internal", transpose_outputs=False) + output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_lstm_{backend}_{io_type}') + + hls_model = convert_from_pytorch_model( + model, + [(None, 10, 1), (None, 20, 1), (None, 20, 1)], + hls_config=config, + output_dir=output_dir, + backend=backend, + io_type=io_type, + ) + + hls_model.compile() + + hls_prediction = np.reshape( + hls_model.predict([X_input.detach().numpy(), h0.detach().numpy(), c0.detach().numpy()]), (1, 1, 20) + ) + + np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=0, atol=1e-1) + + +class RNN(nn.Module): + def __init__(self): + super().__init__() + self.rnn = nn.RNN(10, 20, num_layers=1, batch_first=True) + + def forward(self, x, h0): + output, _ = self.rnn(x, h0) + return output + + +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +def test_rnn(backend, io_type): + if not (backend == "Quartus" and io_type == "io_stream"): + model = RNN() + model.eval() + + X_input = torch.randn(1, 1, 10) + h0 = torch.zeros(1, 1, 20) + + pytorch_prediction = model(torch.Tensor(X_input), torch.Tensor(h0)).detach().numpy() + + config = config_from_pytorch_model(model, channels_last_conversion="internal", transpose_outputs=False) + output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_rnn_{backend}_{io_type}') + + hls_model = convert_from_pytorch_model( + model, [(None, 10, 1), (None, 20, 1)], hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type + ) + + hls_model.compile() + + hls_prediction = np.reshape(hls_model.predict([X_input.detach().numpy(), h0.detach().numpy()]), (1, 1, 20)) + + np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=0, atol=1e-1) diff --git a/test/pytest/test_sequential_parsing_pytorch.py b/test/pytest/test_sequential_parsing_pytorch.py new file mode 100644 index 0000000000..19d8ea1feb --- /dev/null +++ 
b/test/pytest/test_sequential_parsing_pytorch.py
@@ -0,0 +1,46 @@
+from pathlib import Path
+
+import pytest
+import torch.nn as nn
+
+from hls4ml.converters import convert_from_pytorch_model
+from hls4ml.utils.config import config_from_pytorch_model
+
+test_root_path = Path(__file__).parent
+
+# simple model with unnamed sequential
+model = nn.Sequential(nn.Conv2d(1, 20, 5), nn.ReLU(), nn.Conv2d(20, 64, 5), nn.ReLU())
+
+
+# simple model with named sequential
+class SeqModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.layer = nn.Sequential(nn.Conv2d(1, 20, 5), nn.ReLU(), nn.Conv2d(20, 64, 5), nn.ReLU())
+
+    def forward(self, x):
+        output = self.layer(x)
+        return output
+
+
+@pytest.mark.parametrize('backend', ['Vivado'])
+@pytest.mark.parametrize('io_type', ['io_parallel'])
+def test_unnamed(backend, io_type):
+    config = config_from_pytorch_model(model)
+    output_dir = str(test_root_path / f'hls4mlprj_pytorch_seq_unnamed_{backend}_{io_type}')
+
+    convert_from_pytorch_model(
+        model, (None, 1, 5, 5), hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type
+    )
+
+
+@pytest.mark.parametrize('backend', ['Vivado'])
+@pytest.mark.parametrize('io_type', ['io_parallel'])
+def test_named(backend, io_type):
+    pytorch_model = SeqModel()
+    config = config_from_pytorch_model(pytorch_model)
+    output_dir = str(test_root_path / f'hls4mlprj_pytorch_seq_named_{backend}_{io_type}')
+
+    convert_from_pytorch_model(
+        pytorch_model, (None, 1, 5, 5), hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type
+    )

From 66fb8b9465701e146984c545b584057e9b814de4 Mon Sep 17 00:00:00 2001
From: Jan-Frederik Schulte
Date: Tue, 15 Aug 2023 14:06:27 -0400
Subject: [PATCH 033/272] clean diff

---
 hls4ml/converters/__init__.py | 3 -
 hls4ml/converters/pytorch_to_hls.py | 8 +-
 .../passes/convert_to_channels_last.py | 118 +++++++++---------
 3 files changed, 59 insertions(+), 70 deletions(-)

diff --git a/hls4ml/converters/__init__.py b/hls4ml/converters/__init__.py
index 195a4b9707..4bf139b673 100644
--- a/hls4ml/converters/__init__.py
+++ b/hls4ml/converters/__init__.py
@@ -38,9 +38,7 @@
 model_types = ['keras', 'pytorch', 'onnx']
 for model_type in model_types:
-    print(model_type)
     for module in os.listdir(os.path.dirname(__file__) + f'/{model_type}'):
-        print(module)
         if module == '__init__.py' or module[-3:] != '.py':
             continue
         try:
@@ -51,7 +49,6 @@
             # and is defined in this module (i.e., not imported)
             if callable(func) and hasattr(func, 'handles') and func.__module__ == lib.__name__:
                 for layer in func.handles:
-                    print(layer)
                     if model_type == 'keras':
                         register_keras_layer_handler(layer, func)
                     elif model_type == 'pytorch':
diff --git a/hls4ml/converters/pytorch_to_hls.py b/hls4ml/converters/pytorch_to_hls.py
index 69a026ba7e..35e463f45b 100644
--- a/hls4ml/converters/pytorch_to_hls.py
+++ b/hls4ml/converters/pytorch_to_hls.py
@@ -32,14 +32,11 @@ def get_weights_data(self, layer_name, var_name):
         if layer_name.split('_')[-1].isdigit() and len(layer_name.split('_')) > 1:
             layer_name = '_'.join(layer_name.split('_')[:-1])

-        # print (self.state_dict)
-        # print (self.state_dict)
         if layer_name + '.' + var_name in self.state_dict:
             data = self.state_dict[layer_name + '.' 
+ var_name].numpy() return data else: - print("not found") return None @@ -156,7 +153,7 @@ def pytorch_to_hls(config): # All supported layers supported_layers = get_supported_pytorch_layers() + skip_layers - print("supported layers:") + input_layers = [] # Output shape tracking @@ -169,8 +166,6 @@ def pytorch_to_hls(config): n_inputs = 0 - print(traced_model.graph) - for node in traced_model.graph.nodes: # If part of an unnamend nn.Sequntial, the node name will start with an "_" which messes up the parsing if node.name[0] == '_': @@ -224,7 +219,6 @@ def pytorch_to_hls(config): node.args = tmp_node.args[0] else: input_shapes = [output_shapes[str(i)] for i in node.args] - print(input_names) # for Conv layers if 'Conv' in pytorch_class: if not class_object.padding_mode == 'zeros': diff --git a/hls4ml/model/optimizer/passes/convert_to_channels_last.py b/hls4ml/model/optimizer/passes/convert_to_channels_last.py index 8f700ecb16..abd3c4de92 100644 --- a/hls4ml/model/optimizer/passes/convert_to_channels_last.py +++ b/hls4ml/model/optimizer/passes/convert_to_channels_last.py @@ -46,74 +46,72 @@ def transform(self, model, node): node.get_output_variable().dim_names = dim_names else: # Transpose weight tensors - if True: - # if not model.config.config['HLSConfig']['Model']['ChannelsLastConversion'] == "off": - tensors = ['weight', 'depthwise', 'pointwise', 'zero_bias', 'scale', 'recurrent_weight'] - for tensor in tensors: - try: - if len(node.get_weights(tensor).shape) == 2: - weights_channels_last = node.get_weights(tensor).data.transpose() - node.get_weights(tensor).data = weights_channels_last - elif len(node.get_weights(tensor).shape) == 3: - weights_channels_last = node.get_weights(tensor).data.transpose([2, 1, 0]) - node.get_weights(tensor).data = weights_channels_last - elif len(node.get_weights(tensor).shape) == 4: - weights_channels_last = node.get_weights(tensor).data.transpose([2, 3, 1, 0]) - node.get_weights(tensor).data = weights_channels_last - except KeyError: - pass + tensors = ['weight', 'depthwise', 'pointwise', 'zero_bias', 'scale', 'recurrent_weight'] + for tensor in tensors: try: - node.set_attr('data_format', 'channels_last') - except AttributeError: + if len(node.get_weights(tensor).shape) == 2: + weights_channels_last = node.get_weights(tensor).data.transpose() + node.get_weights(tensor).data = weights_channels_last + elif len(node.get_weights(tensor).shape) == 3: + weights_channels_last = node.get_weights(tensor).data.transpose([2, 1, 0]) + node.get_weights(tensor).data = weights_channels_last + elif len(node.get_weights(tensor).shape) == 4: + weights_channels_last = node.get_weights(tensor).data.transpose([2, 3, 1, 0]) + node.get_weights(tensor).data = weights_channels_last + except KeyError: pass + try: + node.set_attr('data_format', 'channels_last') + except AttributeError: + pass - # Adjust axis of operation - if isinstance(node, Concatenate): - old_axis = node.get_attr('axis') - if len(outshape) == 2: - if old_axis == -1 or old_axis == 2: - node.set_attr('axis', 1) - else: - node.set_attr('axis', 2) - elif len(outshape) == 3: - if old_axis == 3 or old_axis == -1: - node.set_attr('axis', 1) - elif old_axis == 2 or old_axis == -2: - node.set_attr('axis', 2) # Not required, but left for clarity - else: - node.set_attr('axis', 3) - - # Adjust output shape - outdims = node.get_output_variable().dim_names + # Adjust axis of operation + if isinstance(node, Concatenate): + old_axis = node.get_attr('axis') if len(outshape) == 2: - shape = [outshape[1], outshape[0]] - dims = 
[outdims[1], outdims[0]] - node.add_output_variable(shape, dims) + if old_axis == -1 or old_axis == 2: + node.set_attr('axis', 1) + else: + node.set_attr('axis', 2) elif len(outshape) == 3: - shape = [outshape[1], outshape[2], outshape[0]] - dims = [outdims[1], outdims[2], outdims[0]] - node.add_output_variable(shape, dims) + if old_axis == 3 or old_axis == -1: + node.set_attr('axis', 1) + elif old_axis == 2 or old_axis == -2: + node.set_attr('axis', 2) # Not required, but left for clarity + else: + node.set_attr('axis', 3) - # Add transpose for output layer - if ( - node.get_attr("name") in model.outputs - and len(outshape) > 1 - and model.config.config['HLSConfig']['Model']['TransposeOutputs'] - ): - input = node.name - outshape = node.get_output_variable().shape - print(outshape) - if len(outshape) == 2: - attributes = {'perm': [1, 0]} - else: - attributes = {'perm': [2, 0, 1]} + # Adjust output shape + outdims = node.get_output_variable().dim_names + if len(outshape) == 2: + shape = [outshape[1], outshape[0]] + dims = [outdims[1], outdims[0]] + node.add_output_variable(shape, dims) + elif len(outshape) == 3: + shape = [outshape[1], outshape[2], outshape[0]] + dims = [outdims[1], outdims[2], outdims[0]] + node.add_output_variable(shape, dims) - transpose_node = model.make_node( - 'Transpose', f'transpose_ouput_for_{node.get_attr("name")}', attributes, [input] - ) - transpose_node.channels_last_converted = True + # Add transpose for output layer + if ( + node.get_attr("name") in model.outputs + and len(outshape) > 1 + and model.config.config['HLSConfig']['Model']['TransposeOutputs'] + ): + input = node.name + outshape = node.get_output_variable().shape + print(outshape) + if len(outshape) == 2: + attributes = {'perm': [1, 0]} + else: + attributes = {'perm': [2, 0, 1]} - model.insert_node(transpose_node) + transpose_node = model.make_node( + 'Transpose', f'transpose_ouput_for_{node.get_attr("name")}', attributes, [input] + ) + transpose_node.channels_last_converted = True + + model.insert_node(transpose_node) node.channels_last_converted = True return True From 1c06c6ba1874ce6261535e76c9bb443f26f56719 Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Tue, 15 Aug 2023 14:08:22 -0400 Subject: [PATCH 034/272] clean diff v2 --- .../passes/convert_to_channels_last.py | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/hls4ml/model/optimizer/passes/convert_to_channels_last.py b/hls4ml/model/optimizer/passes/convert_to_channels_last.py index abd3c4de92..9e5078a578 100644 --- a/hls4ml/model/optimizer/passes/convert_to_channels_last.py +++ b/hls4ml/model/optimizer/passes/convert_to_channels_last.py @@ -92,26 +92,26 @@ def transform(self, model, node): dims = [outdims[1], outdims[2], outdims[0]] node.add_output_variable(shape, dims) - # Add transpose for output layer - if ( - node.get_attr("name") in model.outputs - and len(outshape) > 1 - and model.config.config['HLSConfig']['Model']['TransposeOutputs'] - ): - input = node.name - outshape = node.get_output_variable().shape - print(outshape) - if len(outshape) == 2: - attributes = {'perm': [1, 0]} - else: - attributes = {'perm': [2, 0, 1]} + # Add transpose for output layer + if ( + node.get_attr("name") in model.outputs + and len(outshape) > 1 + and model.config.config['HLSConfig']['Model']['TransposeOutputs'] + ): + input = node.name + outshape = node.get_output_variable().shape + print(outshape) + if len(outshape) == 2: + attributes = {'perm': [1, 0]} + else: + attributes = {'perm': [2, 0, 1]} - 
transpose_node = model.make_node( - 'Transpose', f'transpose_ouput_for_{node.get_attr("name")}', attributes, [input] - ) - transpose_node.channels_last_converted = True + transpose_node = model.make_node( + 'Transpose', f'transpose_ouput_for_{node.get_attr("name")}', attributes, [input] + ) + transpose_node.channels_last_converted = True - model.insert_node(transpose_node) + model.insert_node(transpose_node) node.channels_last_converted = True return True From dde9c69b08307538ebb68d3c3f007286283aa21a Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Wed, 16 Aug 2023 15:00:12 -0400 Subject: [PATCH 035/272] getting close --- .../quartus/passes/recurrent_templates.py | 2 ++ .../vivado/passes/recurrent_templates.py | 2 ++ hls4ml/converters/pytorch/recurrent.py | 14 ++++++++--- hls4ml/model/layers.py | 9 ++++++-- .../firmware/nnet_utils/nnet_recurrent.h | 12 ++++++++-- .../vivado/nnet_utils/nnet_recurrent.h | 23 +++++++++++++++---- test/pytest/test_recurrent_pytorch.py | 10 ++++---- 7 files changed, 56 insertions(+), 16 deletions(-) diff --git a/hls4ml/backends/quartus/passes/recurrent_templates.py b/hls4ml/backends/quartus/passes/recurrent_templates.py index 2bf45351bb..3d74ce9b7f 100644 --- a/hls4ml/backends/quartus/passes/recurrent_templates.py +++ b/hls4ml/backends/quartus/passes/recurrent_templates.py @@ -66,6 +66,7 @@ using activation_recr = nnet::activation::{recurrent_activation}; static const unsigned reuse_factor = {reuse}; + static const unsigned pytorch_order = {pytorch}; static const bool store_weights_in_bram = false; }};\n''' @@ -92,6 +93,7 @@ def format(self, node): params['config_mult_h'] = f'config{node.index}_h_mult' params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), str(node.index) + '_act') params['act_recurrent_t'] = '{}_config{}'.format(node.get_attr('recurrent_activation'), str(node.index) + '_rec_act') + params['pytorch'] = 'true' if "pytorch" in node.attributes.keys() else 'false' gru_config = self.gru_template.format(**params) # Activation is on candidate hidden state, dimensionality (1, n_units) diff --git a/hls4ml/backends/vivado/passes/recurrent_templates.py b/hls4ml/backends/vivado/passes/recurrent_templates.py index aae806b35c..3c550a6dff 100644 --- a/hls4ml/backends/vivado/passes/recurrent_templates.py +++ b/hls4ml/backends/vivado/passes/recurrent_templates.py @@ -62,6 +62,7 @@ static const unsigned reuse_factor = {reuse}; static const bool store_weights_in_bram = false; static const bool use_static = {static}; + static const bool pytorch_order = {pytorch}; }};\n""" recr_function_template = 'nnet::{recr_type}_stack<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {wr}, {b}, {br});' @@ -97,6 +98,7 @@ def format(self, node): params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), node.index) params['strategy'] = node.get_attr('strategy') params['static'] = 'true' if node.attributes['static'] else 'false' + params['pytorch'] = 'true' if "pytorch" in node.attributes.keys() else 'false' params['recr_type'] = node.class_name.lower() params['RECR_TYPE'] = node.class_name diff --git a/hls4ml/converters/pytorch/recurrent.py b/hls4ml/converters/pytorch/recurrent.py index 5055e3087f..2607e3bdbf 100644 --- a/hls4ml/converters/pytorch/recurrent.py +++ b/hls4ml/converters/pytorch/recurrent.py @@ -1,5 +1,7 @@ import warnings +import numpy as np + from hls4ml.converters.pytorch_to_hls import get_weights_data, pytorch_handler rnn_layers = ['RNN', 'LSTM', 'GRU'] @@ -32,7 +34,7 @@ def parse_rnn_layer(operation, layer_name, input_names, 
input_shapes, node, clas layer['activation'] = "tanh" # GRU and LSTM are hard-coded to use tanh in pytorch if layer['class_name'] == 'GRU' or layer['class_name'] == 'LSTM': - layer['recurrent_activation'] = 'sigmoid' # GRU and LSTM are hard-coded to use tanh in pytorch + layer['recurrent_activation'] = 'sigmoid' # GRU and LSTM are hard-coded to use sigmoid in pytorch layer['time_major'] = not class_object.batch_first # TODO Should we handle time_major? @@ -41,7 +43,7 @@ def parse_rnn_layer(operation, layer_name, input_names, input_shapes, node, clas layer['n_timesteps'] = input_shapes[0][1] layer['n_in'] = input_shapes[0][2] - print(layer['n_in']) + layer['n_out'] = class_object.hidden_size if class_object.num_layers > 1: @@ -60,9 +62,15 @@ def parse_rnn_layer(operation, layer_name, input_names, input_shapes, node, clas layer['recurrent_bias_data'], ) = get_weights_data(data_reader, layer['name'], ['weight_ih_l0', 'weight_hh_l0', 'bias_ih_l0', 'bias_hh_l0']) + if class_object.bias is False: + layer['bias_data'] = np.zeros(layer['weight_data'].shape[0]) + layer['recurrent_bias_data'] = np.zeros(layer['recurrent_weight_data'].shape[0]) + if layer['class_name'] == 'GRU': layer['apply_reset_gate'] = 'after' # Might be true for pytorch? It's not a free parameter - output_shape = [[input_shapes[0][0], layer['n_timesteps'], layer['n_out']], [1, layer['n_out']]] + output_shape = [input_shapes[0][0], layer['n_out']] + + layer['pytorch'] = True # need to switch some behaviors to match pytorch implementations return layer, output_shape diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index d9da2cc741..0445cf602d 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -1005,6 +1005,8 @@ def initialize(self): # biases self.add_weights_variable(name='bias', var_name='b{index}') + if "pytorch" in self.attributes.keys(): + self.add_weights_variable(name='recurrent_bias', var_name='br{index}') class LSTM(Layer): @@ -1056,8 +1058,11 @@ def initialize(self): # biases self.add_weights_variable(name='bias', var_name='b{index}') - recurrent_bias = np.zeros(recurrent_weight.shape[1]) - self.add_weights_variable(name='recurrent_bias', var_name='br{index}', data=recurrent_bias) + if "pytorch" in self.attributes.keys(): + self.add_weights_variable(name='recurrent_bias', var_name='br{index}') + else: + recurrent_bias = np.zeros(recurrent_weight.shape[1]) + self.add_weights_variable(name='recurrent_bias', var_name='br{index}', data=recurrent_bias) class GRU(Layer): diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_recurrent.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_recurrent.h index 464c6d415b..b3ca1c32c7 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_recurrent.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_recurrent.h @@ -87,6 +87,7 @@ struct gru_config { // Resource reuse info static const unsigned io_type = io_parallel; static const unsigned reuse_factor = 1; + static const bool pytorch_order = false; static const bool store_weights_in_bram = false; // Activation @@ -133,7 +134,10 @@ void gru_cell(data_T x[CONFIG_T::n_in], res_T h[CONFIG_T::n_units], hls_register typename CONFIG_T::accum_t hadamard_r_h[CONFIG_T::n_units]; #pragma unroll recurrent_unroll_factor for (int i = 0; i < (CONFIG_T::n_units); i++) { - hadamard_r_h[i] = z_r_act[i + CONFIG_T::n_units] * mat_mul_h_wr[i + 2 * CONFIG_T::n_units]; + if (CONFIG_T::pytorch_order) + hadamard_r_h[i] = z_r_act[i] * mat_mul_h_wr[i + 2 * CONFIG_T::n_units]; + else + hadamard_r_h[i] = z_r_act[i 
+ CONFIG_T::n_units] * mat_mul_h_wr[i + 2 * CONFIG_T::n_units]; } // The candidate state; X * W_{hx} + hadmard(r(t), h_(t-1)) * W_{hh} + b_{h} @@ -152,7 +156,11 @@ void gru_cell(data_T x[CONFIG_T::n_in], res_T h[CONFIG_T::n_units], // Update state #pragma unroll recurrent_unroll_factor for (int i = 0; i < (CONFIG_T::n_units); i++) { - h[i] = static_cast(h_cand_act[i] * (1 - z_r_act[i]) + h[i] * z_r_act[i]); + if (CONFIG_T::pytorch_order) + h[i] = static_cast(h_cand_act[i] * (1 - z_r_act[i + CONFIG_T::n_units]) + + h[i] * z_r_act[i + CONFIG_T::n_units]); + else + h[i] = static_cast(h_cand_act[i] * (1 - z_r_act[i]) + h[i] * z_r_act[i]); } } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h b/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h index 6e8681482b..d3b96ba5fb 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h @@ -316,6 +316,7 @@ struct gru_config { static const unsigned reuse_factor = 1; static const bool store_weights_in_bram = false; static const bool use_static = true; + static const bool pytorch_order = false; static const unsigned n_zeros = 0; template using activation_recr = nnet::activation::relu; @@ -368,7 +369,10 @@ void gru(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_ // Hadamrd product of r(t) = inputacc_zr[2*n_state:n_state] and h(t-1) = h_newstate for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { #pragma HLS UNROLL - tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; + if (CONFIG_T::pytorch_order) + tmpres_state_h[iacc] = tmpres_zr[iacc] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; + else + tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; } // Assuming reset_after is false @@ -385,7 +389,11 @@ void gru(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_ // Mix the stat with the previous state for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { #pragma HLS UNROLL - h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_newstate[iacc] * tmpres_zr[iacc]); + if (CONFIG_T::pytorch_order) + h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc + (CONFIG_T::n_state)]) + + h_newstate[iacc] * tmpres_zr[iacc + (CONFIG_T::n_state)]); + else + h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_newstate[iacc] * tmpres_zr[iacc]); } } @@ -444,7 +452,10 @@ void gru_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[ // Hadamrd product of r(t) = inputacc_zr[2*n_state:n_state] and h(t-1) = h_newstate for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { #pragma HLS UNROLL - tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; + if (CONFIG_T::pytorch_order) + tmpres_state_h[iacc] = tmpres_zr[iacc] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; + else + tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; } // Assuming reset_after is false @@ -461,7 +472,11 @@ void gru_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[ // Mix the stat with the previous state for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { #pragma HLS UNROLL - h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_state[iacc] * tmpres_zr[iacc]); + if (CONFIG_T::pytorch_order) + h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - 
tmpres_zr[iacc + (CONFIG_T::n_state)]) + + h_state[iacc] * tmpres_zr[iacc + (CONFIG_T::n_state)]); + else + h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_state[iacc] * tmpres_zr[iacc]); h_newstate[iacc] = h_state[iacc]; } } diff --git a/test/pytest/test_recurrent_pytorch.py b/test/pytest/test_recurrent_pytorch.py index feff906a6d..0335a29bdf 100644 --- a/test/pytest/test_recurrent_pytorch.py +++ b/test/pytest/test_recurrent_pytorch.py @@ -14,7 +14,7 @@ class GRUNet(nn.Module): def __init__(self): super().__init__() - self.rnn = nn.GRU(10, 20, num_layers=1, batch_first=True) + self.rnn = nn.GRU(10, 20, num_layers=1, batch_first=True, bias=True) def forward(self, x, h0): output, hnn = self.rnn(x, h0) @@ -49,7 +49,7 @@ def test_gru(backend, io_type): class LSTM(nn.Module): def __init__(self): super().__init__() - self.rnn = nn.LSTM(10, 20, num_layers=1, batch_first=True) + self.rnn = nn.LSTM(10, 20, num_layers=1, batch_first=True, bias=True) def forward(self, x, h0, c0): output, (_, _) = self.rnn(x, (h0, c0)) @@ -93,15 +93,15 @@ def test_lstm(backend, io_type): class RNN(nn.Module): def __init__(self): super().__init__() - self.rnn = nn.RNN(10, 20, num_layers=1, batch_first=True) + self.rnn = nn.RNN(10, 20, num_layers=1, batch_first=True, bias=True) def forward(self, x, h0): output, _ = self.rnn(x, h0) return output -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) -@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +@pytest.mark.parametrize('backend', ['Quartus']) +@pytest.mark.parametrize('io_type', ['io_parallel']) def test_rnn(backend, io_type): if not (backend == "Quartus" and io_type == "io_stream"): model = RNN() From d60dadde62ecfcad5218cff54362efa9676eb97f Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Wed, 16 Aug 2023 15:21:08 -0400 Subject: [PATCH 036/272] fix RNN biases --- .../quartus/passes/recurrent_templates.py | 9 +- .../firmware/nnet_utils/nnet_recurrent.h | 125 ++++++++++++++++++ 2 files changed, 133 insertions(+), 1 deletion(-) diff --git a/hls4ml/backends/quartus/passes/recurrent_templates.py b/hls4ml/backends/quartus/passes/recurrent_templates.py index 3d74ce9b7f..e4faceaf5a 100644 --- a/hls4ml/backends/quartus/passes/recurrent_templates.py +++ b/hls4ml/backends/quartus/passes/recurrent_templates.py @@ -258,6 +258,9 @@ def format(self, node): }};\n""" simple_rnn_function_template = 'nnet::simple_rnn<{input_t}, {output_t}, {config}>({input}, {output}, {weights});' +simple_rnn_pytorch_function_template = ( + 'nnet::simple_rnn_pytorch<{input_t}, {output_t}, {config}>({input}, {output}, {weights});' +) class SimpleRNNConfigTemplate(LayerConfigTemplate): @@ -303,5 +306,9 @@ def __init__(self): def format(self, node): params = self._default_function_params(node) - params['weights'] = 'w{0}, wr{0}, b{0}'.format(str(node.index)) + if "pytorch" in node.attributes.keys(): + self.template = simple_rnn_pytorch_function_template + params['weights'] = 'w{0}, wr{0}, b{0}, br{0}'.format(str(node.index)) + else: + params['weights'] = 'w{0}, wr{0}, b{0}'.format(str(node.index)) return self.template.format(**params) diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_recurrent.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_recurrent.h index b3ca1c32c7..dbcf8e41ab 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_recurrent.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_recurrent.h @@ -323,6 +323,131 @@ void simple_rnn(data_T data[CONFIG_T::n_timesteps * CONFIG_T::n_in], res_T res[C } 
} } +//---------------------- +// SimpleRNN with pytorch biases +//---------------------- + +struct simpleRNN_pytorch_config { + // Internal data type definitions + typedef float weight_t; + typedef float bias_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 1; + static const unsigned n_out = 1; + static const unsigned n_outputs = 1; + static const unsigned n_timesteps = 1; + static const bool return_sequences = false; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + + // Activation + template using activation_recr = nnet::activation::relu; + + template using activation = nnet::activation::relu; +}; + +template +void simple_rnn_pytorch_cell(data_T inputs[CONFIG_T::n_in], res_T hidden_state[CONFIG_T::n_out], + res_T hidden_state_o[CONFIG_T::n_out], + const typename CONFIG_T::weight_t kernel[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t rec_kernel[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::bias_t bias[CONFIG_T::n_out], + const typename CONFIG_T::bias_t rec_bias[CONFIG_T::n_out]) { + // Weight multiplication + typename CONFIG_T::accum_t afterW[CONFIG_T::n_out] hls_register; + multiply_W( + inputs, afterW, kernel); + + // Bias addition + typename CONFIG_T::accum_t afterBias[CONFIG_T::n_out] hls_register; + add_bias( + afterW, afterBias, bias); + + // Hidden state + typename CONFIG_T::accum_t hiddenCand[CONFIG_T::n_out] hls_register; + multiply_U(hidden_state, hiddenCand, + rec_kernel); + + // Hidden state bias addition + typename CONFIG_T::accum_t hiddenBias[CONFIG_T::n_out] hls_register; + add_bias( + hiddenCand, hiddenBias, rec_bias); + + // Vector addition + typename CONFIG_T::accum_t afterAdd[CONFIG_T::n_out]; + add_vectors(afterBias, hiddenBias, afterAdd); + + // Activation + CONFIG_T::template activation::activation( + afterAdd, hidden_state_o); +} + +template +void simple_rnn_pytorch(data_T data[CONFIG_T::n_timesteps * CONFIG_T::n_in], + res_T res[CONFIG_T::n_outputs * CONFIG_T::n_out], + const typename CONFIG_T::weight_t kernel[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t rec_kernel[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::bias_t bias[CONFIG_T::n_out], + const typename CONFIG_T::bias_t rec_bias[CONFIG_T::n_out]) { + res_T hidden_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] hls_register; + res_T hidden_state_temp[CONFIG_T::n_out] hls_register; + res_T h[CONFIG_T::n_out] hls_register; + data_T in[CONFIG_T::n_in] hls_register; + +// Set initially hidden state (output) to zero +INIT_LOOP: + #pragma unroll + for (int x = 0; x < CONFIG_T::n_out; x++) { + hidden_state[x][0] = 0; + } + + #pragma disable_loop_pipelining + for (int i = 0; i < CONFIG_T::n_timesteps; i++) { + + // Data at current time step + #pragma unroll + for (int x = 0; x < CONFIG_T::n_in; x++) { + in[x] = data[x + i * CONFIG_T::n_in]; + } + + // Hidden state at current time step + #pragma unroll + for (int x = 0; x < CONFIG_T::n_out; x++) { + hidden_state_temp[x] = hidden_state[x][i]; + } + + // Do SimpleRNN + simple_rnn_pytorch_cell(in, hidden_state_temp, h, kernel, rec_kernel, bias, rec_bias); + + // Write result + #pragma unroll + for (int x = 0; x < CONFIG_T::n_out; x++) { + hidden_state[x][i + 1] = h[x]; + } + } + + if (CONFIG_T::return_sequences == 0) { + // Output when return_sequences is false + #pragma unroll + for (int x = 0; x < CONFIG_T::n_out; x++) { + res[x] = 
hidden_state[x][CONFIG_T::n_timesteps]; + } + } else { + // Output when return_sequences is true + #pragma unroll + for (int x = 0; x < CONFIG_T::n_timesteps; x++) { + #pragma unroll + for (int h = 0; h < CONFIG_T::n_out; h++) { + res[x * CONFIG_T::n_out + h] = hidden_state[h][x + 1]; + } + } + } +} //---------------------- // LSTM From d247057310668df97a59b8f64712963448ecd730 Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Thu, 17 Aug 2023 10:13:06 -0400 Subject: [PATCH 037/272] fix pytests --- test/pytest/test_recurrent_pytorch.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/pytest/test_recurrent_pytorch.py b/test/pytest/test_recurrent_pytorch.py index 0335a29bdf..c1672c73b9 100644 --- a/test/pytest/test_recurrent_pytorch.py +++ b/test/pytest/test_recurrent_pytorch.py @@ -32,11 +32,11 @@ def test_gru(backend, io_type): pytorch_prediction = model(torch.Tensor(X_input), torch.Tensor(h0)).detach().numpy() - config = config_from_pytorch_model(model, channels_last_conversion="internal", transpose_outputs=False) + config = config_from_pytorch_model(model, channels_last_conversion="off", transpose_outputs=False) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_gru_{backend}_{io_type}') hls_model = convert_from_pytorch_model( - model, [(None, 10, 1), (None, 20, 1)], hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type + model, [(None, 1, 10), (None, 1, 20)], hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type ) hls_model.compile() @@ -69,12 +69,12 @@ def test_lstm(backend, io_type): pytorch_prediction = model(torch.Tensor(X_input), torch.Tensor(h0), torch.tensor(c0)).detach().numpy() - config = config_from_pytorch_model(model, channels_last_conversion="internal", transpose_outputs=False) + config = config_from_pytorch_model(model, channels_last_conversion="off", transpose_outputs=False) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_lstm_{backend}_{io_type}') hls_model = convert_from_pytorch_model( model, - [(None, 10, 1), (None, 20, 1), (None, 20, 1)], + [(None, 1, 10), (None, 1, 20), (None, 1, 20)], hls_config=config, output_dir=output_dir, backend=backend, @@ -112,11 +112,11 @@ def test_rnn(backend, io_type): pytorch_prediction = model(torch.Tensor(X_input), torch.Tensor(h0)).detach().numpy() - config = config_from_pytorch_model(model, channels_last_conversion="internal", transpose_outputs=False) + config = config_from_pytorch_model(model, channels_last_conversion="off", transpose_outputs=False) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_rnn_{backend}_{io_type}') hls_model = convert_from_pytorch_model( - model, [(None, 10, 1), (None, 20, 1)], hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type + model, [(None, 1, 10), (None, 1, 20)], hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type ) hls_model.compile() From ed3eaa442045167d621534b47758d311822cf8bd Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Thu, 17 Aug 2023 10:26:23 -0400 Subject: [PATCH 038/272] precommit --- hls4ml/converters/pytorch_to_hls.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/hls4ml/converters/pytorch_to_hls.py b/hls4ml/converters/pytorch_to_hls.py index 840b613a3e..f5d90d4b9e 100644 --- a/hls4ml/converters/pytorch_to_hls.py +++ b/hls4ml/converters/pytorch_to_hls.py @@ -17,13 +17,11 @@ def __init__(self, config): def get_weights_data(self, layer_name, var_name): data = None - # Workaround for naming schme in nn.Sequential, # have to 
remove the prefix we previously had to add to make sure the tensors are found if 'layer_' in layer_name: layer_name = layer_name.split('layer_')[-1] - # if a layer is reused in the model, torch.FX will append a "_n" for the n-th use # have to snap that off to find the tensors if layer_name.split('_')[-1].isdigit() and len(layer_name.split('_')) > 1: From c2ca3c6d6af39beec3d5119a15a1cab579093065 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 17 Aug 2023 14:46:14 +0000 Subject: [PATCH 039/272] [pre-commit.ci] auto fixes from pre-commit hooks --- hls4ml/converters/pytorch_to_hls.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/hls4ml/converters/pytorch_to_hls.py b/hls4ml/converters/pytorch_to_hls.py index 840b613a3e..f5d90d4b9e 100644 --- a/hls4ml/converters/pytorch_to_hls.py +++ b/hls4ml/converters/pytorch_to_hls.py @@ -17,13 +17,11 @@ def __init__(self, config): def get_weights_data(self, layer_name, var_name): data = None - # Workaround for naming schme in nn.Sequential, # have to remove the prefix we previously had to add to make sure the tensors are found if 'layer_' in layer_name: layer_name = layer_name.split('layer_')[-1] - # if a layer is reused in the model, torch.FX will append a "_n" for the n-th use # have to snap that off to find the tensors if layer_name.split('_')[-1].isdigit() and len(layer_name.split('_')) > 1: From a01080dc210ef23640b766f0b9a24090ac540f58 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 7 Oct 2023 22:09:14 -0700 Subject: [PATCH 040/272] use code gen --- hls4ml/backends/fpga/fpga_backend.py | 57 +++ hls4ml/backends/fpga/passes/codegen.py | 22 ++ .../vivado/passes/convolution_templates.py | 7 + .../vivado/nnet_utils/nnet_code_gen.h | 90 +++++ .../templates/vivado/nnet_utils/nnet_conv1d.h | 4 +- .../vivado/nnet_utils/nnet_conv1d_latency.h | 351 ------------------ test/pytest/test_pointwiseconv.py | 7 +- 7 files changed, 183 insertions(+), 355 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 8cfaec8b3f..349a5ddbc8 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -860,6 +860,63 @@ def generate_conv2d_line_buffer_fn( return generated_code + def generate_pointwise_conv1d_fn(self, layer_idx, reuse_factor=1): + """Generate a C++ function for a pointwise convolution layer. + + Args: + layer_idx (int): Index of layer ('index' attribute). + reuse_factor (int): Number of partitions to divide the input into. 
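The generator below follows the string-template approach already used elsewhere in this backend: it emits a C++ class pointwise_conv_{index} that splits the flattened input into reuse_factor partitions and issues one fully unrolled pointwise_conv_1d_latency_cl call per partition; the Conv1D config template then binds that class via its pointwise_conv alias when reuse_factor > 1. A usage sketch (the layer index 42 and reuse factor 4 are arbitrary illustrative values):

from hls4ml.backends import get_backend

backend = get_backend('Vivado')
cpp = backend.generate_pointwise_conv1d_fn(42, reuse_factor=4)
print(cpp)  # class pointwise_conv_42 containing four pointwise_conv_1d_latency_cl calls
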
+ + Returns: + str: Generated C++ function + """ + + generated_code = ( + "template\n" + "class pointwise_conv_{index} : public PointwiseConv1D {{\n" + " public:\n" + " static void pointwise_conv(\n" + " data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n" + " res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n" + " typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n" + " typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n" + " data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n" + " #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n" + " res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n" + " #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n" + "RFInputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n" + " #pragma HLS UNROLL\n" + " InnerInputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n" + " #pragma HLS UNROLL\n" + " data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];" + "\n" + " }}\n" + " }}\n\n" + ).format(index=layer_idx) + for i in range(reuse_factor): + generated_code += ( + f" pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n" + ) + + generated_code += ( + "\n" + "RFOutputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n" + " #pragma HLS UNROLL\n" + " InnerOutputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n" + " #pragma HLS UNROLL\n" + " res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n" + " }\n" + " }\n" + "}\n" + "};\n" + ) + + return generated_code + @model_optimizer() def write_hls(self, model): self.writer.write_hls(model) diff --git a/hls4ml/backends/fpga/passes/codegen.py b/hls4ml/backends/fpga/passes/codegen.py index f1f1080996..6d7243dd8b 100644 --- a/hls4ml/backends/fpga/passes/codegen.py +++ b/hls4ml/backends/fpga/passes/codegen.py @@ -49,3 +49,25 @@ def _generate_im2col_2d(self, node): ) node.set_attr('line_buffer_codegen', Source(code_str)) + + +class GeneratePointwiseConv1D(OptimizerPass): + '''Generates code for pointwise 1D convolution''' + + def match(self, node): + return isinstance(node, Conv1D) and node.model.config.get_config_value('IOType') == 'io_parallel' + + def transform(self, model, node): + node_class = node.__class__.__name__ + if '1D' in node_class: + self._generate_pointwise_conv1d(node) + else: + raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})') + + def _generate_pointwise_conv1d(self, node): + code_str = node.model.config.backend.generate_pointwise_conv1d_fn( + node.get_attr('index'), + node.get_attr('reuse_factor'), + ) + + node.set_attr('pointwise_conv1d_codegen', Source(code_str)) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index 874349aab3..a4fbdd405f 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -56,6 +56,8 @@ typedef {config_t} mult_config; template using scale_index = nnet::{scale_index_type}; + template + using pointwise_conv = nnet::{pointwise_fn}; }}; const ap_uint config{index}::pixels[] = {{{instructions}}};\n""" @@ -89,6 +91,11 @@ def format(self, node): else: params['fill_fn'] = 
'FillConv1DBuffer' + if node.get_attr('filt_width') == 1 and node.model.config.get_config_value('IOType') == 'io_parallel': + params['pointwise_fn'] = f'pointwise_conv_{node.index}' + else: + params['pointwise_fn'] = 'PointwiseConv1D' + conv_config = self.template.format(**params) mult_params = self._default_config_params(node) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h index e4db43682e..32fa7321c5 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h @@ -25,6 +25,96 @@ template class FillConv2DBuffer { } }; +template class PointwiseConv1D { + public: + static void pointwise_conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // To be implemented in subclasses + } +}; + +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 + #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + int multiplier_limit = + ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / + float(CONFIG_T::reuse_factor)); +#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = data[index_data] * weights[index_weight]; + } + } // end channel loop + } // end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / 
CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + } // end channel loop + } // end filter loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); + } + } +} + // hls4ml insert code } // namespace nnet diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index 0f2e89ac8f..7cceabfe1b 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -56,8 +56,8 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], if (CONFIG_T::strategy == nnet::latency) { if (CONFIG_T::implementation == conv_implementation::pointwise) { // Use pointwise unrolled implementation - if (CONFIG_T::reuse_factor > 1 && CONFIG_T::reuse_factor <= 120) { - pointwise_conv_1d_latency_cl_split_by_rf(data, res, weights, biases); + if (CONFIG_T::reuse_factor > 1) { + CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); } else { assert(CONFIG_T::reuse_factor == 1); pointwise_conv_1d_latency_cl(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index aabc869823..0d9afb10cb 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -84,356 +84,5 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } -template -void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::filt_width == 1); - - typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; - typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; - - #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 - #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 - - // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases - #pragma HLS function_instantiate variable=weights,biases - - // Parallel mode - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 - #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 - - // Limit multipliers to control parallelization - int multiplier_limit = - ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / - float(CONFIG_T::reuse_factor)); -#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit - -// Convolve, saving all multiplication results to accumulate later -ConvOut: - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - ConvFilt: - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - ConvChan: - for (int 
cc = 0; cc < CONFIG_T::n_chan; cc++) { - #pragma HLS UNROLL - int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; - int index_weight = cc * CONFIG_T::n_filt + ff; - int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; - - if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || - (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { - mult[index_mult] = 0; - } else { - mult[index_mult] = data[index_data] * weights[index_weight]; - } - } // end channel loop - } // end filter loop - } // end output loop - - // Initialize accumulator with input biases - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - #pragma HLS UNROLL - acc[ii][ff] = biases[ff]; - } - } - -// Accumulate multiplication result -AccumOut: - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - AccumFilt: - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - // Do "dot product" sum within filter and sum over channels - AccumChan: - for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { - int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; - acc[ii][ff] += mult[index_mult]; - } // end channel loop - } // end filter loop - } // end output loop - - // Cast to "res_t" type - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - #pragma HLS UNROLL - res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); - } - } -} - -template -void pointwise_conv_1d_latency_cl_split_by_rf(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - - data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; - #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0 - res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor]; - #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0 - -RFInputLoop: - for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { - #pragma HLS UNROLL - InnerInputLoop: - for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) { - #pragma HLS UNROLL - data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii]; - } - } - - pointwise_conv_1d_latency_cl(data_tmp[0], res_tmp[0], weights, biases); - pointwise_conv_1d_latency_cl(data_tmp[1], res_tmp[1], weights, biases); - if (CONFIG_T::reuse_factor > 2) - pointwise_conv_1d_latency_cl(data_tmp[2], res_tmp[2], weights, biases); - if (CONFIG_T::reuse_factor > 3) - pointwise_conv_1d_latency_cl(data_tmp[3], res_tmp[3], weights, biases); - if (CONFIG_T::reuse_factor > 4) - pointwise_conv_1d_latency_cl(data_tmp[4], res_tmp[4], weights, biases); - if (CONFIG_T::reuse_factor > 5) - pointwise_conv_1d_latency_cl(data_tmp[5], res_tmp[5], weights, biases); - if (CONFIG_T::reuse_factor > 6) - pointwise_conv_1d_latency_cl(data_tmp[6], res_tmp[6], weights, biases); - if (CONFIG_T::reuse_factor > 7) - pointwise_conv_1d_latency_cl(data_tmp[7], res_tmp[7], weights, biases); - if (CONFIG_T::reuse_factor > 8) - pointwise_conv_1d_latency_cl(data_tmp[8], res_tmp[8], weights, biases); - if (CONFIG_T::reuse_factor > 9) - pointwise_conv_1d_latency_cl(data_tmp[9], 
res_tmp[9], weights, biases); - if (CONFIG_T::reuse_factor > 10) - pointwise_conv_1d_latency_cl(data_tmp[10], res_tmp[10], weights, biases); - if (CONFIG_T::reuse_factor > 11) - pointwise_conv_1d_latency_cl(data_tmp[11], res_tmp[11], weights, biases); - if (CONFIG_T::reuse_factor > 12) - pointwise_conv_1d_latency_cl(data_tmp[12], res_tmp[12], weights, biases); - if (CONFIG_T::reuse_factor > 13) - pointwise_conv_1d_latency_cl(data_tmp[13], res_tmp[13], weights, biases); - if (CONFIG_T::reuse_factor > 14) - pointwise_conv_1d_latency_cl(data_tmp[14], res_tmp[14], weights, biases); - if (CONFIG_T::reuse_factor > 15) - pointwise_conv_1d_latency_cl(data_tmp[15], res_tmp[15], weights, biases); - if (CONFIG_T::reuse_factor > 16) - pointwise_conv_1d_latency_cl(data_tmp[16], res_tmp[16], weights, biases); - if (CONFIG_T::reuse_factor > 17) - pointwise_conv_1d_latency_cl(data_tmp[17], res_tmp[17], weights, biases); - if (CONFIG_T::reuse_factor > 18) - pointwise_conv_1d_latency_cl(data_tmp[18], res_tmp[18], weights, biases); - if (CONFIG_T::reuse_factor > 19) - pointwise_conv_1d_latency_cl(data_tmp[19], res_tmp[19], weights, biases); - if (CONFIG_T::reuse_factor > 20) - pointwise_conv_1d_latency_cl(data_tmp[20], res_tmp[20], weights, biases); - if (CONFIG_T::reuse_factor > 21) - pointwise_conv_1d_latency_cl(data_tmp[21], res_tmp[21], weights, biases); - if (CONFIG_T::reuse_factor > 22) - pointwise_conv_1d_latency_cl(data_tmp[22], res_tmp[22], weights, biases); - if (CONFIG_T::reuse_factor > 23) - pointwise_conv_1d_latency_cl(data_tmp[23], res_tmp[23], weights, biases); - if (CONFIG_T::reuse_factor > 24) - pointwise_conv_1d_latency_cl(data_tmp[24], res_tmp[24], weights, biases); - if (CONFIG_T::reuse_factor > 25) - pointwise_conv_1d_latency_cl(data_tmp[25], res_tmp[25], weights, biases); - if (CONFIG_T::reuse_factor > 26) - pointwise_conv_1d_latency_cl(data_tmp[26], res_tmp[26], weights, biases); - if (CONFIG_T::reuse_factor > 27) - pointwise_conv_1d_latency_cl(data_tmp[27], res_tmp[27], weights, biases); - if (CONFIG_T::reuse_factor > 28) - pointwise_conv_1d_latency_cl(data_tmp[28], res_tmp[28], weights, biases); - if (CONFIG_T::reuse_factor > 29) - pointwise_conv_1d_latency_cl(data_tmp[29], res_tmp[29], weights, biases); - if (CONFIG_T::reuse_factor > 30) - pointwise_conv_1d_latency_cl(data_tmp[30], res_tmp[30], weights, biases); - if (CONFIG_T::reuse_factor > 31) - pointwise_conv_1d_latency_cl(data_tmp[31], res_tmp[31], weights, biases); - if (CONFIG_T::reuse_factor > 32) - pointwise_conv_1d_latency_cl(data_tmp[32], res_tmp[32], weights, biases); - if (CONFIG_T::reuse_factor > 33) - pointwise_conv_1d_latency_cl(data_tmp[33], res_tmp[33], weights, biases); - if (CONFIG_T::reuse_factor > 34) - pointwise_conv_1d_latency_cl(data_tmp[34], res_tmp[34], weights, biases); - if (CONFIG_T::reuse_factor > 35) - pointwise_conv_1d_latency_cl(data_tmp[35], res_tmp[35], weights, biases); - if (CONFIG_T::reuse_factor > 36) - pointwise_conv_1d_latency_cl(data_tmp[36], res_tmp[36], weights, biases); - if (CONFIG_T::reuse_factor > 37) - pointwise_conv_1d_latency_cl(data_tmp[37], res_tmp[37], weights, biases); - if (CONFIG_T::reuse_factor > 38) - pointwise_conv_1d_latency_cl(data_tmp[38], res_tmp[38], weights, biases); - if (CONFIG_T::reuse_factor > 39) - pointwise_conv_1d_latency_cl(data_tmp[39], res_tmp[39], weights, biases); - if (CONFIG_T::reuse_factor > 40) - pointwise_conv_1d_latency_cl(data_tmp[40], res_tmp[40], weights, biases); - if (CONFIG_T::reuse_factor > 41) - pointwise_conv_1d_latency_cl(data_tmp[41], 
res_tmp[41], weights, biases); - if (CONFIG_T::reuse_factor > 42) - pointwise_conv_1d_latency_cl(data_tmp[42], res_tmp[42], weights, biases); - if (CONFIG_T::reuse_factor > 43) - pointwise_conv_1d_latency_cl(data_tmp[43], res_tmp[43], weights, biases); - if (CONFIG_T::reuse_factor > 44) - pointwise_conv_1d_latency_cl(data_tmp[44], res_tmp[44], weights, biases); - if (CONFIG_T::reuse_factor > 45) - pointwise_conv_1d_latency_cl(data_tmp[45], res_tmp[45], weights, biases); - if (CONFIG_T::reuse_factor > 46) - pointwise_conv_1d_latency_cl(data_tmp[46], res_tmp[45], weights, biases); - if (CONFIG_T::reuse_factor > 47) - pointwise_conv_1d_latency_cl(data_tmp[47], res_tmp[47], weights, biases); - if (CONFIG_T::reuse_factor > 48) - pointwise_conv_1d_latency_cl(data_tmp[48], res_tmp[48], weights, biases); - if (CONFIG_T::reuse_factor > 49) - pointwise_conv_1d_latency_cl(data_tmp[49], res_tmp[49], weights, biases); - if (CONFIG_T::reuse_factor > 50) - pointwise_conv_1d_latency_cl(data_tmp[50], res_tmp[50], weights, biases); - if (CONFIG_T::reuse_factor > 51) - pointwise_conv_1d_latency_cl(data_tmp[51], res_tmp[51], weights, biases); - if (CONFIG_T::reuse_factor > 52) - pointwise_conv_1d_latency_cl(data_tmp[52], res_tmp[52], weights, biases); - if (CONFIG_T::reuse_factor > 53) - pointwise_conv_1d_latency_cl(data_tmp[53], res_tmp[53], weights, biases); - if (CONFIG_T::reuse_factor > 54) - pointwise_conv_1d_latency_cl(data_tmp[54], res_tmp[54], weights, biases); - if (CONFIG_T::reuse_factor > 55) - pointwise_conv_1d_latency_cl(data_tmp[55], res_tmp[55], weights, biases); - if (CONFIG_T::reuse_factor > 56) - pointwise_conv_1d_latency_cl(data_tmp[56], res_tmp[55], weights, biases); - if (CONFIG_T::reuse_factor > 57) - pointwise_conv_1d_latency_cl(data_tmp[57], res_tmp[57], weights, biases); - if (CONFIG_T::reuse_factor > 58) - pointwise_conv_1d_latency_cl(data_tmp[58], res_tmp[58], weights, biases); - if (CONFIG_T::reuse_factor > 59) - pointwise_conv_1d_latency_cl(data_tmp[59], res_tmp[59], weights, biases); - if (CONFIG_T::reuse_factor > 60) - pointwise_conv_1d_latency_cl(data_tmp[60], res_tmp[60], weights, biases); - if (CONFIG_T::reuse_factor > 61) - pointwise_conv_1d_latency_cl(data_tmp[61], res_tmp[61], weights, biases); - if (CONFIG_T::reuse_factor > 62) - pointwise_conv_1d_latency_cl(data_tmp[62], res_tmp[62], weights, biases); - if (CONFIG_T::reuse_factor > 63) - pointwise_conv_1d_latency_cl(data_tmp[63], res_tmp[63], weights, biases); - if (CONFIG_T::reuse_factor > 64) - pointwise_conv_1d_latency_cl(data_tmp[64], res_tmp[64], weights, biases); - if (CONFIG_T::reuse_factor > 65) - pointwise_conv_1d_latency_cl(data_tmp[65], res_tmp[65], weights, biases); - if (CONFIG_T::reuse_factor > 66) - pointwise_conv_1d_latency_cl(data_tmp[66], res_tmp[66], weights, biases); - if (CONFIG_T::reuse_factor > 67) - pointwise_conv_1d_latency_cl(data_tmp[67], res_tmp[67], weights, biases); - if (CONFIG_T::reuse_factor > 68) - pointwise_conv_1d_latency_cl(data_tmp[68], res_tmp[68], weights, biases); - if (CONFIG_T::reuse_factor > 69) - pointwise_conv_1d_latency_cl(data_tmp[69], res_tmp[69], weights, biases); - if (CONFIG_T::reuse_factor > 70) - pointwise_conv_1d_latency_cl(data_tmp[70], res_tmp[70], weights, biases); - if (CONFIG_T::reuse_factor > 71) - pointwise_conv_1d_latency_cl(data_tmp[71], res_tmp[71], weights, biases); - if (CONFIG_T::reuse_factor > 72) - pointwise_conv_1d_latency_cl(data_tmp[72], res_tmp[72], weights, biases); - if (CONFIG_T::reuse_factor > 73) - pointwise_conv_1d_latency_cl(data_tmp[73], 
res_tmp[73], weights, biases); - if (CONFIG_T::reuse_factor > 74) - pointwise_conv_1d_latency_cl(data_tmp[74], res_tmp[74], weights, biases); - if (CONFIG_T::reuse_factor > 75) - pointwise_conv_1d_latency_cl(data_tmp[75], res_tmp[75], weights, biases); - if (CONFIG_T::reuse_factor > 76) - pointwise_conv_1d_latency_cl(data_tmp[76], res_tmp[76], weights, biases); - if (CONFIG_T::reuse_factor > 77) - pointwise_conv_1d_latency_cl(data_tmp[77], res_tmp[77], weights, biases); - if (CONFIG_T::reuse_factor > 78) - pointwise_conv_1d_latency_cl(data_tmp[78], res_tmp[78], weights, biases); - if (CONFIG_T::reuse_factor > 79) - pointwise_conv_1d_latency_cl(data_tmp[79], res_tmp[79], weights, biases); - if (CONFIG_T::reuse_factor > 80) - pointwise_conv_1d_latency_cl(data_tmp[80], res_tmp[80], weights, biases); - if (CONFIG_T::reuse_factor > 81) - pointwise_conv_1d_latency_cl(data_tmp[81], res_tmp[81], weights, biases); - if (CONFIG_T::reuse_factor > 82) - pointwise_conv_1d_latency_cl(data_tmp[82], res_tmp[82], weights, biases); - if (CONFIG_T::reuse_factor > 83) - pointwise_conv_1d_latency_cl(data_tmp[83], res_tmp[83], weights, biases); - if (CONFIG_T::reuse_factor > 84) - pointwise_conv_1d_latency_cl(data_tmp[84], res_tmp[84], weights, biases); - if (CONFIG_T::reuse_factor > 85) - pointwise_conv_1d_latency_cl(data_tmp[85], res_tmp[85], weights, biases); - if (CONFIG_T::reuse_factor > 86) - pointwise_conv_1d_latency_cl(data_tmp[86], res_tmp[86], weights, biases); - if (CONFIG_T::reuse_factor > 87) - pointwise_conv_1d_latency_cl(data_tmp[87], res_tmp[87], weights, biases); - if (CONFIG_T::reuse_factor > 88) - pointwise_conv_1d_latency_cl(data_tmp[88], res_tmp[88], weights, biases); - if (CONFIG_T::reuse_factor > 89) - pointwise_conv_1d_latency_cl(data_tmp[89], res_tmp[89], weights, biases); - if (CONFIG_T::reuse_factor > 90) - pointwise_conv_1d_latency_cl(data_tmp[90], res_tmp[90], weights, biases); - if (CONFIG_T::reuse_factor > 91) - pointwise_conv_1d_latency_cl(data_tmp[91], res_tmp[91], weights, biases); - if (CONFIG_T::reuse_factor > 92) - pointwise_conv_1d_latency_cl(data_tmp[92], res_tmp[92], weights, biases); - if (CONFIG_T::reuse_factor > 93) - pointwise_conv_1d_latency_cl(data_tmp[93], res_tmp[93], weights, biases); - if (CONFIG_T::reuse_factor > 94) - pointwise_conv_1d_latency_cl(data_tmp[94], res_tmp[94], weights, biases); - if (CONFIG_T::reuse_factor > 95) - pointwise_conv_1d_latency_cl(data_tmp[95], res_tmp[95], weights, biases); - if (CONFIG_T::reuse_factor > 96) - pointwise_conv_1d_latency_cl(data_tmp[96], res_tmp[96], weights, biases); - if (CONFIG_T::reuse_factor > 97) - pointwise_conv_1d_latency_cl(data_tmp[97], res_tmp[97], weights, biases); - if (CONFIG_T::reuse_factor > 98) - pointwise_conv_1d_latency_cl(data_tmp[98], res_tmp[98], weights, biases); - if (CONFIG_T::reuse_factor > 99) - pointwise_conv_1d_latency_cl(data_tmp[99], res_tmp[99], weights, biases); - if (CONFIG_T::reuse_factor > 100) - pointwise_conv_1d_latency_cl(data_tmp[100], res_tmp[100], weights, biases); - if (CONFIG_T::reuse_factor > 101) - pointwise_conv_1d_latency_cl(data_tmp[101], res_tmp[101], weights, biases); - if (CONFIG_T::reuse_factor > 102) - pointwise_conv_1d_latency_cl(data_tmp[102], res_tmp[102], weights, biases); - if (CONFIG_T::reuse_factor > 103) - pointwise_conv_1d_latency_cl(data_tmp[103], res_tmp[103], weights, biases); - if (CONFIG_T::reuse_factor > 104) - pointwise_conv_1d_latency_cl(data_tmp[104], res_tmp[104], weights, biases); - if (CONFIG_T::reuse_factor > 105) - 
pointwise_conv_1d_latency_cl(data_tmp[105], res_tmp[105], weights, biases); - if (CONFIG_T::reuse_factor > 106) - pointwise_conv_1d_latency_cl(data_tmp[106], res_tmp[106], weights, biases); - if (CONFIG_T::reuse_factor > 107) - pointwise_conv_1d_latency_cl(data_tmp[107], res_tmp[107], weights, biases); - if (CONFIG_T::reuse_factor > 108) - pointwise_conv_1d_latency_cl(data_tmp[108], res_tmp[108], weights, biases); - if (CONFIG_T::reuse_factor > 109) - pointwise_conv_1d_latency_cl(data_tmp[109], res_tmp[109], weights, biases); - if (CONFIG_T::reuse_factor > 110) - pointwise_conv_1d_latency_cl(data_tmp[110], res_tmp[110], weights, biases); - if (CONFIG_T::reuse_factor > 111) - pointwise_conv_1d_latency_cl(data_tmp[111], res_tmp[111], weights, biases); - if (CONFIG_T::reuse_factor > 112) - pointwise_conv_1d_latency_cl(data_tmp[112], res_tmp[112], weights, biases); - if (CONFIG_T::reuse_factor > 113) - pointwise_conv_1d_latency_cl(data_tmp[113], res_tmp[113], weights, biases); - if (CONFIG_T::reuse_factor > 114) - pointwise_conv_1d_latency_cl(data_tmp[114], res_tmp[114], weights, biases); - if (CONFIG_T::reuse_factor > 115) - pointwise_conv_1d_latency_cl(data_tmp[115], res_tmp[115], weights, biases); - if (CONFIG_T::reuse_factor > 116) - pointwise_conv_1d_latency_cl(data_tmp[116], res_tmp[116], weights, biases); - if (CONFIG_T::reuse_factor > 117) - pointwise_conv_1d_latency_cl(data_tmp[117], res_tmp[117], weights, biases); - if (CONFIG_T::reuse_factor > 118) - pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); - if (CONFIG_T::reuse_factor > 119) - pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); - -RFOutputLoop: - for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { - #pragma HLS UNROLL - InnerOutputLoop: - for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) { - #pragma HLS UNROLL - res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; - } - } -} - } // namespace nnet #endif diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index cbe2036712..a7ad3437b2 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -15,11 +15,13 @@ strides1d_options = [(1,), (2,)] strides2d_options = [(1, 1), (2, 2)] strategy_options = ['Latency', 'Resource'] +rf_options = [1, 2] @pytest.mark.parametrize('chans', chans_options) @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides1d_options) +@pytest.mark.parametrize('rf', rf_options) @pytest.mark.parametrize( 'backend, io_type, strategy, conv_impl', [ @@ -36,7 +38,7 @@ ('Vitis', 'io_stream', 'resource', 'LineBuffer'), ], ) -def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_impl): +def test_pointwiseconv1d(chans, padds, strides, rf, backend, io_type, strategy, conv_impl): model = tf.keras.models.Sequential() input_shape = (28, 3) model.add( @@ -61,10 +63,11 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision, granularity='name') config['Model']['Strategy'] = strategy config['LayerName']['pointwise1d']['ConvImplementation'] = conv_impl + config['LayerName']['pointwise1d']['ReuseFactor'] = rf output_dir = str( test_root_path - / f'hls4mlprj_pointwise1d_{chans}_strides_{strides[0]}_{padds}_padding_{backend}_{io_type}_{strategy}_{conv_impl}' + / 
f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{rf}_{backend}_{io_type}_{strategy}_{conv_impl}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend From 30c5c70f649553ab11611f6b02f8ab84bd86e801 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 8 Oct 2023 05:40:04 -0700 Subject: [PATCH 041/272] fix indent --- hls4ml/backends/fpga/fpga_backend.py | 57 ++++++++++++++-------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 349a5ddbc8..35151af348 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -874,44 +874,45 @@ def generate_pointwise_conv1d_fn(self, layer_idx, reuse_factor=1): generated_code = ( "template\n" "class pointwise_conv_{index} : public PointwiseConv1D {{\n" - " public:\n" + " public:\n" " static void pointwise_conv(\n" - " data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n" - " res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n" - " typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n" - " typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n" - " data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n" - " #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n" - " res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n" - " #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n" - "RFInputLoop:\n" - " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n" - " #pragma HLS UNROLL\n" - " InnerInputLoop:\n" - " for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n" - " #pragma HLS UNROLL\n" - " data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];" - "\n" - " }}\n" - " }}\n\n" + " data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n" + " res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n" + " typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n" + " typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n" + " data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n" # noqa: E501 + " #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n" + " res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n" # noqa: E501 + " #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n" + " RFInputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n" + " #pragma HLS UNROLL\n" + " InnerInputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n" + " #pragma HLS UNROLL\n" + " data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n" # noqa: E501 + " }}\n" + " }}\n\n" ).format(index=layer_idx) + indent = " " for i in range(reuse_factor): + generated_code += indent generated_code += ( - f" pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n" + f"pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n" ) generated_code += ( "\n" - "RFOutputLoop:\n" - " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n" - " #pragma HLS UNROLL\n" - " InnerOutputLoop:\n" - " for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / 
CONFIG_T::reuse_factor; ii++) {\n" - " #pragma HLS UNROLL\n" - " res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n" + " RFOutputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n" + " #pragma HLS UNROLL\n" + " InnerOutputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n" + " #pragma HLS UNROLL\n" + " res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n" # noqa: E501 + " }\n" " }\n" " }\n" - "}\n" "};\n" ) From a05bf69ebc99d7ce448db3f89398d615a52fe369 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 9 Oct 2023 13:28:57 -0700 Subject: [PATCH 042/272] update rf --- test/pytest/test_pointwiseconv.py | 32 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index a7ad3437b2..79fce34103 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -15,30 +15,30 @@ strides1d_options = [(1,), (2,)] strides2d_options = [(1, 1), (2, 2)] strategy_options = ['Latency', 'Resource'] -rf_options = [1, 2] @pytest.mark.parametrize('chans', chans_options) @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides1d_options) -@pytest.mark.parametrize('rf', rf_options) @pytest.mark.parametrize( - 'backend, io_type, strategy, conv_impl', + 'backend, io_type, strategy, conv_impl, rf', [ - ('Quartus', 'io_parallel', 'resource', 'LineBuffer'), - ('Vivado', 'io_parallel', 'resource', 'LineBuffer'), - ('Vitis', 'io_parallel', 'resource', 'LineBuffer'), - ('Vivado', 'io_parallel', 'latency', 'LineBuffer'), - ('Vitis', 'io_parallel', 'latency', 'LineBuffer'), - ('Vivado', 'io_parallel', 'latency', 'Pointwise'), - ('Vitis', 'io_parallel', 'latency', 'Pointwise'), - ('Vivado', 'io_stream', 'latency', 'LineBuffer'), - ('Vivado', 'io_stream', 'resource', 'LineBuffer'), - ('Vitis', 'io_stream', 'latency', 'LineBuffer'), - ('Vitis', 'io_stream', 'resource', 'LineBuffer'), + ('Quartus', 'io_parallel', 'resource', 'LineBuffer', 1), + ('Vivado', 'io_parallel', 'resource', 'LineBuffer', 1), + ('Vitis', 'io_parallel', 'resource', 'LineBuffer', 1), + ('Vivado', 'io_parallel', 'latency', 'LineBuffer', 1), + ('Vitis', 'io_parallel', 'latency', 'LineBuffer', 1), + ('Vivado', 'io_parallel', 'latency', 'Pointwise', 1), + ('Vivado', 'io_parallel', 'latency', 'Pointwise', 14), + ('Vitis', 'io_parallel', 'latency', 'Pointwise', 1), + ('Vitis', 'io_parallel', 'latency', 'Pointwise', 14), + ('Vivado', 'io_stream', 'latency', 'LineBuffer', 1), + ('Vivado', 'io_stream', 'resource', 'LineBuffer', 1), + ('Vitis', 'io_stream', 'latency', 'LineBuffer', 1), + ('Vitis', 'io_stream', 'resource', 'LineBuffer', 1), ], ) -def test_pointwiseconv1d(chans, padds, strides, rf, backend, io_type, strategy, conv_impl): +def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_impl, rf): model = tf.keras.models.Sequential() input_shape = (28, 3) model.add( @@ -67,7 +67,7 @@ def test_pointwiseconv1d(chans, padds, strides, rf, backend, io_type, strategy, output_dir = str( test_root_path - / f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{rf}_{backend}_{io_type}_{strategy}_{conv_impl}' + / f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{backend}_{io_type}_{strategy}_{conv_impl}_rf{rf}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, 
output_dir=output_dir, io_type=io_type, backend=backend

From 445b2cd8744d3ba7928a69a1f556fe5c82c0e6d8 Mon Sep 17 00:00:00 2001
From: Javier Duarte
Date: Wed, 11 Oct 2023 19:58:42 -0700
Subject: [PATCH 043/272] address vlad comments part 1

---
 hls4ml/backends/fpga/fpga_backend.py          | 58 ------------
 .../passes/{codegen.py => im2col_codegen.py}  |  0
 .../vivado/passes/pointwise_codegen.py        | 25 ++++++
 hls4ml/backends/vivado/vivado_backend.py      | 58 +++++++++++++
 hls4ml/templates/vivado/build_prj.tcl         |  2 +-
 .../vivado/nnet_utils/nnet_code_gen.h         | 81 +------------------
 .../templates/vivado/nnet_utils/nnet_common.h |  1 +
 .../vivado/nnet_utils/nnet_conv1d_latency.h   | 80 ++++++++++++++++++
 hls4ml/writer/vivado_writer.py                |  2 +
 9 files changed, 168 insertions(+), 139 deletions(-)
 rename hls4ml/backends/fpga/passes/{codegen.py => im2col_codegen.py} (100%)
 create mode 100644 hls4ml/backends/vivado/passes/pointwise_codegen.py

diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py
index 35151af348..8cfaec8b3f 100644
--- a/hls4ml/backends/fpga/fpga_backend.py
+++ b/hls4ml/backends/fpga/fpga_backend.py
@@ -860,64 +860,6 @@ def generate_conv2d_line_buffer_fn(

         return generated_code

-    def generate_pointwise_conv1d_fn(self, layer_idx, reuse_factor=1):
-        """Generate a C++ function for a pointwise convolution layer.
-
-        Args:
-            layer_idx (int): Index of layer ('index' attribute).
-            reuse_factor (int): Number of partitions to divide the input into.
-
-        Returns:
-            str: Generated C++ function
-        """
-
-        generated_code = (
-            "template<class data_T, class res_T, typename CONFIG_T>\n"
-            "class pointwise_conv_{index} : public PointwiseConv1D<data_T, res_T, CONFIG_T> {{\n"
-            "  public:\n"
-            "    static void pointwise_conv(\n"
-            "        data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n"
-            "        res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n"
-            "        typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n"
-            "        typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n"
-            "        data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n"  # noqa: E501
-            "        #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n"
-            "        res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n"  # noqa: E501
-            "        #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n"
-            "    RFInputLoop:\n"
-            "        for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n"
-            "        #pragma HLS UNROLL\n"
-            "    InnerInputLoop:\n"
-            "        for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n"
-            "        #pragma HLS UNROLL\n"
-            "        data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n"  # noqa: E501
-            "        }}\n"
-            "    }}\n\n"
-        ).format(index=layer_idx)
-        indent = "    "
-        for i in range(reuse_factor):
-            generated_code += indent
-            generated_code += (
-                f"pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[{i}], res_tmp[{i}], weights, biases);\n"
-            )
-
-        generated_code += (
-            "\n"
-            "    RFOutputLoop:\n"
-            "        for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n"
-            "        #pragma HLS UNROLL\n"
-            "    InnerOutputLoop:\n"
-            "        for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n"
-            "        #pragma HLS UNROLL\n"
-            "        res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n"  # noqa: E501
-            "        }\n"
-            "    }\n"
-            "    }\n"
-            "};\n"
-        )
-
-        return generated_code
-
     @model_optimizer()
     def write_hls(self, model):
         self.writer.write_hls(model)
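With the helper relocated onto the Vivado backend, its output can still be
previewed in isolation. A minimal sketch (the standalone VivadoBackend
instance and the layer index 3 are illustrative assumptions, not part of this
patch):

    from hls4ml.backends import VivadoBackend

    backend = VivadoBackend()
    # For reuse_factor=2 this returns a C++ 'pointwise_conv_3' class that
    # splits the input into two slices and feeds each slice to
    # pointwise_conv_1d_latency_cl, mirroring the template strings above.
    print(backend.generate_pointwise_conv1d_fn(3, reuse_factor=2))

diff --git a/hls4ml/backends/fpga/passes/codegen.py b/hls4ml/backends/fpga/passes/im2col_codegen.py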
similarity index 100%
rename from hls4ml/backends/fpga/passes/codegen.py
rename to hls4ml/backends/fpga/passes/im2col_codegen.py
diff --git a/hls4ml/backends/vivado/passes/pointwise_codegen.py b/hls4ml/backends/vivado/passes/pointwise_codegen.py
new file mode 100644
index 0000000000..f459d59208
--- /dev/null
+++ b/hls4ml/backends/vivado/passes/pointwise_codegen.py
@@ -0,0 +1,25 @@
+from hls4ml.model.layers import Conv1D
+from hls4ml.model.optimizer import OptimizerPass
+from hls4ml.model.types import Source
+
+
+class GeneratePointwiseConv1D(OptimizerPass):
+    '''Generates code for pointwise 1D convolution'''
+
+    def match(self, node):
+        return isinstance(node, Conv1D) and node.model.config.get_config_value('IOType') == 'io_parallel'
+
+    def transform(self, model, node):
+        node_class = node.__class__.__name__
+        if '1D' in node_class:
+            self._generate_pointwise_conv1d(node)
+        else:
+            raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})')
+
+    def _generate_pointwise_conv1d(self, node):
+        code_str = node.model.config.backend.generate_pointwise_conv1d_fn(
+            node.get_attr('index'),
+            node.get_attr('reuse_factor'),
+        )
+
+        node.set_attr('pointwise_conv1d_codegen', Source(code_str))
diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py
index 011d576f64..8db278be9b 100644
--- a/hls4ml/backends/vivado/vivado_backend.py
+++ b/hls4ml/backends/vivado/vivado_backend.py
@@ -474,3 +474,61 @@ def init_garnet(self, layer):
     @layer_optimizer(GarNetStack)
     def init_garnet_stack(self, layer):
         self.init_garnet(layer)
+
+    def generate_pointwise_conv1d_fn(self, layer_idx, reuse_factor=1):
+        """Generate a C++ function for a pointwise convolution layer.
+
+        Args:
+            layer_idx (int): Index of layer ('index' attribute).
+            reuse_factor (int): Number of partitions to divide the input into.
+
+        Returns:
+            str: Generated C++ function
+        """
+
+        generated_code = (
+            "template<class data_T, class res_T, typename CONFIG_T>\n"
+            "class pointwise_conv_{index} : public PointwiseConv1D<data_T, res_T, CONFIG_T> {{\n"
+            "  public:\n"
+            "    static void pointwise_conv(\n"
+            "        data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n"
+            "        res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n"
+            "        typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n"
+            "        typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n"
+            "        data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n"  # noqa: E501
+            "        #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n"
+            "        res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n"  # noqa: E501
+            "        #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n"
+            "    RFInputLoop:\n"
+            "        for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n"
+            "        #pragma HLS UNROLL\n"
+            "    InnerInputLoop:\n"
+            "        for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n"
+            "        #pragma HLS UNROLL\n"
+            "        data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n"  # noqa: E501
+            "        }}\n"
+            "    }}\n\n"
+        ).format(index=layer_idx)
+        indent = "    "
+        for i in range(reuse_factor):
+            generated_code += indent
+            generated_code += (
+                f"pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[{i}], res_tmp[{i}], weights, biases);\n"
+            )
+
+        generated_code += (
+            "\n"
+            "    RFOutputLoop:\n"
+            "        for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n"
+            "        #pragma HLS UNROLL\n"
+            "    InnerOutputLoop:\n"
+            "        for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n"
+            "        #pragma HLS UNROLL\n"
+            "        res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n"  # noqa: E501
+            "        }\n"
+            "    }\n"
+            "    }\n"
+            "};\n"
+        )
+
+        return generated_code
diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl
index 82b3c5a640..4ef8032d4f 100644
--- a/hls4ml/templates/vivado/build_prj.tcl
+++ b/hls4ml/templates/vivado/build_prj.tcl
@@ -161,7 +161,7 @@ if {$opt(reset)} {
 } else {
     open_solution "solution1"
 }
-catch {config_array_partition -maximum_size 8192}
+catch {config_array_partition -maximum_size $maximum_size}
 config_compile -name_max_length 80
 set_part $part
 config_schedule -enable_dsp_full_reg=false
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h
index 32fa7321c5..1900aa2716 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h
@@ -1,6 +1,7 @@
 #ifndef NNET_INSTR_GEN_H_
 #define NNET_INSTR_GEN_H_

+#include "nnet_conv1d_latency.h"
 #include "nnet_helpers.h"
 #include <iostream>

@@ -35,86 +36,6 @@ template <class data_T, class res_T, typename CONFIG_T> class PointwiseConv1D {
     }
 };

-template <class data_T, class res_T, typename CONFIG_T>
-void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor],
-                                  res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor],
-                                  typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
-                                  typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
-    assert(CONFIG_T::filt_width == 1);
-
-    typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor];
-    typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt];
-
-    #pragma HLS ARRAY_PARTITION variable=mult complete dim=0
-    #pragma HLS
ARRAY_PARTITION variable=acc complete dim=0 - - // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases - #pragma HLS function_instantiate variable=weights,biases - - // Parallel mode - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 - #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 - - // Limit multipliers to control parallelization - int multiplier_limit = - ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / - float(CONFIG_T::reuse_factor)); -#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit - -// Convolve, saving all multiplication results to accumulate later -ConvOut: - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - ConvFilt: - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - ConvChan: - for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { - #pragma HLS UNROLL - int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; - int index_weight = cc * CONFIG_T::n_filt + ff; - int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; - - if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || - (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { - mult[index_mult] = 0; - } else { - mult[index_mult] = data[index_data] * weights[index_weight]; - } - } // end channel loop - } // end filter loop - } // end output loop - - // Initialize accumulator with input biases - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - #pragma HLS UNROLL - acc[ii][ff] = biases[ff]; - } - } - -// Accumulate multiplication result -AccumOut: - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - AccumFilt: - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - // Do "dot product" sum within filter and sum over channels - AccumChan: - for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { - int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; - acc[ii][ff] += mult[index_mult]; - } // end channel loop - } // end filter loop - } // end output loop - - // Cast to "res_t" type - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - #pragma HLS UNROLL - res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); - } - } -} - // hls4ml insert code } // namespace nnet diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index e942a1dc89..c3cf1a2de4 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -2,6 +2,7 @@ #define NNET_COMMON_H_ #include "ap_fixed.h" +#include "nnet_helpers.h" // This is a substitute for "ceil(n/(float)d)". 
#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 0d9afb10cb..8fb9f769f4 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -84,5 +84,85 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 + #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + int multiplier_limit = + ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / + float(CONFIG_T::reuse_factor)); +#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = data[index_data] * weights[index_weight]; + } + } // end channel loop + } // end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + } // end channel loop + } // end filter loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL 
+            res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]);
+        }
+    }
+}
+
 } // namespace nnet

 #endif
diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py
index 412bb8d667..2f7bb676f4 100644
--- a/hls4ml/writer/vivado_writer.py
+++ b/hls4ml/writer/vivado_writer.py
@@ -591,6 +591,8 @@ def write_build_script(self, model):
             f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%')))
             f.write('variable version\n')
             f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0')))
+            f.write('variable maximum_size\n')
+            f.write('set maximum_size {}\n'.format(model.config.get_config_value('MaximumSize', '4192')))
             f.close()

         # build_prj.tcl

From 1dd2603558f8ceb6d16b449c67e52567650d3eaf Mon Sep 17 00:00:00 2001
From: Javier Duarte
Date: Wed, 11 Oct 2023 20:01:28 -0700
Subject: [PATCH 044/272] default 4096

---
 hls4ml/writer/vivado_writer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py
index 2f7bb676f4..80c4094a4f 100644
--- a/hls4ml/writer/vivado_writer.py
+++ b/hls4ml/writer/vivado_writer.py
@@ -592,7 +592,7 @@ def write_build_script(self, model):
             f.write('variable version\n')
             f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0')))
             f.write('variable maximum_size\n')
-            f.write('set maximum_size {}\n'.format(model.config.get_config_value('MaximumSize', '4192')))
+            f.write('set maximum_size {}\n'.format(model.config.get_config_value('MaximumSize', '4096')))
             f.close()

         # build_prj.tcl
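Exposing the knob means an oversized partition limit no longer requires
editing the generated Tcl by hand. A hedged sketch of overriding the 4096
default ('MaximumSize' is the key read by get_config_value() above; writing
it straight into the top-level config dict is one assumed way to inject it):

    import hls4ml

    config = hls4ml.utils.config_from_keras_model(model, granularity='name')
    hls_model = hls4ml.converters.convert_from_keras_model(
        model, hls_config=config, output_dir='prj', backend='Vivado'
    )
    # Illustrative override before the project is written; get_config_value()
    # falls back to '4096' when the key is absent.
    hls_model.config.config['MaximumSize'] = 8192
    hls_model.write()

From 04997c234ffed74b35ff79074d5c8b9c7788477f Mon Sep 17 00:00:00 2001
From: Javier Duarte
Date: Sat, 14 Oct 2023 21:27:51 -0700
Subject: [PATCH 045/272] only add pointwise function when optimizing

---
 hls4ml/backends/fpga/passes/im2col_codegen.py | 22 -----
 .../vivado/passes/convolution_templates.py    |  2 -
 hls4ml/backends/vivado/passes/pointwise.py    | 82 ++++++++++++++++++-
 .../vivado/nnet_utils/nnet_code_gen.h         | 10 +++
 4 files changed, 88 insertions(+), 28 deletions(-)

diff --git a/hls4ml/backends/fpga/passes/im2col_codegen.py b/hls4ml/backends/fpga/passes/im2col_codegen.py
index 6d7243dd8b..f1f1080996 100644
--- a/hls4ml/backends/fpga/passes/im2col_codegen.py
+++ b/hls4ml/backends/fpga/passes/im2col_codegen.py
@@ -49,3 +49,25 @@ def _generate_im2col_2d(self, node):
         )

         node.set_attr('line_buffer_codegen', Source(code_str))
-
-
-class GeneratePointwiseConv1D(OptimizerPass):
-    '''Generates code for pointwise 1D convolution'''
-
-    def match(self, node):
-        return isinstance(node, Conv1D) and node.model.config.get_config_value('IOType') == 'io_parallel'
-
-    def transform(self, model, node):
-        node_class = node.__class__.__name__
-        if '1D' in node_class:
-            self._generate_pointwise_conv1d(node)
-        else:
-            raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})')
-
-    def _generate_pointwise_conv1d(self, node):
-        code_str = node.model.config.backend.generate_pointwise_conv1d_fn(
-            node.get_attr('index'),
-            node.get_attr('reuse_factor'),
-        )
-
-        node.set_attr('pointwise_conv1d_codegen', Source(code_str))
diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py
index a4fbdd405f..60eddae806 100644
--- a/hls4ml/backends/vivado/passes/convolution_templates.py
+++ b/hls4ml/backends/vivado/passes/convolution_templates.py
@@ -56,8 +56,6 @@
     typedef {config_t} mult_config;
     template<unsigned K, unsigned S, unsigned W>
     using scale_index = nnet::{scale_index_type}<K, S, W>;
-    template<class data_T, class res_T, class CONFIG_T>
-    using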
pointwise_conv = nnet::{pointwise_fn}; }}; const ap_uint config{index}::pixels[] = {{{instructions}}};\n""" diff --git a/hls4ml/backends/vivado/passes/pointwise.py b/hls4ml/backends/vivado/passes/pointwise.py index c353a10604..0353787e8c 100644 --- a/hls4ml/backends/vivado/passes/pointwise.py +++ b/hls4ml/backends/vivado/passes/pointwise.py @@ -8,13 +8,87 @@ Conv1DFunctionTemplate, Conv2DConfigTemplate, Conv2DFunctionTemplate, - conv1d_config_template, - conv2d_config_template, conv_mult_config_template, ) from hls4ml.model.layers import register_layer from hls4ml.model.optimizer import OptimizerPass +pointwise_conv1d_config_template = """struct config{index} : nnet::conv1d_config {{ + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + static const unsigned filt_width = {filt_width}; + static const unsigned kernel_size = filt_width; + static const unsigned n_filt = {n_filt}; + static const unsigned stride_width = {stride_width}; + static const unsigned dilation = {dilation}; + static const unsigned out_width = {out_width}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const bool store_weights_in_bram = false; + static const unsigned strategy = nnet::{strategy}; + static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; + static const unsigned min_width = {min_width}; + static const ap_uint pixels[min_width]; + static const unsigned n_partitions = {n_partitions}; + static const unsigned n_pixels = out_width / n_partitions; + template + using fill_buffer = nnet::{fill_fn}; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {config_t} mult_config; + template + using scale_index = nnet::{scale_index_type}; + template + using pointwise_conv = nnet::{pointwise_fn}; +}}; +const ap_uint config{index}::pixels[] = {{{instructions}}};\n""" + +pointwise_conv2d_config_template = """struct config{index} : nnet::conv2d_config {{ + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + static const unsigned filt_height = {filt_height}; + static const unsigned filt_width = {filt_width}; + static const unsigned kernel_size = filt_height * filt_width; + static const unsigned n_filt = {n_filt}; + static const unsigned stride_height = {stride_height}; + static const unsigned stride_width = {stride_width}; + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const unsigned multiplier_limit = + DIV_ROUNDUP(kernel_size * n_chan * n_filt, reuse_factor) - n_zeros / reuse_factor; + static const bool store_weights_in_bram = false; + static const unsigned strategy = nnet::{strategy}; + static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; + static const unsigned min_height = {min_height}; + static const unsigned min_width = {min_width}; + static const ap_uint pixels[min_height * min_width]; + static const unsigned n_partitions = {n_partitions}; + static const 
unsigned n_pixels = out_height * out_width / n_partitions;
+    template<class data_T, class CONFIG_T>
+    using fill_buffer = nnet::{fill_fn}<data_T, CONFIG_T>;
+    typedef {accum_t.name} accum_t;
+    typedef {bias_t.name} bias_t;
+    typedef {weight_t.name} weight_t;
+    typedef {config_t} mult_config;
+    template<unsigned K, unsigned S, unsigned W>
+    using scale_index_height = nnet::{scale_index_height_type}<K, S, W>;
+    template<unsigned K, unsigned S, unsigned W>
+    using scale_index_width = nnet::{scale_index_width_type}<K, S, W>;
+    template<class data_T, class res_T, class CONFIG_T>
+    using pointwise_conv = nnet::{pointwise_fn}<data_T, res_T, CONFIG_T>;
+}};
+const ap_uint<config{index}::filt_height * config{index}::filt_width> config{index}::pixels[] = {{{instructions}}};\n"""

 pointwise_conv1d_function_template = (
     'nnet::pointwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
 )
@@ -29,7 +103,7 @@
 class PointwiseConv1DConfigTemplate(Conv1DConfigTemplate):
     def __init__(self):
         super(Conv1DConfigTemplate, self).__init__(PointwiseConv1D)
-        self.template = conv1d_config_template
+        self.template = pointwise_conv1d_config_template
         self.mult_template = conv_mult_config_template
@@ -42,7 +116,7 @@ def __init__(self):

 class PointwiseConv2DConfigTemplate(Conv2DConfigTemplate):
     def __init__(self):
         super(Conv2DConfigTemplate, self).__init__(PointwiseConv2D)
-        self.template = conv2d_config_template
+        self.template = pointwise_conv2d_config_template
         self.mult_template = conv_mult_config_template
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h
index 1900aa2716..1e922bbfed 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h
@@ -36,6 +36,16 @@ template <class data_T, class res_T, typename CONFIG_T> class PointwiseConv1D {
     }
 };

+template <class data_T, class res_T, typename CONFIG_T> class PointwiseConv2D {
+  public:
+    static void pointwise_conv(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
+                               res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
+                               typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                               typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+        // To be implemented in subclasses
+    }
+};
+
 // hls4ml insert code

 } // namespace nnet

From 76be67b5779b38486a094b465898e087fa9e3339 Mon Sep 17 00:00:00 2001
From: Jovan Mitrevski
Date: Fri, 2 Feb 2024 18:52:57 -0600
Subject: [PATCH 046/272] snapshot of work

---
 hls4ml/converters/keras/core.py               |   3 +-
 hls4ml/converters/keras/graph.py              |   2 +-
 hls4ml/converters/keras/qkeras.py             |   3 +-
 hls4ml/converters/onnx/quantizer.py           |  97 -------
 hls4ml/model/layers.py                        |   7 +-
 hls4ml/model/optimizer/passes/merge_const.py  |   2 +-
 .../passes/propagate_acc_precision.py         | 106 -------
 hls4ml/model/optimizer/passes/qkeras.py       |   3 +-
 hls4ml/model/optimizer/passes/quant_opt.py    | 229 +++++++--------
 hls4ml/model/quantizers.py                    | 261 ++++++++++++++++++
 hls4ml/model/types.py                         | 156 -----------
 test/pytest/test_qonnx.py                     |   6 +-
 12 files changed, 378 insertions(+), 497 deletions(-)
 delete mode 100644 hls4ml/converters/onnx/quantizer.py
 delete mode 100644 hls4ml/model/optimizer/passes/propagate_acc_precision.py
 create mode 100644 hls4ml/model/quantizers.py

diff --git a/hls4ml/converters/keras/core.py b/hls4ml/converters/keras/core.py
index f6119c016d..ca7d0b3541 100644
--- a/hls4ml/converters/keras/core.py
+++ b/hls4ml/converters/keras/core.py
@@ -1,5 +1,6 @@
 from hls4ml.converters.keras_to_hls import get_weights_data, keras_handler, parse_default_keras_layer
-from hls4ml.model.types import BinaryQuantizer, IntegerPrecisionType, TernaryQuantizer
+from hls4ml.model.quantizers import BinaryQuantizer, TernaryQuantizer
+from hls4ml.model.types import IntegerPrecisionType


 @keras_handler('InputLayer')
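The quantizer classes touched by this commit now live in a dedicated module.
A minimal sketch of the updated import pattern (the constructor arguments
shown are assumptions based on the pre-existing class definitions, which this
patch moves but does not change):

    from hls4ml.model.quantizers import BinaryQuantizer, TernaryQuantizer
    from hls4ml.model.types import IntegerPrecisionType

    quantizer = BinaryQuantizer(bits=2)  # binary weights in a 2-bit encoding
    precision = IntegerPrecisionType(width=2, signed=True)

diff --git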
a/hls4ml/converters/keras/graph.py b/hls4ml/converters/keras/graph.py index 5c5c2247c0..954bf20b8f 100644 --- a/hls4ml/converters/keras/graph.py +++ b/hls4ml/converters/keras/graph.py @@ -1,5 +1,5 @@ -from hls4ml.converters.keras.core import TernaryQuantizer from hls4ml.converters.keras_to_hls import get_weights_data, keras_handler, parse_default_keras_layer +from hls4ml.model.quantizers import TernaryQuantizer @keras_handler('GarNet', 'GarNetStack') diff --git a/hls4ml/converters/keras/qkeras.py b/hls4ml/converters/keras/qkeras.py index ba1401cce0..055ed3a8f4 100644 --- a/hls4ml/converters/keras/qkeras.py +++ b/hls4ml/converters/keras/qkeras.py @@ -3,7 +3,8 @@ from hls4ml.converters.keras.convolution import parse_conv1d_layer, parse_conv2d_layer from hls4ml.converters.keras.core import parse_batchnorm_layer, parse_dense_layer from hls4ml.converters.keras_to_hls import keras_handler, parse_default_keras_layer -from hls4ml.model.types import FixedPrecisionType, QKerasBinaryQuantizer, QKerasPO2Quantizer, QKerasQuantizer +from hls4ml.model.quantizers import QKerasBinaryQuantizer, QKerasPO2Quantizer, QKerasQuantizer +from hls4ml.model.types import FixedPrecisionType def get_quantizer_from_config(keras_layer, quantizer_var): diff --git a/hls4ml/converters/onnx/quantizer.py b/hls4ml/converters/onnx/quantizer.py deleted file mode 100644 index 7f69652c04..0000000000 --- a/hls4ml/converters/onnx/quantizer.py +++ /dev/null @@ -1,97 +0,0 @@ -""" -Quantizer for the Quant node, after scale and zeropoint hafe been extracted -(unless scale is a power of 2, if doing special case po2) - -This is based on the sample implementation in finn-base -""" - -import numpy as np - -from hls4ml.model.types import Quantizer, RoundingMode, SaturationMode - - -class QuantNodeQuantizer(Quantizer): - """This implements a quantizer for a FixedPrecisionType with width==integer""" - - def __init__(self, precision): - super().__init__(precision.width, precision) - - def __call__(self, data): - """Apply the quantization on the data""" - - scale = 2 ** (self.hls_type.width - self.hls_type.integer) - - data = data * scale # (not using *= to avoid modifying data) - # Clamping - min_int_val = self._min_int(self.hls_type.signed, self.hls_type.saturation_mode, self.bits) - max_int_val = self._max_int(self.hls_type.signed, self.bits) - data = np.where(data > max_int_val, max_int_val, data) - data = np.where(data < min_int_val, min_int_val, data) - # Rounding - rounding_fx = self._resolve_rounding_mode(self.hls_type.rounding_mode) - return rounding_fx(data) / scale - - @staticmethod - def _min_int(signed: bool, saturation_mode: str, bit_width: int) -> int: - """Compute the minimum integer representable by a given number of bits. - Args: - signed (bool): Indicates whether the represented integer is signed or not. - saturation_mode (bool): Indicates the saturation mode used (AP_SAT_SYM or AP_SAT) - bit_width (int): Number of bits available for the representation. - Returns: - int: Maximum unsigned integer that can be represented according to - the input arguments. - Examples: - >>> min_int(signed=True, saturation_mode='AP_SAT_SYM', bit_width=8) - int(-127) - >>> min_int(signed=False, saturation_mode='AP_SAT_SYM', bit_width=8) - int(0) - >>> min_int(signed=True, saturation_mode='AP_SAT', bit_width=8) - int(-128) - >>> min_int(signed=False, saturation_mode='AP_SAT_SYM', bit_width=8) - int(0) - """ - if saturation_mode not in (SaturationMode.SAT_SYM, SaturationMode.SAT): - raise ValueError(f"Saturation mode {saturation_mode} not supported. 
Only AP_SAT_SYM, AP_SAT supported")
-        if signed and saturation_mode == SaturationMode.SAT_SYM:
-            value = -(2 ** (bit_width - 1)) + 1
-        elif signed:
-            value = -(2 ** (bit_width - 1))
-        else:
-            value = 0
-        return value
-
-    @staticmethod
-    def _max_int(signed: bool, bit_width: int) -> int:
-        """Compute the maximum integer representable by a given number of bits.
-        (Note, narrow and unsigned is not supported by the implementation, so saturation mode is not used)
-        Args:
-            signed (bool): Indicates whether the represented integer is signed or not.
-            bit_width (int): Number of bits available for the representation.
-        Returns:
-            Tensor: Maximum integer that can be represented according to
-            the input arguments.
-        Examples:
-            >>> max_int(signed=True, bit_width=8)
-            int(127)
-            >>> max_int(signed=False, bit_width=8)
-            int(255)
-        """
-        if not signed:
-            value = (2**bit_width) - 1
-        else:
-            value = (2 ** (bit_width - 1)) - 1
-        return value
-
-    @staticmethod
-    def _resolve_rounding_mode(mode):
-        """Resolve the rounding mode of Quant and Trunc ops
-        to the corresponding numpy functions."""
-        if mode == RoundingMode.RND_CONV:
-            return np.round
-        # elif mode_string == "CEIL":   # not supported
-        #     return np.ceil
-        elif mode == RoundingMode.TRN:
-            return np.floor
-        else:
-            raise ValueError(f"Rounding mode {mode} not supported.")
diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py
index 0df69b753e..7da730b60a 100644
--- a/hls4ml/model/layers.py
+++ b/hls4ml/model/layers.py
@@ -349,17 +349,17 @@ def initialize(self):
 
 
 class Constant(Layer):
+    # one could consider making this a weight attribute, but given its transient nature, I am not sure it helps
     _expected_attributes = [
        Attribute('value', value_type=np.ndarray),
    ]
 
     def initialize(self):
         value = self.attributes['value']
-        self.value = value  # note, this is unquantized; Only here for easier access
         shape = list(value.shape)
         if not shape:
             shape = (1,)
-            self.value = np.array([self.value])
+            self.set_attr('value', np.array([value]))
 
         dims = [f'{self.name}_{i}' for i in range(len(shape))]
         self.add_output_variable(shape, dims, var_name=self.name, precision=self.get_attr("precision"))
@@ -455,7 +455,6 @@ class Conv(Layer):
     """
 
     def initialize(self):
-        # use negative indexing because it is not clear if batch dimension is always stripped
         if self.attributes['n_dim'] == 1:
             # this is 1D convolution
             shape = [self.attributes['out_width'], self.attributes['n_filt']]
@@ -932,6 +931,7 @@ def initialize(self):
         self.add_weights_variable(name='bias', var_name='b{index}', data=bias)
 
 
+# TODO: discuss whether this should be renamed to something more descriptive, and whether the class hierarchy makes sense
 class ApplyAlpha(BatchNormalization):
     '''A custom layer to scale the output of a QDense layer which used 'alpha != 1'
     Inference computation uses BatchNormalization methods'''
@@ -941,6 +941,7 @@ def initialize(self):
         shape = inp.shape
         dims = inp.dim_names
         self.add_output_variable(shape, dims)
+        self.set_attr('n_in', inp.size())
 
         scale = self.get_attr('scale_data')
         scale_quantizer = self.get_attr('scale_quantizer')
diff --git a/hls4ml/model/optimizer/passes/merge_const.py b/hls4ml/model/optimizer/passes/merge_const.py
index f38bfd841d..adc7dff093 100644
--- a/hls4ml/model/optimizer/passes/merge_const.py
+++ b/hls4ml/model/optimizer/passes/merge_const.py
@@ -1,8 +1,8 @@
 import numpy as np
 
-from hls4ml.converters.onnx.quantizer import QuantNodeQuantizer
 from hls4ml.model.layers import ApplyAlpha, Constant, Merge
 from hls4ml.model.optimizer import OptimizerPass
+from 
hls4ml.model.quantizers import QuantNodeQuantizer _base_attributes = ('Trace', 'reuse_factor', 'n_in') diff --git a/hls4ml/model/optimizer/passes/propagate_acc_precision.py b/hls4ml/model/optimizer/passes/propagate_acc_precision.py deleted file mode 100644 index 375979de4e..0000000000 --- a/hls4ml/model/optimizer/passes/propagate_acc_precision.py +++ /dev/null @@ -1,106 +0,0 @@ -import math # prefer to use math.ceil for scalar values (returns int) - -import numpy as np - -from hls4ml.model.layers import Conv1D, Conv2D, Dense -from hls4ml.model.optimizer import OptimizerPass -from hls4ml.model.types import FixedPrecisionType, NamedType - -# TODO: Update these to use the new auto precision, not depdening only on QONNX values - - -class PropagateDensePrecision(OptimizerPass): - """ - Propagate precision for Dense nodes. Restrict it to only cases where - the precision is set by a quant node, since otherwise the values get huge. - """ - - def match(self, node): - is_match = isinstance(node, Dense) - return is_match - - def transform(self, model, node): - input_precision = node.get_input_node().get_attr("quant_precision") - weight_precision = node.get_attr("weight_precision") - if not input_precision or not weight_precision: - return False - - bias_precision = node.get_attr("bias_precision") - input_variable = node.get_input_variable() - num_acc = input_variable.shape[-1] - - accum_precision = _propagate_type_acc(input_precision, weight_precision, bias_precision, num_acc) - - accum_t = NamedType(f'layer{node.index}_accum_t', accum_precision) - node.set_attr('accum_t', accum_t) - - if not node.get_attr("quant_precision"): - # output precision not set by quant node - node.update_output_precision(accum_precision) - - return False - - -class PropagateConvPrecision(OptimizerPass): - """Propagate precision for conv nodes. Restrict it to only cases where - the precision is set by a quant node, since otherwise the values get huge. - """ - - def match(self, node): - is_match = isinstance(node, (Conv1D, Conv2D)) - return is_match - - def transform(self, model, node): - input_precision = node.get_input_node().get_attr("quant_precision") - weight_precision = node.get_attr("weight_precision") - if not input_precision or not weight_precision: - return False - - bias_precision = node.get_attr("bias_precision") - num_feature_maps = node.weights['weight'].data_unquantized.shape[-1] - filt_width = node.get_attr('filt_width') - filt_height = node.get_attr('filt_height', 1) - - num_acc = filt_width * filt_height * num_feature_maps - - accum_precision = _propagate_type_acc(input_precision, weight_precision, bias_precision, num_acc) - - accum_t = NamedType(f'layer{node.index}_accum_t', accum_precision) - node.set_attr('accum_t', accum_t) - - if not node.get_attr("quant_precision"): - # output precision not explicitly set by quant node - node.update_output_precision(accum_precision) - - return False - - -def _propagate_type_acc(input_precision, weight_precision, bias_precision, num_acc): - ''' - Propagate the precion type across a multiply. 
Rounding modes are propagated from input_precision - ''' - - # check to make sure none are None - bitwidth = weight_precision.width + input_precision.width + math.ceil(np.log2(num_acc)) - integer = weight_precision.integer + input_precision.integer + math.ceil(np.log2(num_acc)) - signed = weight_precision.signed or input_precision.signed - - # Because calculating precision, no need to round or sautration - rounding_mode = None - saturation_mode = None - - frac = bitwidth - integer - - # correct for bias - if bias_precision: - integer = ( - max( - integer + (bias_precision.signed and not signed), - bias_precision.integer + (signed and not bias_precision.signed), - ) - + 1 - ) - bitwidth = integer + max(frac, bias_precision.width - bias_precision.integer) - signed = signed or bias_precision.signed - - return FixedPrecisionType(bitwidth, integer, signed, rounding_mode, saturation_mode) diff --git a/hls4ml/model/optimizer/passes/qkeras.py b/hls4ml/model/optimizer/passes/qkeras.py index 7bed6cb1e7..a97438832d 100644 --- a/hls4ml/model/optimizer/passes/qkeras.py +++ b/hls4ml/model/optimizer/passes/qkeras.py @@ -3,7 +3,8 @@ from hls4ml.model.layers import ApplyAlpha from hls4ml.model.optimizer import ConfigurableOptimizerPass, OptimizerPass, register_pass -from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType, QKerasPO2Quantizer +from hls4ml.model.quantizers import QKerasPO2Quantizer +from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType class OutputRoundingSaturationMode(ConfigurableOptimizerPass): diff --git a/hls4ml/model/optimizer/passes/quant_opt.py b/hls4ml/model/optimizer/passes/quant_opt.py index f0a5129d52..dc6deab14b 100644 --- a/hls4ml/model/optimizer/passes/quant_opt.py +++ b/hls4ml/model/optimizer/passes/quant_opt.py @@ -1,28 +1,25 @@ -''' +""" This file includes optimizations related to quant nodes. -As a first step, QuantConstantParameters converts the extra inputs to attributes. It is always the first step +As a first step, QuantConstantParameters converts the extra inputs to attributes. -The next step differs between the case of (1) unitary scale and zero offset, or (2) nonunitary scale and/or -nonzero offset. In the first case no scaling is required, so a Quant node effectively becomes a linear activation. -For the common case when this is applied on a constant weight, the activation is immediately merged with the weight, -qantizing the weights. In case 2, we need to explictly scale and unscale, so the Quant node becomes 3 nodes, an -ApplyAlpha node to apply a scale/shift, a Linear node to apply the quantization, and another ApplyAlpha to unscale/shift. -We depend on optimization steps to move the unscaling ApplyAlpha down as needed. Again, when the Quant is a applied to a -Constant, the scaling and Linear nodes are immediately merged into the Constant. This is done because it simplifies some -of the other optimizations. +The next step differs between the case of (1) (positive) power-of-2 scale and zero offset, or (2) other cases. In the first +case no explicit scaling is required, so a Quant node logically becomes a linear activation. (Cases when the scale is a +power of 2 not equal to one are implicitly scaled with fixed precision types.) When the activation is applied to a constant +weight, the activation is immediately merged with the weight, quantizing the weights. 
In case (2), we need to explicitly
+scale and unscale, so the Quant node becomes 3 nodes, an ApplyAlpha node to apply a scale/shift, a Linear node to apply the
+quantization, and another ApplyAlpha to unscale/shift. We depend on optimization steps to move the unscaling ApplyAlpha
+down as needed so that we can do integer or fixed-point calculations. When the Quant is applied to a weight, the scaling
+and Linear nodes are immediately merged into the Constant.
 
-UPDATE: Case 1 is loosened to also include power of 2 scalar scales, not just unitary scale, if
-  _ALSO_INCLUDE_PO2 is set to true (the default)
-
-'''
+"""
 
 import math  # prefer to use math.ceil for scalar values
 
 import numpy as np
 
-from hls4ml.converters.onnx.quantizer import QuantNodeQuantizer
 from hls4ml.model.layers import Activation, ApplyAlpha, Constant, Quant
 from hls4ml.model.optimizer import OptimizerPass
+from hls4ml.model.quantizers import QuantNodeQuantizer
 from hls4ml.model.types import FixedPrecisionType
 
 _ALSO_MATCH_PO2 = True
 
@@ -44,28 +41,29 @@ def match(self, node):
 
     def transform(self, model, node):
         """
-        Remove Constant from the Qaunt node parameters (but not input[0])
+        Remove Constant from the Quant node parameters (but not input[0])
         """
         if node.get_input_node(node.inputs[1]):
             scale_node = node.get_input_node(node.inputs[1])
             if isinstance(scale_node, Constant):
-                node.set_attr('scale', scale_node.value)
+                node.set_attr('scale', scale_node.get_attr('value'))
                 node.inputs[1] = ''
                 model.remove_node(scale_node, rewire=False)
 
         if node.get_input_node(node.inputs[2]):
             zeropt_node = node.get_input_node(node.inputs[2])
             if isinstance(zeropt_node, Constant):
-                node.set_attr('zeropt', zeropt_node.value)
+                node.set_attr('zeropt', zeropt_node.get_attr('value'))
                 node.inputs[2] = ''
                 model.remove_node(zeropt_node, rewire=False)
 
         if node.get_input_node(node.inputs[3]):
             bitwidth_node = node.get_input_node(node.inputs[3])
             if isinstance(bitwidth_node, Constant):
-                if np.squeeze(bitwidth_node.value).shape:
-                    raise RuntimeError("Only scalar bitwidth values are supporeted by the Quant node")
-                node.set_attr('bitwidth', bitwidth_node.value)
+                bitwidth = bitwidth_node.get_attr('value')
+                if bitwidth.size != 1:
+                    raise RuntimeError('Only scalar bitwidth values are supported by the Quant node')
+                node.set_attr('bitwidth', bitwidth)
                 node.inputs[3] = ''
                 model.remove_node(bitwidth_node, rewire=False)
 
@@ -73,14 +71,12 @@ def transform(self, model, node):
 
 
 class QuantToActivation(OptimizerPass):
-    '''
-    This is for the case when scale is 1 and zeropt is 0. It is a a 1:1 transformation of
+    """
+    This is for the case when scale is a (positive) power of 2 and zeropt is 0. It is a 1:1 transformation of
     a Quant to an Activation.
 
     As an optimization, this is not called when the input is constant.
-
-    UPDATE: this is also called when scale is scalar and power of 2, not just 1.
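+
+    For example (hypothetical values): a Quant node with scale = 0.25, zeropt = 0, bitwidth = 8 and a signed output
+    satisfies np.frexp(0.25) == (0.5, -1), so the transform below computes integer = bitwidth + exp - 1 = 8 - 1 - 1 = 6
+    and the node collapses to a linear Activation with an ap_fixed<8, 6>-style precision.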
-    '''
+    """
 
     def match(self, node):
         # only matches after the other inputs are already folded
         is_match = (
             isinstance(node, Quant)
             and not isinstance(node.get_input_node(node.inputs[0]), Constant)
             and not node.get_input_node(node.inputs[1])
             and not node.get_input_node(node.inputs[2])
             and not node.get_input_node(node.inputs[3])
         )
 
-        # Only match if the scale is 1s and the zero-point is 0s
+        # Only match if the scale is power of 2 and the zero-point is 0s
         if is_match:  # to make sure this is a quant node with inputs
-            scale = node.get_attr("scale")
-            bias = node.get_attr("zeropt")
+            scale = node.get_attr('scale')
+            bias = node.get_attr('zeropt')
             is_match = is_match and (bias == np.zeros_like(bias)).all()
 
             # check if scale is ones-like or a power of two
             scale_unit_or_po2 = (scale == np.ones_like(scale)).all()
             if not scale_unit_or_po2 and _ALSO_MATCH_PO2:
-                sqscale = np.squeeze(scale)
-                if not sqscale.shape:
-                    # not an array
-                    mantissa, _ = np.frexp(sqscale)
+                # This optimization only works if all scales are the same
+                if np.all(scale[0] == scale):
+                    mantissa, _ = np.frexp(scale[0])
                     scale_unit_or_po2 = mantissa == 0.5
 
             is_match = is_match and scale_unit_or_po2
 
         return is_match
 
     def transform(self, model, node):
-        '''
+        """
         Change quant node to Activation
-        '''
-        input_shape = node.get_input_variable().shape
-
-        n_in = np.prod(input_shape)
+        """
 
-        rounding_mode = node.get_attr("rounding_mode")
-        narrow = node.get_attr("narrow")
-        signed = node.get_attr("signed")
-        bitwidth = node.get_attr("bitwidth")
+        rounding_mode = node.get_attr('rounding_mode')
+        narrow = node.get_attr('narrow')
+        signed = node.get_attr('signed')
+        bitwidth = node.get_attr('bitwidth')
         integer = bitwidth
-        scale = node.get_attr("scale")
+        scale = node.get_attr('scale')
         if _ALSO_MATCH_PO2 and not (scale == np.ones_like(scale)).all():
-            _, exp = np.frexp(np.squeeze(scale))
+            _, exp = np.frexp(scale[0])
             integer = bitwidth + exp - 1
 
         precision, quantizer = _calculate_precision_quantizer(bitwidth, integer, signed, narrow, rounding_mode)
 
         attributes = {k: node.attributes.get(k, None) for k in _base_attributes}
-        attributes.update({'activation': 'linear', 'quant_precision': precision, 'quantizer': quantizer, 'n_in': n_in})
+        attributes.update({'activation': 'linear', 'quant_precision': precision, 'quantizer': quantizer})
 
         new_node = model.make_node(Activation, f'{node.name}_act', attributes, [node.inputs[0]], [x for x in node.outputs])
         new_node.get_output_variable().type.precision = precision
@@ -143,10 +135,9 @@ def transform(self, model, node):
 
 
 class FuseQuantWithConstant(OptimizerPass):
-    '''
-    This is for the case when scale is 1 and zeropt is 0. It directly applies the quantization to a constant.
-    UPDATE: this is also called when scale is scalar and power of 2, not just 1.
-    '''
+    """
+    This is for the case when scale is a positive power of 2 and zeropt is 0.
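+
+    For example (illustrative): np.frexp(0.25) returns (0.5, -1), and a mantissa of exactly 0.5 is how the
+    matching below recognizes a power-of-2 scale; the exponent then only shifts the fixed-point integer width.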
+    """
 
     def match(self, node):
         # only matches after the other inputs are already folded
         is_match = (
             isinstance(node, Quant)
             and isinstance(node.get_input_node(node.inputs[0]), Constant)
             and not node.get_input_node(node.inputs[1])
             and not node.get_input_node(node.inputs[2])
             and not node.get_input_node(node.inputs[3])
         )
 
-        # Only match if the scale is 1s and the zero-point is 0s
+        # Only match if the scale is power of 2 and the zero-point is 0s
         if is_match:  # to make sure this is a quant node with inputs
-            scale = node.get_attr("scale")
-            bias = node.get_attr("zeropt")
+            scale = node.get_attr('scale')
+            bias = node.get_attr('zeropt')
             is_match = is_match and (bias == np.zeros_like(bias)).all()
 
             # check if scale is ones-like or a power of two
             scale_unit_or_po2 = (scale == np.ones_like(scale)).all()
             if not scale_unit_or_po2 and _ALSO_MATCH_PO2:
-                sqscale = np.squeeze(scale)
-                if not sqscale.shape:
-                    # not an array
-                    mantissa, _ = np.frexp(sqscale)
+                # This optimization only works if all scales are the same
+                if np.all(scale[0] == scale):
+                    mantissa, _ = np.frexp(scale[0])
                     scale_unit_or_po2 = mantissa == 0.5
 
             is_match = is_match and scale_unit_or_po2
 
         return is_match
 
     def transform(self, model, node):
-        '''
+        """
         Fuse Quant with Constant.
-        '''
+        """
 
-        rounding_mode = node.get_attr("rounding_mode")
-        narrow = node.get_attr("narrow")
-        signed = node.get_attr("signed")
-        bitwidth = node.get_attr("bitwidth")
+        rounding_mode = node.get_attr('rounding_mode')
+        narrow = node.get_attr('narrow')
+        signed = node.get_attr('signed')
+        bitwidth = node.get_attr('bitwidth')
         integer = bitwidth
-        scale = node.get_attr("scale")
+        scale = node.get_attr('scale')
         if _ALSO_MATCH_PO2 and not (scale == np.ones_like(scale)).all():
             _, exp = np.frexp(np.squeeze(scale))
             integer = bitwidth + exp - 1
@@ -195,11 +185,9 @@ def transform(self, model, node):
         precision, quantizer = _calculate_precision_quantizer(bitwidth, integer, signed, narrow, rounding_mode)
 
         const_node = node.get_input_node(node.inputs[0])
-        const_node.set_attr("quant_precision", precision)
-        const_node.set_attr("quantizer", quantizer)
-
-        # reinitialize (which also runs quantization if quantizer exists)
-        const_node.initialize()
+        const_node.set_attr('quant_precision', precision)
+        const_node.set_attr('quantizer', quantizer)
+        const_node.get_output_variable().type.precision = precision
 
         # remove the Quant node
         model.remove_node(node, rewire=True)
@@ -208,12 +196,12 @@ def transform(self, model, node):
 
 
 class QuantToAlphaActivationAlpha(OptimizerPass):
-    '''
+    """
     This is for the case when scale is not 1 or zeropt is not 0. It is a a 1:3 transformation of
     a Quant to an ApplyAlpha (to scale), Activatio, ApplyAlpho (to rescale).
 
-    As an optimization, this is not called when the input is constant.
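+
+    For example (hypothetical values): scale = 0.3 and zeropt = 2 become ApplyAlpha(x / 0.3 + 2), a linear
+    quantization Activation, and ApplyAlpha(0.3 * y - 0.6), matching firstscale = 1 / scale, firstbias = bias,
+    rescale = scale and rebias = -bias * scale in the transform below.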
- ''' + NOTE: It needs to be scheduled after QuantToActivation (or we need to make the match criteria stricter) + """ def match(self, node): # only matches after the other inputs are already folded @@ -224,33 +212,24 @@ def match(self, node): and not node.get_input_node(node.inputs[2]) and not node.get_input_node(node.inputs[3]) ) - - if is_match: # to make sure this is a quant node with inputs - scale = node.get_attr("scale") - bias = node.get_attr("zeropt") - is_match = is_match and ((scale != np.ones_like(scale)).any() or (bias != np.zeros_like(bias)).any()) return is_match def transform(self, model, node): - ''' + """ Change quant node to ApplyAlhpa, Activation, ApplyAlpha - ''' + """ # Do the Activation as in the simple case - input_shape = node.get_input_variable().shape - - n_in = np.prod(input_shape) - - rounding_mode = node.get_attr("rounding_mode") - narrow = node.get_attr("narrow") - signed = node.get_attr("signed") - bitwidth = node.get_attr("bitwidth") + rounding_mode = node.get_attr('rounding_mode') + narrow = node.get_attr('narrow') + signed = node.get_attr('signed') + bitwidth = node.get_attr('bitwidth') precision, quantizer = _calculate_precision_quantizer(bitwidth, bitwidth, signed, narrow, rounding_mode) attributes = {k: node.attributes.get(k, None) for k in _base_attributes} - attributes.update({'activation': 'linear', 'quant_precision': precision, 'quantizer': quantizer, 'n_in': n_in}) + attributes.update({'activation': 'linear', 'quant_precision': precision, 'quantizer': quantizer}) new_node = model.make_node(Activation, f'{node.name}_act', attributes, [node.inputs[0]], [x for x in node.outputs]) new_node.get_output_variable().type.precision = precision @@ -258,27 +237,25 @@ def transform(self, model, node): # but now add the ApplyAlhpas before and after - scale = node.get_attr("scale") - bias = node.get_attr("zeropt") + scale = node.get_attr('scale') + bias = node.get_attr('zeropt') attributes_scale = {k: node.attributes.get(k, None) for k in _base_attributes} - attributes_scale.update({'n_in': n_in, 'n_out': n_in, 'n_filt': -1}) attributes_rescale = {k: node.attributes.get(k, None) for k in _base_attributes} - attributes_rescale.update({'n_in': n_in, 'n_out': n_in, 'n_filt': -1}) firstscale = 1 / scale firstbias = bias - attributes_scale["scale_data"] = firstscale - attributes_scale["bias_data"] = firstbias + attributes_scale['scale_data'] = firstscale + attributes_scale['bias_data'] = firstbias scale_node = model.make_node(ApplyAlpha, node.name + '_scale', attributes_scale, [node.inputs[0]]) model.insert_node(scale_node) rescale = scale rebias = -bias * scale - attributes_rescale["scale_data"] = rescale - attributes_rescale["bias_data"] = rebias + attributes_rescale['scale_data'] = rescale + attributes_rescale['bias_data'] = rebias rescale_node = model.make_node(ApplyAlpha, node.name + '_rescale', attributes_rescale, [new_node.outputs[0]]) model.insert_node(rescale_node) @@ -287,12 +264,12 @@ def transform(self, model, node): class ConstQuantToConstAlpha(OptimizerPass): - ''' + """ This is for the case when scale is not 1 or zeropt is not 0. It is a a 1:3 transformation of a Quant to an ApplyAlpha (to scale), Activation, ApplyAlpho (to unscale), but an input consts allows for optimization, so the ApplyAlpha (to scale), Activation are optimized away right away. 
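+
+    For example (hypothetical values): a Constant of 5.0 quantized with scale = 0.3 and zeropt = 0 is replaced by
+    a Constant holding the quantized 5.0 / 0.3 plus an ApplyAlpha that multiplies by 0.3 downstream, so the
+    rescaled product recovers the (quantized) original value.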
- ''' + """ def match(self, node): # only matches after the other inputs are already folded @@ -305,39 +282,37 @@ def match(self, node): ) if is_match: # to make sure this is a quant node with inputs - scale = node.get_attr("scale") - bias = node.get_attr("zeropt") + scale = node.get_attr('scale') + bias = node.get_attr('zeropt') is_match = is_match and ((scale != np.ones_like(scale)).any() or (bias != np.zeros_like(bias)).any()) return is_match def transform(self, model, node): - ''' + """ Change Constant + Quant node to Constant, ApplyAlpha - ''' + """ # Do the Activation as in the simple case - input_shape = node.get_input_variable().shape - - n_in = np.prod(input_shape) + n_in = node.get_input_variable().size() - rounding_mode = node.get_attr("rounding_mode") - narrow = node.get_attr("narrow") - signed = node.get_attr("signed") - bitwidth = node.get_attr("bitwidth") + rounding_mode = node.get_attr('rounding_mode') + narrow = node.get_attr('narrow') + signed = node.get_attr('signed') + bitwidth = node.get_attr('bitwidth') precision, quantizer = _calculate_precision_quantizer(bitwidth, bitwidth, signed, narrow, rounding_mode) const_node = node.get_input_node(node.inputs[0]) - scale = node.get_attr("scale") - bias = node.get_attr("zeropt") + scale = node.get_attr('scale') + bias = node.get_attr('zeropt') # caclucate the new value - new_val = const_node.value / scale + bias + new_val = const_node.get_attr('value') / scale + bias const_node.set_attr('value', new_val) - const_node.set_attr("quant_precision", precision) - const_node.set_attr("quantizer", quantizer) + const_node.set_attr('quant_precision', precision) + const_node.set_attr('quantizer', quantizer) # reinitialize (which also runs quantization if quantizer exists) const_node.initialize() @@ -347,8 +322,8 @@ def transform(self, model, node): rescale = scale rebias = -bias * scale - attributes_rescale["scale_data"] = rescale - attributes_rescale["bias_data"] = rebias + attributes_rescale['scale_data'] = rescale + attributes_rescale['bias_data'] = rebias rescale_node = model.make_node( ApplyAlpha, node.name + '_rescale', attributes_rescale, [x for x in node.inputs], [x for x in node.outputs] @@ -359,25 +334,25 @@ def transform(self, model, node): def _calculate_precision_quantizer(bitwidth, integer, signed, narrow, rounding_mode): - ''' + """ A function to determine the precision and quantizer - ''' - if rounding_mode == "ROUND": - bn_round = "AP_RND_CONV" - elif rounding_mode == "FLOOR": - bn_round = "AP_TRN" + """ + if rounding_mode == 'ROUND': + bn_round = 'AP_RND_CONV' + elif rounding_mode == 'FLOOR': + bn_round = 'AP_TRN' else: raise NotImplementedError( - f"Rounding mode {rounding_mode} not supported in Quant node. Only ROUND and FLOOR supported." + f'Rounding mode {rounding_mode} not supported in Quant node. Only ROUND and FLOOR supported.' 
)
 
     if narrow and not signed:
-        raise NotImplementedError("Narrow mode is only supported for singed numbers.")
+        raise NotImplementedError('Narrow mode is only supported for signed numbers.')
 
     if narrow:
-        bn_sat = "AP_SAT_SYM"
+        bn_sat = 'AP_SAT_SYM'
     else:
-        bn_sat = "AP_SAT"
+        bn_sat = 'AP_SAT'
 
     bitwidth = math.ceil(bitwidth)
     integer = math.ceil(integer)
 
diff --git a/hls4ml/model/quantizers.py b/hls4ml/model/quantizers.py
new file mode 100644
index 0000000000..c0a5869d5b
--- /dev/null
+++ b/hls4ml/model/quantizers.py
@@ -0,0 +1,261 @@
+"""
+Quantizer for the Quant node, after scale and zeropoint have been extracted
+(unless scale is a power of 2, if doing special case po2)
+
+This is based on the sample implementation in finn-base
+"""
+
+import numpy as np
+import tensorflow as tf
+from qkeras.quantizers import get_quantizer
+
+from hls4ml.model.types import (
+    ExponentPrecisionType,
+    FixedPrecisionType,
+    IntegerPrecisionType,
+    RoundingMode,
+    SaturationMode,
+    XnorPrecisionType,
+)
+
+
+class Quantizer:
+    """
+    Base class for representing quantizers in hls4ml.
+
+    Subclasses of ``Quantizer`` are expected to wrap the quantizers of upstream tools (e.g., QKeras).
+
+    Args:
+        bits (int): Total number of bits used by the quantizer.
+        hls_type (NamedType): The hls4ml type used by the quantizer.
+    """
+
+    def __init__(self, bits, hls_type):
+        self.bits = bits
+        self.hls_type = hls_type
+
+    def __call__(self, data):
+        raise NotImplementedError
+
+
+class BinaryQuantizer(Quantizer):
+    """Quantizer that quantizes to 0 and 1 (``bits=1``) or -1 and 1 (``bits==2``).
+
+    Args:
+        bits (int, optional): Number of bits used by the quantizer. Defaults to 2.
+
+    Raises:
+        Exception: Raised if ``bits>2``
+    """
+
+    def __init__(self, bits=2):
+        if bits == 1:
+            hls_type = XnorPrecisionType()
+        elif bits == 2:
+            hls_type = IntegerPrecisionType(width=2)
+        else:
+            raise Exception(f'BinaryQuantizer supports 1 or 2 bits, but called with bits={bits}')
+        super().__init__(bits, hls_type)
+
+    def __call__(self, data):
+        zeros = np.zeros_like(data)
+        ones = np.ones_like(data)
+        quant_data = data
+        if self.bits == 1:
+            quant_data = np.where(data > 0, ones, zeros).astype('int')
+        if self.bits == 2:
+            quant_data = np.where(data > 0, ones, -ones)
+        return quant_data
+
+
+class TernaryQuantizer(Quantizer):
+    """Quantizer that quantizes to -1, 0 and 1."""
+
+    def __init__(self):
+        super().__init__(2, IntegerPrecisionType(width=2))
+
+    def __call__(self, data):
+        zeros = np.zeros_like(data)
+        ones = np.ones_like(data)
+        return np.where(data > 0.5, ones, np.where(data <= -0.5, -ones, zeros))
+
+
+class QKerasQuantizer(Quantizer):
+    """Wrapper around QKeras quantizers.
+
+    Args:
+        config (dict): Config of the QKeras quantizer to wrap.
+    """
+
+    def __init__(self, config):
+        self.quantizer_fn = get_quantizer(config)
+        self.alpha = config['config'].get('alpha', None)
+        if config['class_name'] == 'quantized_bits':
+            self.bits = config['config']['bits']
+            self.hls_type = self._get_type(config)
+        # ! includes stochastic_ternary
+        elif 'ternary' in config['class_name']:
+            self.bits = 2
+            self.hls_type = IntegerPrecisionType(width=2, signed=True)
+        # ! 
includes stochastic_binary + elif 'binary' in config['class_name']: + self.bits = 1 + self.hls_type = XnorPrecisionType() + else: + print("Unsupported quantizer: " + config['class_name']) + self.bits = 16 + self.hls_type = FixedPrecisionType(width=16, integer=6, signed=True) + + def __call__(self, data): + tf_data = tf.convert_to_tensor(data) + return self.quantizer_fn(tf_data).numpy() + # return self.quantizer_fn(data) + + def _get_type(self, quantizer_config): + width = quantizer_config['config']['bits'] + integer = quantizer_config['config'].get('integer', 0) + if quantizer_config['class_name'] == 'quantized_po2': + return ExponentPrecisionType(width=width, signed=True) + if width == integer: + if width == 1: + return XnorPrecisionType() + else: + return IntegerPrecisionType(width=width, signed=True) + else: + return FixedPrecisionType(width=width, integer=integer + 1, signed=True) + + +class QKerasBinaryQuantizer(Quantizer): + """Wrapper around QKeras binary quantizer. + + Args: + config (dict): Config of the QKeras quantizer to wrap. + """ + + def __init__(self, config, xnor=False): + self.bits = 1 if xnor else 2 + self.hls_type = XnorPrecisionType() if xnor else IntegerPrecisionType(width=2, signed=True) + self.alpha = config['config']['alpha'] + # Use the QKeras quantizer to handle any stochastic / alpha stuff + self.quantizer_fn = get_quantizer(config) + # Then we use our BinaryQuantizer to convert to '0,1' format + self.binary_quantizer = BinaryQuantizer(1) if xnor else BinaryQuantizer(2) + + def __call__(self, data): + x = tf.convert_to_tensor(data) + y = self.quantizer_fn(x).numpy() + return self.binary_quantizer(y) + + +class QKerasPO2Quantizer(Quantizer): + """Wrapper around QKeras power-of-2 quantizers. + + Args: + config (dict): Config of the QKeras quantizer to wrap. + """ + + def __init__(self, config): + self.bits = config['config']['bits'] + self.quantizer_fn = get_quantizer(config) + self.hls_type = ExponentPrecisionType(width=self.bits, signed=True) + + def __call__(self, data): + # Weights are quantized to nearest power of two + x = tf.convert_to_tensor(data) + y = self.quantizer_fn(x) + if hasattr(y, 'numpy'): + y = y.numpy() + return y + + +class QuantNodeQuantizer(Quantizer): + """ + This implements a quantizer for a FixedPrecisionType with width==integer + + This is based on the sample implementation in finn-base + """ + + def __init__(self, precision): + super().__init__(precision.width, precision) + if not isinstance(precision, FixedPrecisionType): + raise TypeError("QuantNodeQuantizer is only defined for FixedPrecisionType") + + def __call__(self, data): + """Apply the quantization on the data""" + + scale = 2 ** (self.hls_type.width - self.hls_type.integer) + + data = data * scale # (not using *= to avoid modifying data) + # Clamping + min_int_val = self._min_int(self.hls_type.signed, self.hls_type.saturation_mode, self.bits) + max_int_val = self._max_int(self.hls_type.signed, self.bits) + data = np.where(data > max_int_val, max_int_val, data) + data = np.where(data < min_int_val, min_int_val, data) + # Rounding + rounding_fx = self._resolve_rounding_mode(self.hls_type.rounding_mode) + return rounding_fx(data) / scale + + @staticmethod + def _min_int(signed: bool, saturation_mode: str, bit_width: int) -> int: + """Compute the minimum integer representable by a given number of bits. + Args: + signed (bool): Indicates whether the represented integer is signed or not. 
+            saturation_mode (SaturationMode): Indicates the saturation mode used (AP_SAT_SYM or AP_SAT)
+            bit_width (int): Number of bits available for the representation.
+        Returns:
+            int: Minimum integer that can be represented according to
+            the input arguments.
+        Examples:
+            >>> min_int(signed=True, saturation_mode='AP_SAT_SYM', bit_width=8)
+            int(-127)
+            >>> min_int(signed=False, saturation_mode='AP_SAT_SYM', bit_width=8)
+            int(0)
+            >>> min_int(signed=True, saturation_mode='AP_SAT', bit_width=8)
+            int(-128)
+            >>> min_int(signed=False, saturation_mode='AP_SAT_SYM', bit_width=8)
+            int(0)
+        """
+        if saturation_mode not in (SaturationMode.SAT_SYM, SaturationMode.SAT):
+            raise ValueError(f"Saturation mode {saturation_mode} not supported. Only AP_SAT_SYM, AP_SAT supported")
+        if signed and saturation_mode == SaturationMode.SAT_SYM:
+            value = -(2 ** (bit_width - 1)) + 1
+        elif signed:
+            value = -(2 ** (bit_width - 1))
+        else:
+            value = 0
+        return value
+
+    @staticmethod
+    def _max_int(signed: bool, bit_width: int) -> int:
+        """Compute the maximum integer representable by a given number of bits.
+        (Note, narrow and unsigned is not supported by the implementation, so saturation mode is not used)
+        Args:
+            signed (bool): Indicates whether the represented integer is signed or not.
+            bit_width (int): Number of bits available for the representation.
+        Returns:
+            Tensor: Maximum integer that can be represented according to
+            the input arguments.
+        Examples:
+            >>> max_int(signed=True, bit_width=8)
+            int(127)
+            >>> max_int(signed=False, bit_width=8)
+            int(255)
+        """
+        if not signed:
+            value = (2**bit_width) - 1
+        else:
+            value = (2 ** (bit_width - 1)) - 1
+        return value
+
+    @staticmethod
+    def _resolve_rounding_mode(mode):
+        """Resolve the rounding mode of Quant and Trunc ops
+        to the corresponding numpy functions."""
+        if mode == RoundingMode.RND_CONV:
+            return np.round
+        # elif mode_string == "CEIL":   # not supported
+        #     return np.ceil
+        elif mode == RoundingMode.TRN:
+            return np.floor
+        else:
+            raise ValueError(f"Rounding mode {mode} not supported.")
diff --git a/hls4ml/model/types.py b/hls4ml/model/types.py
index fc1cd98f19..8c182f4cca 100644
--- a/hls4ml/model/types.py
+++ b/hls4ml/model/types.py
@@ -8,162 +8,6 @@
 from enum import Enum
 
 import numpy as np
-import tensorflow as tf
-from qkeras.quantizers import get_quantizer
-
-# region Quantizer definition
-
-
-class Quantizer:
-    """
-    Base class for representing quantizers in hls4ml.
-
-    Subclasses of ``Quantizer`` are expected to wrap the quantizers of upstream tools (e.g., QKeras).
-
-    Args:
-        bits (int): Total number of bits used by the quantizer.
-        hls_type (NamedType): The hls4ml type used by the quantizer.
-    """
-
-    def __init__(self, bits, hls_type):
-        self.bits = bits
-        self.hls_type = hls_type
-
-    def __call__(self, data):
-        raise NotImplementedError
-
-
-class BinaryQuantizer(Quantizer):
-    """Quantizer that quantizes to 0 and 1 (``bits=1``) or -1 and 1 (``bits==2``).
-
-    Args:
-        bits (int, optional): Number of bits used by the quantizer. Defaults to 2.
- - Raises: - Exception: Raised if ``bits>2`` - """ - - def __init__(self, bits=2): - if bits == 1: - hls_type = XnorPrecisionType() - elif bits == 2: - hls_type = IntegerPrecisionType(width=2) - else: - raise Exception(f'BinaryQuantizer suppots 1 or 2 bits, but called with bits={bits}') - super().__init__(bits, hls_type) - - def __call__(self, data): - zeros = np.zeros_like(data) - ones = np.ones_like(data) - quant_data = data - if self.bits == 1: - quant_data = np.where(data > 0, ones, zeros).astype('int') - if self.bits == 2: - quant_data = np.where(data > 0, ones, -ones) - return quant_data - - -class TernaryQuantizer(Quantizer): - """Quantizer that quantizes to -1, 0 and 1.""" - - def __init__(self): - super().__init__(2, IntegerPrecisionType(width=2)) - - def __call__(self, data): - zeros = np.zeros_like(data) - ones = np.ones_like(data) - return np.where(data > 0.5, ones, np.where(data <= -0.5, -ones, zeros)) - - -class QKerasQuantizer(Quantizer): - """Wrapper around QKeras quantizers. - - Args: - config (dict): Config of the QKeras quantizer to wrap. - """ - - def __init__(self, config): - self.quantizer_fn = get_quantizer(config) - self.alpha = config['config'].get('alpha', None) - if config['class_name'] == 'quantized_bits': - self.bits = config['config']['bits'] - self.hls_type = self._get_type(config) - # ! includes stochastic_ternary - elif 'ternary' in config['class_name']: - self.bits = 2 - self.hls_type = IntegerPrecisionType(width=2, signed=True) - # ! includes stochastic_binary - elif 'binary' in config['class_name']: - self.bits = 1 - self.hls_type = XnorPrecisionType() - else: - print("Unsupported quantizer: " + config['class_name']) - self.bits = 16 - self.hls_type = FixedPrecisionType(width=16, integer=6, signed=True) - - def __call__(self, data): - tf_data = tf.convert_to_tensor(data) - return self.quantizer_fn(tf_data).numpy() - # return self.quantizer_fn(data) - - def _get_type(self, quantizer_config): - width = quantizer_config['config']['bits'] - integer = quantizer_config['config'].get('integer', 0) - if quantizer_config['class_name'] == 'quantized_po2': - return ExponentPrecisionType(width=width, signed=True) - if width == integer: - if width == 1: - return XnorPrecisionType() - else: - return IntegerPrecisionType(width=width, signed=True) - else: - return FixedPrecisionType(width=width, integer=integer + 1, signed=True) - - -class QKerasBinaryQuantizer(Quantizer): - """Wrapper around QKeras binary quantizer. - - Args: - config (dict): Config of the QKeras quantizer to wrap. - """ - - def __init__(self, config, xnor=False): - self.bits = 1 if xnor else 2 - self.hls_type = XnorPrecisionType() if xnor else IntegerPrecisionType(width=2, signed=True) - self.alpha = config['config']['alpha'] - # Use the QKeras quantizer to handle any stochastic / alpha stuff - self.quantizer_fn = get_quantizer(config) - # Then we use our BinaryQuantizer to convert to '0,1' format - self.binary_quantizer = BinaryQuantizer(1) if xnor else BinaryQuantizer(2) - - def __call__(self, data): - x = tf.convert_to_tensor(data) - y = self.quantizer_fn(x).numpy() - return self.binary_quantizer(y) - - -class QKerasPO2Quantizer(Quantizer): - """Wrapper around QKeras power-of-2 quantizers. - - Args: - config (dict): Config of the QKeras quantizer to wrap. 
- """ - - def __init__(self, config): - self.bits = config['config']['bits'] - self.quantizer_fn = get_quantizer(config) - self.hls_type = ExponentPrecisionType(width=self.bits, signed=True) - - def __call__(self, data): - # Weights are quantized to nearest power of two - x = tf.convert_to_tensor(data) - y = self.quantizer_fn(x) - if hasattr(y, 'numpy'): - y = y.numpy() - return y - - -# endregion # region Precision types diff --git a/test/pytest/test_qonnx.py b/test/pytest/test_qonnx.py index 535bffb0da..2c314c13ca 100644 --- a/test/pytest/test_qonnx.py +++ b/test/pytest/test_qonnx.py @@ -88,7 +88,7 @@ def test_tfc_2w2a(tfc_2w2a_model, backend): model = tfc_2w2a_model ishape = (1, 1, 28, 28) - X = np.random.uniform(low=-1, high=+1, size=np.product(ishape)).reshape(ishape).astype(np.float32) + X = np.random.uniform(low=-1, high=+1, size=np.prod(ishape)).reshape(ishape).astype(np.float32) idict = {model.graph.input[0].name: X} y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] @@ -111,7 +111,7 @@ def test_cnv_2w2a(cnv_2w2a_model, backend): model = cnv_2w2a_model ishape = (1, 32, 32, 3) - X = np.random.uniform(low=-1, high=+1, size=np.product(ishape)).reshape(ishape).astype(np.float32) + X = np.random.uniform(low=-1, high=+1, size=np.prod(ishape)).reshape(ishape).astype(np.float32) idict = {model.graph.input[0].name: X} y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] @@ -137,7 +137,7 @@ def test_jet_tagging(jettagging_model, backend): # Execute QONNX model inference # TODO make the test bigger ishape = (1, 16) - X = np.random.uniform(low=-1, high=+1, size=np.product(ishape)).reshape(ishape).astype(np.float32) + X = np.random.uniform(low=-1, high=+1, size=np.prod(ishape)).reshape(ishape).astype(np.float32) idict = {model.graph.input[0].name: X} y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] From 4d529756337961228216dc788aa1f8f79eb76cb3 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Mon, 5 Feb 2024 14:34:21 -0600 Subject: [PATCH 047/272] snapshot before removing redundant precision attributes --- .../model/optimizer/passes/conv_to_convxd.py | 25 +++--- .../optimizer/passes/matmul_const_to_dense.py | 24 +++--- hls4ml/model/optimizer/passes/merge_const.py | 79 ++++++++++--------- hls4ml/model/optimizer/passes/quant_opt.py | 45 +++++------ 4 files changed, 85 insertions(+), 88 deletions(-) diff --git a/hls4ml/model/optimizer/passes/conv_to_convxd.py b/hls4ml/model/optimizer/passes/conv_to_convxd.py index 28f4d4c0bd..efc5f3e89b 100644 --- a/hls4ml/model/optimizer/passes/conv_to_convxd.py +++ b/hls4ml/model/optimizer/passes/conv_to_convxd.py @@ -48,12 +48,13 @@ def transform(self, model, node): """Convert Conv with constant to a Conv1D or Conv2D layer""" weight_node = node.get_input_node(node.inputs[1]) - weight_precision = weight_node.get_attr("quant_precision") + weight_precision = weight_node.get_attr('quant_precision') + weight_data = weight_node.attributes['value'] bias_node = None bias_precision = None if len(node.inputs) == 3: bias_node = node.get_input_node(node.inputs[2]) - bias_precision = bias_node.get_attr("quant_precision") + bias_precision = bias_node.get_attr('quant_precision') # creating the attributes attributes = {k: node.attributes.get(k, None) for k in _base_attributes} @@ -61,24 +62,24 @@ def transform(self, model, node): # The ConvxD nodes expect the weight data to be in a different format, not (M, k1.., C) if node.attributes['n_dim'] == 1: newtype = Conv1D - attributes["weight_data"] = 
np.transpose(weight_node.value, (1, 2, 0)) + attributes['weight_data'] = np.transpose(weight_data, (1, 2, 0)) else: newtype = Conv2D - attributes["weight_data"] = np.transpose(weight_node.value, (1, 2, 3, 0)) - attributes["weight_precision"] = weight_precision - attributes["weight_quantizer"] = weight_node.get_attr("quantizer") + attributes['weight_data'] = np.transpose(weight_data, (1, 2, 3, 0)) + attributes['weight_precision'] = weight_precision + attributes['weight_quantizer'] = weight_node.get_attr('quantizer') if bias_node: - attributes["bias_data"] = bias_node.value - attributes["bias_precision"] = bias_precision - attributes["bias_quantizer"] = bias_node.get_attr("quantizer") + attributes['bias_data'] = bias_node.attributes['value'] + attributes['bias_precision'] = bias_precision + attributes['bias_quantizer'] = bias_node.get_attr('quantizer') else: - attributes["bias_data"] = np.zeros(attributes['n_filt']) - attributes["bias_precision"] = IntegerPrecisionType(1, False) + attributes['bias_data'] = np.zeros(attributes['n_filt']) + attributes['bias_precision'] = IntegerPrecisionType(1, False) # making new node new_node = model.make_node( - newtype, f"{newtype.__name__}_{node.name}", attributes, [node.inputs[0]], [x for x in node.outputs] + newtype, f'{newtype.__name__}_{node.name}', attributes, [node.inputs[0]], [x for x in node.outputs] ) # removing and replacing old nodes diff --git a/hls4ml/model/optimizer/passes/matmul_const_to_dense.py b/hls4ml/model/optimizer/passes/matmul_const_to_dense.py index 82c7b56313..2a89ea0130 100644 --- a/hls4ml/model/optimizer/passes/matmul_const_to_dense.py +++ b/hls4ml/model/optimizer/passes/matmul_const_to_dense.py @@ -26,30 +26,32 @@ def transform(self, model, node): const_node = node.get_input_node(node.inputs[1]) other_var = node.get_input_variable(node.inputs[0]) - weight_precision = const_node.get_attr("quant_precision") - weight_quantizer = const_node.get_attr("quantizer") + weight_data = const_node.attributes['value'] + weight_precision = const_node.get_attr('quant_precision') + weight_quantizer = const_node.get_attr('quantizer') in_shape = other_var.shape n_in = np.prod(in_shape) - out_shape = list(in_shape[:-1]) + [const_node.value.shape[-1]] + out_shape = list(in_shape[:-1]) + [weight_data.shape[-1]] n_out = np.prod(out_shape) # creating the attributes attributes = {k: node.attributes.get(k, None) for k in _base_attributes} attributes.update( { - "weight_data": const_node.value, - "weight_precision": weight_precision, - "weight_quantizer": weight_quantizer, - "bias_data": np.zeros(out_shape), - "bias_precision": IntegerPrecisionType(1, False), - "n_in": n_in, - "n_out": n_out, + 'weight_data': weight_data, + 'weight_precision': weight_precision, + 'weight_quantizer': weight_quantizer, + 'bias_data': np.zeros(out_shape), + 'bias_precision': IntegerPrecisionType(1, False), + 'have_bias': False, + 'n_in': n_in, + 'n_out': n_out, } ) # making new node - new_dense = model.make_node(Dense, f"Dense_{node.name}", attributes, [node.inputs[0]], [x for x in node.outputs]) + new_dense = model.make_node(Dense, f'Dense_{node.name}', attributes, [node.inputs[0]], [x for x in node.outputs]) # removing and replacing old nodes model.remove_node(const_node, rewire=False) diff --git a/hls4ml/model/optimizer/passes/merge_const.py b/hls4ml/model/optimizer/passes/merge_const.py index adc7dff093..4b13982259 100644 --- a/hls4ml/model/optimizer/passes/merge_const.py +++ b/hls4ml/model/optimizer/passes/merge_const.py @@ -6,8 +6,6 @@ _base_attributes = ('Trace', 
'reuse_factor', 'n_in')
 
-# TODO This doesn't yet support quantization in the constants
-
 
 class MergeTwoConstants(OptimizerPass):
     """Merge of two constants makes another constant"""
@@ -23,15 +21,18 @@ def match(self, node):
 
     def transform(self, model, node):
         """
-        Merge of two constants makes another constant
+        Merge of two constants makes another constant.
+
+        Note: full precision is used in the calculation, and precision is not propagated.
         """
         const_node0 = node.get_input_node(node.inputs[0])
         const_node1 = node.get_input_node(node.inputs[1])
 
-        val0 = const_node0.value
-        val1 = const_node1.value
+        val0 = const_node0.attributes['value']
+        val1 = const_node1.attributes['value']
 
-        op = node.attributes["op"]
+        op = node.attributes['op']
         if op in ('add', 'sum'):
             new_val = val0 + val1
         elif op == 'sub':
             new_val = val0 - val1
         elif op == 'mul':
             new_val = val0 * val1
         elif op == 'div':
             new_val = val0 / val1
         elif op == 'average':
             new_val = np.mean(np.array([val0, val1]), axis=0)
         elif op == 'max':
             new_val = np.maximum(val0, val1)
         elif op == 'min':
             new_val = np.minimum(val0, val1)
         else:
-            raise RuntimeError(f"Unexpected op_type: {op}")
+            raise RuntimeError(f'Unexpected op_type: {op}')
 
-        quantizer = node.get_attr("quantizer")  # None if not defined
+        quantizer = node.get_attr('quantizer')  # None if not defined
+        const_node0.set_attr('quantizer', quantizer)  # overwrite the quantizer
         if quantizer:
-            const_node0.set_attr("quantizer", quantizer)
-        const_node0.set_attr("value", new_val)
+            const_node0.set_attr('quantizer', quantizer)
+
+        const_node0.set_attr('value', new_val)
 
-        quant_precision = node.get_attr("quant_precision")
+        quant_precision = node.get_attr('quant_precision')
         if quant_precision:
-            const_node0.set_attr("quant_precision", quant_precision)
+            const_node0.set_attr('quant_precision', quant_precision)
 
         # reinitialize (which also runs quantization if quantizer exists)
         const_node0.initialize()
@@ -75,7 +78,7 @@ class MergeToApplyAlpha(OptimizerPass):
     def match(self, node):
         is_match = (
             isinstance(node, Merge)
-            and node.attributes["op"] in ("add", "sum", "sub", "mul")  # Div is separate
+            and node.attributes['op'] in ('add', 'sum', 'sub', 'mul')  # Div is separate
             and (
                 isinstance(node.get_input_node(node.inputs[0]), Constant)
                 != isinstance(node.get_input_node(node.inputs[1]), Constant)
@@ -103,21 +106,21 @@ def transform(self, model, node):
         bias_precision = None
         bias_quantizer = None
 
-        op = node.attributes["op"]
+        op = node.attributes['op']
         if op in ('add', 'sum'):
             scale = np.array(1)
-            bias = const_node.value
-            bias_precision = const_node.get_attr("quant_precision")
-            bias_quantizer = const_node.get_attr("quantizer")
+            bias = const_node.attributes['value']
+            bias_precision = const_node.get_attr('quant_precision')
+            bias_quantizer = const_node.get_attr('quantizer')
         elif op == 'sub':
             if node1const:
                 scale = np.array(1)
-                bias = -const_node.value
+                bias = -const_node.attributes['value']
             else:
                 scale = np.array(-1)
-                bias = const_node.value
-            bias_precision = const_node.get_attr("quant_precision")
-            bias_quantizer = const_node.get_attr("quantizer")
+                bias = const_node.attributes['value']
+            bias_precision = const_node.get_attr('quant_precision')
+            bias_quantizer = const_node.get_attr('quantizer')
             if bias_precision and not bias_precision.signed:
                 # need to add a bit
                 bias_precision.signed = 1
@@ -126,10 +129,10 @@ def transform(self, model, node):
                 bias_quantizer = QuantNodeQuantizer(bias_precision)
 
         elif op == 'mul':
-            scale = const_node.value
+            scale = const_node.attributes['value']
             bias = np.array(0)
-            scale_precision = const_node.get_attr("quant_precision")
-            scale_quantizer = const_node.get_attr("quantizer")
+            scale_precision = 
const_node.get_attr('quant_precision')
+            scale_quantizer = const_node.get_attr('quantizer')
 
         # because C++ doesn't do broadcasting, we may have to change the shapes of the scale and bias
         if scale.shape != tuple(input_shape) and np.squeeze(scale).shape != tuple(input_shape):
@@ -140,20 +143,20 @@ def transform(self, model, node):
         attributes = {k: node.attributes.get(k, None) for k in _base_attributes}
         attributes.update(
             {
-                "scale_data": scale,
-                "bias_data": bias,
-                "n_in": n_in,
-                "n_out": n_in,
-                "n_filt": -1,
-                "scale_precision": scale_precision,
-                "scale_quantizer": scale_quantizer,
-                "bias_precision": bias_precision,
-                "bias_quantizer": bias_quantizer,
+                'scale_data': scale,
+                'bias_data': bias,
+                'n_in': n_in,
+                'n_out': n_in,
+                'n_filt': -1,
+                'scale_precision': scale_precision,
+                'scale_quantizer': scale_quantizer,
+                'bias_precision': bias_precision,
+                'bias_quantizer': bias_quantizer,
             }
         )
 
         bn_layer = model.make_node(
-            ApplyAlpha, f"bn_{node.name}", attributes, [node.inputs[input_node_idx]], [x for x in node.outputs]
+            ApplyAlpha, f'bn_{node.name}', attributes, [node.inputs[input_node_idx]], [x for x in node.outputs]
         )
 
         model.remove_node(const_node, rewire=False)
@@ -172,7 +175,7 @@ class MergeToApplyAlphaDiv(OptimizerPass):
     def match(self, node):
         is_match = (
             isinstance(node, Merge)
-            and node.attributes["op"] == 'div'
+            and node.attributes['op'] == 'div'
             and isinstance(node.get_input_node(node.inputs[1]), Constant)
         )  # only second can be const
 
         return is_match
 
     def transform(self, model, node):
         input_shape = node.get_input_variable().shape
         n_in = np.prod(input_shape)
         const_node = node.get_input_node(node.inputs[1])
-        scale = 1 / const_node.value
+        scale = 1 / const_node.attributes['value']
         bias = np.array(0)
 
         # because C++ doesn't do broadcasting, we may have to change the shapes of the scale and bias
@@ -192,9 +195,9 @@ def transform(self, model, node):
             bias = np.broadcast_to(bias, input_shape)
 
         attributes = {k: node.attributes.get(k, None) for k in _base_attributes}
-        attributes.update({"scale_data": scale, "bias_data": bias, "n_in": n_in, "n_out": n_in, "n_filt": -1})
+        attributes.update({'scale_data': scale, 'bias_data': bias, 'n_in': n_in, 'n_out': n_in, 'n_filt': -1})
 
-        bn_layer = model.make_node(ApplyAlpha, f"bn_{node.name}", attributes, [node.inputs[0]], [x for x in node.outputs])
+        bn_layer = model.make_node(ApplyAlpha, f'bn_{node.name}', attributes, [node.inputs[0]], [x for x in node.outputs])
 
         model.remove_node(const_node, rewire=False)
         model.replace_node(node, bn_layer)
diff --git a/hls4ml/model/optimizer/passes/quant_opt.py b/hls4ml/model/optimizer/passes/quant_opt.py
index dc6deab14b..e49ff99bd7 100644
--- a/hls4ml/model/optimizer/passes/quant_opt.py
+++ b/hls4ml/model/optimizer/passes/quant_opt.py
@@ -31,10 +31,14 @@ class QuantConstantParameters(OptimizerPass):
     """Remove Constant from the Qaunt node parameters (but not input[0])"""
 
     def match(self, node):
-        is_match = isinstance(node, Quant) and (
-            (node.get_input_node(node.inputs[1]) and isinstance(node.get_input_node(node.inputs[1]), Constant))
-            or (node.get_input_node(node.inputs[2]) and isinstance(node.get_input_node(node.inputs[2]), Constant))
-            or (node.get_input_node(node.inputs[3]) and isinstance(node.get_input_node(node.inputs[3]), Constant))
+        is_match = (
+            isinstance(node, Quant)
+            and len(node.inputs) == 4
+            and (
+                (node.get_input_node(node.inputs[1]) and isinstance(node.get_input_node(node.inputs[1]), Constant))
+                or (node.get_input_node(node.inputs[2]) and 
isinstance(node.get_input_node(node.inputs[2]), Constant))
+                or (node.get_input_node(node.inputs[3]) and isinstance(node.get_input_node(node.inputs[3]), Constant))
+            )
         )
 
         return is_match
@@ -67,6 +71,10 @@ def transform(self, model, node):
                 node.inputs[3] = ''
                 model.remove_node(bitwidth_node, rewire=False)
 
+        node.inputs = [inp for inp in node.inputs if inp]
+        if len(node.inputs) != 1:
+            raise RuntimeError("hls4ml only supports constant scale, zeropt, and bitwidth values")
+
         return True
 
 
@@ -83,10 +91,8 @@ def match(self, node):
 
         is_match = (
             isinstance(node, Quant)
+            and len(node.inputs) == 1
             and not isinstance(node.get_input_node(node.inputs[0]), Constant)
-            and not node.get_input_node(node.inputs[1])
-            and not node.get_input_node(node.inputs[2])
-            and not node.get_input_node(node.inputs[3])
         )
 
         # Only match if the scale is power of 2 and the zero-point is 0s
@@ -142,11 +148,7 @@ class FuseQuantWithConstant(OptimizerPass):
     def match(self, node):
         # only matches after the other inputs are already folded
         is_match = (
-            isinstance(node, Quant)
-            and isinstance(node.get_input_node(node.inputs[0]), Constant)
-            and not node.get_input_node(node.inputs[1])
-            and not node.get_input_node(node.inputs[2])
-            and not node.get_input_node(node.inputs[3])
+            isinstance(node, Quant) and len(node.inputs) == 1 and isinstance(node.get_input_node(node.inputs[0]), Constant)
         )
 
         # Only match if the scale is power of 2 and the zero-point is 0s
@@ -197,7 +199,7 @@ def transform(self, model, node):
 
 class QuantToAlphaActivationAlpha(OptimizerPass):
     """
-    This is for the case when scale is not 1 or zeropt is not 0. It is a a 1:3 transformation of
+    This is for the case when scale is not power-of-2 or zeropt is not 0. It is a 1:3 transformation of
     a Quant to an ApplyAlpha (to scale), Activatio, ApplyAlpho (to rescale).
 
     NOTE: It needs to be scheduled after QuantToActivation (or we need to make the match criteria stricter)
@@ -207,10 +209,8 @@ def match(self, node):
         # only matches after the other inputs are already folded
         is_match = (
             isinstance(node, Quant)
+            and len(node.inputs) == 1
             and not isinstance(node.get_input_node(node.inputs[0]), Constant)
-            and not node.get_input_node(node.inputs[1])
-            and not node.get_input_node(node.inputs[2])
-            and not node.get_input_node(node.inputs[3])
        )
 
         return is_match
@@ -265,7 +265,7 @@ def transform(self, model, node):
 
 class ConstQuantToConstAlpha(OptimizerPass):
     """
-    This is for the case when scale is not 1 or zeropt is not 0. It is a a 1:3 transformation of
+    This is for the case when scale is not power-of-2 or zeropt is not 0. It is a 1:3 transformation of
     a Quant to an ApplyAlpha (to scale), Activation, ApplyAlpho (to unscale),
     but an input consts allows for optimization, so the ApplyAlpha (to scale), Activation are optimized away right away.
@@ -274,11 +274,7 @@ class ConstQuantToConstAlpha(OptimizerPass): def match(self, node): # only matches after the other inputs are already folded is_match = ( - isinstance(node, Quant) - and isinstance(node.get_input_node(node.inputs[0]), Constant) - and not node.get_input_node(node.inputs[1]) - and not node.get_input_node(node.inputs[2]) - and not node.get_input_node(node.inputs[3]) + isinstance(node, Quant) and len(node.inputs) == 1 and isinstance(node.get_input_node(node.inputs[0]), Constant) ) if is_match: # to make sure this is a quant node with inputs @@ -292,10 +288,6 @@ def transform(self, model, node): Change Constant + Quant node to Constant, ApplyAlpha """ - # Do the Activation as in the simple case - - n_in = node.get_input_variable().size() - rounding_mode = node.get_attr('rounding_mode') narrow = node.get_attr('narrow') signed = node.get_attr('signed') @@ -318,7 +310,6 @@ def transform(self, model, node): const_node.initialize() attributes_rescale = {k: node.attributes.get(k, None) for k in _base_attributes} - attributes_rescale.update({'n_in': n_in, 'n_out': n_in, 'n_filt': -1}) rescale = scale rebias = -bias * scale From cf5c9a105f27ffe3d2a81269c5664565e3362ffd Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 7 Feb 2024 10:24:33 -0600 Subject: [PATCH 048/272] snapshot --- hls4ml/model/layers.py | 24 ++- .../model/optimizer/passes/batchnorm_opt.py | 150 +++++++++++++----- hls4ml/model/optimizer/passes/bn_fuse.py | 41 ++++- .../model/optimizer/passes/conv_to_convxd.py | 10 +- .../optimizer/passes/matmul_const_to_dense.py | 5 +- hls4ml/model/optimizer/passes/merge_const.py | 70 +++++--- hls4ml/model/optimizer/passes/quant_opt.py | 6 +- hls4ml/model/quantizers.py | 12 +- hls4ml/model/types.py | 23 ++- 9 files changed, 248 insertions(+), 93 deletions(-) diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 7da730b60a..b5d9f492af 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -22,6 +22,7 @@ IntegerPrecisionType, NamedType, TensorVariable, + UnspecifiedPrecisionType, WeightVariable, find_minimum_width, ) @@ -361,7 +362,12 @@ def initialize(self): shape = (1,) self.set_attr('value', np.array([value])) dims = [f'{self.name}_{i}' for i in range(len(shape))] - self.add_output_variable(shape, dims, var_name=self.name, precision=self.get_attr("precision")) + quantizer = self.get_attr('quantizer') + + # Should the else clause below be None or UnspecifiedPrecisionType + precision = quantizer.hls_type if quantizer is not None else UnspecifiedPrecisionType() + + self.add_output_variable(shape, dims, var_name=self.name, precision=precision) class Quant(Layer): # The QONNX quantization layer @@ -901,6 +907,7 @@ def initialize(self): self.add_output_variable(shape, dims) +# TODO: We currently seem to ignore the quantizers to mean, variance, etc. 
class BatchNormalization(Layer): _expected_attributes = [ Attribute('n_in'), @@ -943,19 +950,22 @@ def initialize(self): self.add_output_variable(shape, dims) self.set_attr('n_in', inp.size()) + # precision values are ignored if quantizer is not None scale = self.get_attr('scale_data') scale_quantizer = self.get_attr('scale_quantizer') + scale_precision = self.get_attr('scale_precision') bias = self.get_attr('bias_data') bias_quantizer = self.get_attr('bias_quantizer') + bias_precision = self.get_attr('bias_precision') - self.add_weights(scale, quantizer=scale_quantizer) - self.add_bias(bias, quantizer=bias_quantizer) + self.add_weights(scale, quantizer=scale_quantizer, precision=scale_precision) + self.add_bias(bias, quantizer=bias_quantizer, precision=bias_precision) - def add_weights(self, scale, quantizer=None): - self.add_weights_variable(name='scale', var_name='s{index}', data=scale, quantizer=quantizer) + def add_weights(self, scale, quantizer=None, precision=None): + self.add_weights_variable(name='scale', var_name='s{index}', data=scale, quantizer=quantizer, precision=precision) - def add_bias(self, bias, quantizer=None): - self.add_weights_variable(name='bias', var_name='b{index}', data=bias, quantizer=quantizer) + def add_bias(self, bias, quantizer=None, precision=None): + self.add_weights_variable(name='bias', var_name='b{index}', data=bias, quantizer=quantizer, precision=precision) class Merge(Layer): diff --git a/hls4ml/model/optimizer/passes/batchnorm_opt.py b/hls4ml/model/optimizer/passes/batchnorm_opt.py index a74047676d..3e0984dccb 100644 --- a/hls4ml/model/optimizer/passes/batchnorm_opt.py +++ b/hls4ml/model/optimizer/passes/batchnorm_opt.py @@ -1,7 +1,9 @@ import numpy as np -from hls4ml.model.layers import BatchNormalization, BatchNormOnnx, Constant +from hls4ml.model.layers import ApplyAlpha, BatchNormalization, BatchNormOnnx, Constant from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.quantizers import QuantNodeQuantizer +from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, UnspecifiedPrecisionType _base_attributes = ('Trace', 'reuse_factor', 'epsilon', 'n_in', 'n_filt') @@ -17,49 +19,55 @@ def match(self, node): def transform(self, model, node): """ Remove Constant from the BatchNormalization node parameters (but not input[0]) + + TODO: Currently the quantizers are not actually used by the underlying layer. 
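        For reference, the scale and bias these parameters imply (to be computed downstream) are presumably scale = gamma / sqrt(variance + epsilon) and bias = beta - gamma * mean / sqrt(variance + epsilon), matching the commented-out computation removed below.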
""" if not (len(node.inputs) == 5 and all(node.inputs)): - raise ValueError(f"All {len.node.inputs} BatchNormOnnnx inputs need to be defined") + raise ValueError(f'All {len.node.inputs} BatchNormOnnnx inputs need to be defined') attributes = {k: node.attributes.get(k, None) for k in _base_attributes} gamma_node = node.get_input_node(node.inputs[1]) if not isinstance(gamma_node, Constant): - raise TypeError("Only consant gammas supported") - gamma = gamma_node.value + raise TypeError('Only consant gammas supported') + gamma = gamma_node.attributes['value'] attributes['gamma_data'] = gamma + attributes['gamma_quantizer'] = gamma_node.get_attr['quantizer'] + node.inputs[1] = '' model.remove_node(gamma_node, rewire=False) beta_node = node.get_input_node(node.inputs[2]) if not isinstance(beta_node, Constant): - raise TypeError("Only consant betas supported") - beta = beta_node.value + raise TypeError('Only consant betas supported') + beta = beta_node.attributes['value'] attributes['beta_data'] = beta + attributes['beta_quantizer'] = beta_node.get_attr['quantizer'] node.inputs[2] = '' model.remove_node(beta_node, rewire=False) moving_mean_node = node.get_input_node(node.inputs[3]) if not isinstance(moving_mean_node, Constant): - raise TypeError("Only consant moving_means supported") - moving_mean = moving_mean_node.value + raise TypeError('Only consant moving_means supported') + moving_mean = moving_mean_node.attributes['value'] attributes['mean_data'] = moving_mean + attributes['mean_quantizer'] = moving_mean_node.get_attr['quantizer'] node.inputs[3] = '' model.remove_node(moving_mean_node, rewire=False) moving_variance_node = node.get_input_node(node.inputs[4]) if not isinstance(moving_variance_node, Constant): - raise TypeError("Only consant moving_variances supported") - moving_variance = moving_variance_node.value + raise TypeError('Only consant moving_variances supported') + moving_variance = moving_variance_node.attributes['value'] attributes['variance_data'] = moving_variance + attributes['variance_quantizer'] = moving_variance_node.get_attr['quantizer'] node.inputs[4] = '' model.remove_node(moving_variance_node, rewire=False) - # scale = gamma / np.sqrt(moving_variance + node.get_attr('epsilon')) - # bias = beta - gamma * moving_mean / np.sqrt(moving_variance + node.get_attr('epsilon')) - # attributes["scale_data"] = scale - # attributes["bias_data"] = bias + node.inputs = [inp for inp in node.inputs if inp] + if len(node.inputs) != 1: + raise RuntimeError('The QONNX batchnomr had unexpected inputs.') new_node = model.make_node(BatchNormalization, node.name, attributes, [node.inputs[0]], [x for x in node.outputs]) @@ -78,7 +86,6 @@ def match(self, node): isinstance(node, BatchNormalization) and not any(node.inputs[1:]) and isinstance(node.get_input_node(node.inputs[0]), Constant) - and not node.get_input_node(node.inputs[0]).get_attr("quant_precision") ) return is_match @@ -88,13 +95,48 @@ def transform(self, model, node): """ const_node = node.get_input_node(node.inputs[0]) - new_val = const_node.value * node.weights["scale"].data_unquantized + node.weights["bias"].data_unquantized - const_node.set_attr("value", new_val) - const_node.set_attr("quantizer", node.get_attr("quantizer")) # None if not defined - const_node.set_attr("quant_precision", node.get_attr("quant_precision")) - - # reinitialize (which also runs quantization if quantizer exists) - const_node.initialize() + const_prec = const_node.get_output_variable().type.precision + + new_val = const_node.value * 
node.weights['scale'].data_unquantized + node.weights['bias'].data_unquantized + + const_node.set_attr('value', new_val) + const_node.set_attr('quantizer', node.get_attr('quantizer')) # None if not defined + + if isinstance(node.get_output_variable().type.precision, UnspecifiedPrecisionType): + if isinstance(const_prec, UnspecifiedPrecisionType): + pass # leave it as is + else: + const_node.get_output_variable().type.precision = UnspecifiedPrecisionType() # default + # propagate precision + scale_q = node.get_attr('scale_quantizer') + bias_q = node.get_attr('bias_quantizer') + if scale_q and bias_q: + # propagate precision + scale_prec = scale_q.hls_type + bias_prec = bias_q.hls_type + if not isinstance(scale_prec, (IntegerPrecisionType, FixedPrecisionType)) or not isinstance( + bias_prec, (IntegerPrecisionType, FixedPrecisionType) + ): + print("Warning: output type not propagated for constant merge") + else: + signed_prod = const_prec.signed or scale_prec.signed + w_prod = const_prec.width + scale_prec.width + i_prod = const_prec.integer + scale_prec.integer + signed = signed_prod or bias_prec.signed + i_tot = ( + max( + i_prod + (bias_prec.signed and not signed_prod), + bias_prec.integer + (signed_prod and not bias_prec.signed), + ) + + 1 + ) + w_tot = i_tot + max(w_prod - i_prod, bias_prec.width - bias_prec.integer) + new_prec = FixedPrecisionType(w_tot, i_tot, signed) + const_node.set_attr('quantizer', QuantNodeQuantizer(new_prec)) + const_node.get_output_variable().type.precision = new_prec + else: + const_node.get_output_variable().type.precision = node.get_output_variable().type.precision # remove the batch norm node model.remove_node(node, rewire=True) @@ -103,17 +145,21 @@ def transform(self, model, node): class FuseConsecutiveBatchNormalization(OptimizerPass): - ''' + """ OptimizerPass to merge consecutive BatchNormalization layers, only if the earlier one does not have quantization specified - ''' + + Note: Consider restricting this to ApplyAlpha. Batch Normalization quantization seems to be ignored. + + Note: This optimizer may not be safe if weights are updateable. May need to turn off.
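    Applying (s0, b0) then (s1, b1) gives s1 * (s0 * x + b0) + b1, so the merged layer uses scale_new = s0 * s1 and bias_new = s1 * b0 + b1, as computed in the transform below.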
+ """ def match(self, node): prev_node = node.get_input_node(node.inputs[0]) basic_match = ( - isinstance(node, BatchNormalization) - and isinstance(prev_node, BatchNormalization) - and not prev_node.get_attr("quant_precision") + isinstance(node, ApplyAlpha) + and isinstance(prev_node, ApplyAlpha) + and isinstance(prev_node.get_output_variable().type.precision, UnspecifiedPrecisionType) ) # check for compatibility to merge @@ -123,12 +169,12 @@ def match(self, node): s1 = node.weights['scale'].data_unquantized b1 = node.weights['bias'].data_unquantized scale_compatible = ( - (prev_node.get_attr("scale_quantizer") is None and node.get_attr("scale_quantizer") is None) + (prev_node.get_attr('scale_quantizer') is None and node.get_attr('scale_quantizer') is None) or (s0 == np.ones_like(s0)).all() or (s1 == np.ones_like(s1)).all() ) bias_compatible = ( - (prev_node.get_attr("bias_quantizer") is None and node.get_attr("bias_quantizer") is None) + (prev_node.get_attr('bias_quantizer') is None and node.get_attr('bias_quantizer') is None) or (b0 == np.zeros_like(b0)).all() or (b1 == np.zeros_like(b1)).all() ) @@ -139,31 +185,57 @@ def match(self, node): def transform(self, model, node): prev_node = node.get_input_node(node.inputs[0]) + prev_map = prev_node.get_output_use_map() + if len(prev_map[prev_node.outputs[0]]) > 1: + return False + + # # Not sure why this part is needed + # node_map = node.get_output_use_map() + # if len(node_map[node.outputs[0]]) > 1: + # return False + s0 = prev_node.weights['scale'].data_unquantized b0 = prev_node.weights['bias'].data_unquantized s1 = node.weights['scale'].data_unquantized b1 = node.weights['bias'].data_unquantized s_quantizer = ( - node.get_attr("scale_quantizer") if (s0 == np.ones_like(s0)).all() else prev_node.get_attr("scale_quantizer") + node.get_attr('scale_quantizer') if (s0 == np.ones_like(s0)).all() else prev_node.get_attr('scale_quantizer') ) b_quantizer = ( - node.get_attr("bias_quantizer") if (b0 == np.zeros_like(b0)).all() else prev_node.get_attr("bias_quantizer") + node.get_attr('bias_quantizer') if (b0 == np.zeros_like(b0)).all() else prev_node.get_attr('bias_quantizer') ) - node.set_attr("scale_quantizer", s_quantizer) - node.set_attr("bias_quantizer", b_quantizer) - if s_quantizer: - node.set_attr("scale_precision", s_quantizer.hls_type) - if b_quantizer: - node.set_attr("bias_precision", b_quantizer.hls_type) + node.set_attr('scale_quantizer', s_quantizer) + node.set_attr('bias_quantizer', b_quantizer) scale_new = s0 * s1 bias_new = s1 * b0 + b1 + # Not sure if this setting of this is useful + s_prec = None + if s_quantizer is None and (scale_new == np.ones_like(scale_new)).all(): + if ( + isinstance(prev_node.weights['scale'].type, IntegerPrecisionType) + and isinstance(node.weights['scale'].type, IntegerPrecisionType) + and prev_node.weights['scale'].type.width == 1 + and node.weights['scale'].type.width == 1 + ): + s_prec = node.weights['scale'].type + + b_prec = None + if b_quantizer is None and (bias_new == np.zeros_like(bias_new)).all(): + if ( + isinstance(prev_node.weights['bias'].type, IntegerPrecisionType) + and isinstance(node.weights['bias'].type, IntegerPrecisionType) + and prev_node.weights['bias'].type.width == 1 + and node.weights['bias'].type.width == 1 + ): + b_prec = node.weights['bias'].type + # call function so that quantizer would be called if needed - node.add_weights_variable(name='scale', var_name='s{index}', data=scale_new) - node.add_weights_variable(name='bias', var_name='b{index}', data=bias_new) + 
node.add_weights_variable(name='scale', var_name='s{index}', data=scale_new, quantizer=s_quantizer, precision=s_prec) + node.add_weights_variable(name='bias', var_name='b{index}', data=bias_new, quantizer=b_quantizer, precision=b_prec) model.remove_node(prev_node, rewire=True) return True diff --git a/hls4ml/model/optimizer/passes/bn_fuse.py b/hls4ml/model/optimizer/passes/bn_fuse.py index 02d9b849ed..3d79de7dc8 100644 --- a/hls4ml/model/optimizer/passes/bn_fuse.py +++ b/hls4ml/model/optimizer/passes/bn_fuse.py @@ -1,25 +1,50 @@ +import numpy as np + from hls4ml.model.layers import BatchNormalization, Conv1D, Conv2D, Dense from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import UnspecifiedPrecisionType class FuseBatchNormalization(OptimizerPass): def match(self, node): - is_match = ( + prev_node = node.get_input_node(node.inputs[0]) + basic_match = ( isinstance(node, BatchNormalization) - and isinstance(node.get_input_node(), (Dense, Conv1D, Conv2D)) - and node.get_input_node().get_attr('weight_quantizer') is None - and node.get_input_node().get_attr('bias_quantizer') is None + and isinstance(prev_node, (Dense, Conv1D, Conv2D)) + and isinstance(prev_node.get_output_variable().type.precision, UnspecifiedPrecisionType) ) - return is_match + if basic_match: + s0 = prev_node.weights['weight'].data_unquantized + b0 = prev_node.weights['bias'].data_unquantized + s1 = node.weights['scale'].data_unquantized + b1 = node.weights['bias'].data_unquantized + scale_compatible = ( + (prev_node.get_attr('weight_quantizer') is None and node.get_attr('scale_quantizer') is None) + or (s0 == np.ones_like(s0)).all() + or (s1 == np.ones_like(s1)).all() + ) + bias_compatible = ( + (prev_node.get_attr('bias_quantizer') is None and node.get_attr('bias_quantizer') is None) + or (b0 == np.zeros_like(b0)).all() + or (b1 == np.zeros_like(b1)).all() + ) + return scale_compatible and bias_compatible + + else: + return False def transform(self, model, node): - # Fuse weight and bias of Dense/Conv1D/Conv2D layer with BN values + """Fuse weight and bias of Dense/Conv1D/Conv2D layer with BN values.""" parent_node = node.get_input_node() parent_map = parent_node.get_output_use_map() - node_map = node.get_output_use_map() - if len(parent_map[parent_node.name]) > 1 or len(node_map[node.name]) > 1: + if len(parent_map[parent_node.outputs[0]]) > 1: return False + # # Not sure why this part is needed + # node_map = node.get_output_use_map() + # if len(node_map[node.outputs[0]]) > 1: + # return False + parent_weight = parent_node.weights['weight'] parent_bias = parent_node.weights['bias'] diff --git a/hls4ml/model/optimizer/passes/conv_to_convxd.py b/hls4ml/model/optimizer/passes/conv_to_convxd.py index efc5f3e89b..b61b0340be 100644 --- a/hls4ml/model/optimizer/passes/conv_to_convxd.py +++ b/hls4ml/model/optimizer/passes/conv_to_convxd.py @@ -2,6 +2,7 @@ from hls4ml.model.layers import Constant, Conv, Conv1D, Conv2D from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.quantizers import QuantNodeQuantizer from hls4ml.model.types import IntegerPrecisionType # these are attributes to copy @@ -48,13 +49,10 @@ def transform(self, model, node): """Convert Conv with constant to a Conv1D or Conv2D layer""" weight_node = node.get_input_node(node.inputs[1]) - weight_precision = weight_node.get_attr('quant_precision') weight_data = weight_node.attributes['value'] bias_node = None - bias_precision = None if len(node.inputs) == 3: bias_node = node.get_input_node(node.inputs[2]) - bias_precision = 
bias_node.get_attr('quant_precision') # creating the attributes attributes = {k: node.attributes.get(k, None) for k in _base_attributes} @@ -66,16 +64,16 @@ def transform(self, model, node): else: newtype = Conv2D attributes['weight_data'] = np.transpose(weight_data, (1, 2, 3, 0)) - attributes['weight_precision'] = weight_precision attributes['weight_quantizer'] = weight_node.get_attr('quantizer') if bias_node: attributes['bias_data'] = bias_node.attributes['value'] - attributes['bias_precision'] = bias_precision attributes['bias_quantizer'] = bias_node.get_attr('quantizer') + attributes['have_bias'] = True else: attributes['bias_data'] = np.zeros(attributes['n_filt']) - attributes['bias_precision'] = IntegerPrecisionType(1, False) + attributes['bias_quantizer'] = QuantNodeQuantizer(IntegerPrecisionType(1, False)) + attributes['have_bias'] = False # making new node new_node = model.make_node( diff --git a/hls4ml/model/optimizer/passes/matmul_const_to_dense.py b/hls4ml/model/optimizer/passes/matmul_const_to_dense.py index 2a89ea0130..7eac0ccca3 100644 --- a/hls4ml/model/optimizer/passes/matmul_const_to_dense.py +++ b/hls4ml/model/optimizer/passes/matmul_const_to_dense.py @@ -2,6 +2,7 @@ from hls4ml.model.layers import Constant, Dense, MatMul from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.quantizers import QuantNodeQuantizer from hls4ml.model.types import IntegerPrecisionType _base_attributes = ('Trace', 'reuse_factor', 'weight', 'weight_t', 'bias', 'bias_t') @@ -27,7 +28,6 @@ def transform(self, model, node): other_var = node.get_input_variable(node.inputs[0]) weight_data = const_node.attributes['value'] - weight_precision = const_node.get_attr('quant_precision') weight_quantizer = const_node.get_attr('quantizer') in_shape = other_var.shape @@ -40,10 +40,9 @@ def transform(self, model, node): attributes.update( { 'weight_data': weight_data, - 'weight_precision': weight_precision, 'weight_quantizer': weight_quantizer, 'bias_data': np.zeros(out_shape), - 'bias_precision': IntegerPrecisionType(1, False), + 'bias_quantizer': QuantNodeQuantizer(IntegerPrecisionType(1, False)), 'have_bias': False, 'n_in': n_in, 'n_out': n_out, diff --git a/hls4ml/model/optimizer/passes/merge_const.py b/hls4ml/model/optimizer/passes/merge_const.py index 4b13982259..11848c9081 100644 --- a/hls4ml/model/optimizer/passes/merge_const.py +++ b/hls4ml/model/optimizer/passes/merge_const.py @@ -3,6 +3,7 @@ from hls4ml.model.layers import ApplyAlpha, Constant, Merge from hls4ml.model.optimizer import OptimizerPass from hls4ml.model.quantizers import QuantNodeQuantizer +from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType _base_attributes = ('Trace', 'reuse_factor', 'n_in') @@ -57,10 +58,6 @@ def transform(self, model, node): const_node0.set_attr('value', new_val) - quant_precision = node.get_attr('quant_precision') - if quant_precision: - const_node0.set_attr('quant_precision', quant_precision) - # reinitialize (which also runs quantization if quantizer exists) const_node0.initialize() @@ -101,6 +98,7 @@ def transform(self, model, node): input_shape = node.get_input_variable(node.inputs[input_node_idx]).shape n_in = np.prod(input_shape) + # Note: precision is ignored if quantizer is not None scale_precision = None scale_quantizer = None bias_precision = None @@ -109,30 +107,40 @@ def transform(self, model, node): op = node.attributes['op'] if op in ('add', 'sum'): scale = np.array(1) + scale_precision = IntegerPrecisionType(1, False) bias = const_node.attribute['value'] - 
bias_precision = const_node.get_attr('quant_precision') bias_quantizer = const_node.get_attr('quantizer') elif op == 'sub': + bias_quantizer = const_node.get_attr('quantizer') if node1const: scale = np.array(1) + scale_precision = IntegerPrecisionType(1, False) bias = -const_node.attribute['value'] + if ( + bias_quantizer is not None + and isinstance(bias_quantizer.hls_type, (IntegerPrecisionType, FixedPrecisionType)) + and not bias_quantizer.hls_type.signed + ): + # need to make it signed and increase the bit width, if unsigned + bias_precision = FixedPrecisionType( + bias_quantizer.hls_type.width + 1, + bias_quantizer.hls_type.integer + 1, + True, + bias_quantizer.hls_type.rounding_mode, + bias_quantizer.hls_type.saturation_mode, + bias_quantizer.hls_type.saturation_bits, + ) + bias_quantizer = QuantNodeQuantizer(bias_precision) else: scale = np.array(-1) + scale_precision = IntegerPrecisionType(2, True) bias = const_node.attribute['value'] - bias_precision = const_node.get_attr('quant_precision') - bias_quantizer = const_node.get_attr('quantizer') - if bias_precision and not bias_precision.signed: - # need to add a bit - bias_precision.signed = 1 - bias_precision.width += 1 - bias_precision.integer += 1 - bias_quantizer = QuantNodeQuantizer(bias_precision) elif op == 'mul': scale = const_node.attribute['value'] - bias = np.array(0) - scale_precision = const_node.get_attr('quant_precision') scale_quantizer = const_node.get_attr('quantizer') + bias = np.array(0) + bias_precision = IntegerPrecisionType(1, False) # because C++ doesn't do broadcasting, we may have to change the shapes of the scale and bias if scale.shape != tuple(input_shape) and np.squeeze(scale).shape != tuple(input_shape): @@ -155,12 +163,12 @@ def transform(self, model, node): } ) - bn_layer = model.make_node( + aa_layer = model.make_node( ApplyAlpha, f'bn_{node.name}', attributes, [node.inputs[input_node_idx]], [x for x in node.outputs] ) model.remove_node(const_node, rewire=False) - model.replace_node(node, bn_layer) + model.replace_node(node, aa_layer) return True @@ -186,7 +194,23 @@ def transform(self, model, node): n_in = np.prod(input_shape) const_node = node.get_input_node(node.inputs[1]) scale = 1 / const_node.attribute['value'] + scale_quantizer = const_node.get_attr('quantizer') + if scale_quantizer: + scale_precision = scale_quantizer.hls_type + i_new = 1 + int(scale_precision.signed) + scale_precision.fractional + w_new = 1 + int(scale_precision.signed) + max(scale_precision.fractional, 0) + new_scale_precision = FixedPrecisionType( + w_new, + i_new, + scale_precision.signed, + rounding_mode=scale_precision.rounding_mode, + saturation_mode=scale_precision.saturation_mode, + saturation_bits=scale_precision.saturation_bits, + ) + scale_quantizer = QuantNodeQuantizer(new_scale_precision) + bias = np.array(0) + bias_precision = IntegerPrecisionType(1, False) # because C++ doesn't do broadcasting, we may have to change the shapes of the scale and bias if scale.shape != tuple(input_shape) and np.squeeze(scale).shape != tuple(input_shape): @@ -195,7 +219,17 @@ def transform(self, model, node): bias = np.broadcast_to(bias, input_shape) attributes = {k: node.attributes.get(k, None) for k in _base_attributes} - attributes.update({'scale_data': scale, 'bias_data': bias, 'n_in': n_in, 'n_out': n_in, 'n_filt': -1}) + attributes.update( + { + 'scale_data': scale, + 'bias_data': bias, + 'scale_quantizer': scale_quantizer, + 'bias_precision': bias_precision, + 'n_in': n_in, + 'n_out': n_in, + 'n_filt': -1, + } + ) bn_layer =
model.make_node(ApplyAlpha, f'bn_{node.name}', attributes, [node.inputs[0]], [x for x in node.outputs]) diff --git a/hls4ml/model/optimizer/passes/quant_opt.py b/hls4ml/model/optimizer/passes/quant_opt.py index e49ff99bd7..79d92ec4d1 100644 --- a/hls4ml/model/optimizer/passes/quant_opt.py +++ b/hls4ml/model/optimizer/passes/quant_opt.py @@ -131,7 +131,7 @@ def transform(self, model, node): precision, quantizer = _calculate_precision_quantizer(bitwidth, integer, signed, narrow, rounding_mode) attributes = {k: node.attributes.get(k, None) for k in _base_attributes} - attributes.update({'activation': 'linear', 'quant_precision': precision, 'quantizer': quantizer}) + attributes.update({'activation': 'linear', 'quantizer': quantizer}) new_node = model.make_node(Activation, f'{node.name}_act', attributes, [node.inputs[0]], [x for x in node.outputs]) new_node.get_output_variable().type.precision = precision @@ -187,7 +187,6 @@ def transform(self, model, node): precision, quantizer = _calculate_precision_quantizer(bitwidth, integer, signed, narrow, rounding_mode) const_node = node.get_input_node(node.inputs[0]) - const_node.set_attr('quant_precision', precision) const_node.set_attr('quantizer', quantizer) const_node.get_output_variable().type.precision = precision @@ -229,7 +228,7 @@ def transform(self, model, node): precision, quantizer = _calculate_precision_quantizer(bitwidth, bitwidth, signed, narrow, rounding_mode) attributes = {k: node.attributes.get(k, None) for k in _base_attributes} - attributes.update({'activation': 'linear', 'quant_precision': precision, 'quantizer': quantizer}) + attributes.update({'activation': 'linear', 'quantizer': quantizer}) new_node = model.make_node(Activation, f'{node.name}_act', attributes, [node.inputs[0]], [x for x in node.outputs]) new_node.get_output_variable().type.precision = precision @@ -303,7 +302,6 @@ def transform(self, model, node): # calculate the new value new_val = const_node.get_attr('value') / scale + bias const_node.set_attr('value', new_val) - const_node.set_attr('quant_precision', precision) const_node.set_attr('quantizer', quantizer) # reinitialize (which also runs quantization if quantizer exists) diff --git a/hls4ml/model/quantizers.py b/hls4ml/model/quantizers.py index c0a5869d5b..cadcdbbc3d 100644 --- a/hls4ml/model/quantizers.py +++ b/hls4ml/model/quantizers.py @@ -102,7 +102,7 @@ def __init__(self, config): self.bits = 1 self.hls_type = XnorPrecisionType() else: - print("Unsupported quantizer: " + config['class_name']) + print('Unsupported quantizer: ' + config['class_name']) self.bits = 16 self.hls_type = FixedPrecisionType(width=16, integer=6, signed=True) @@ -177,8 +177,8 @@ class QuantNodeQuantizer(Quantizer): def __init__(self, precision): super().__init__(precision.width, precision) - if not isinstance(precision, FixedPrecisionType): - raise TypeError("QuantNodeQuantizer is only defined for FixedPrecisionType") + if not isinstance(precision, (FixedPrecisionType, IntegerPrecisionType)): + raise TypeError('QuantNodeQuantizer is only defined for FixedPrecisionType and IntegerPrecisionType') def __call__(self, data): """Apply the quantization on the data""" @@ -216,7 +216,7 @@ def _min_int(signed: bool, saturation_mode: str, bit_width: int) -> int: int(0) """ if saturation_mode not in (SaturationMode.SAT_SYM, SaturationMode.SAT): - raise ValueError(f"Saturation mode {saturation_mode} not supported. Only AP_SAT_SYM, AP_SAT supported") + raise ValueError(f'Saturation mode {saturation_mode} not supported.
Only AP_SAT_SYM, AP_SAT supported') if signed and saturation_mode == SaturationMode.SAT_SYM: value = -(2 ** (bit_width - 1)) + 1 elif signed: @@ -253,9 +253,9 @@ def _resolve_rounding_mode(mode): to the corresponding numpy functions.""" if mode == RoundingMode.RND_CONV: return np.round - # elif mode_string == "CEIL": # not supported + # elif mode_string == 'CEIL': # not supported # return np.ceil elif mode == RoundingMode.TRN: return np.floor else: - raise ValueError(f"Rounding mode {mode} not supported.") + raise ValueError(f'Rounding mode {mode} not supported.') diff --git a/hls4ml/model/types.py b/hls4ml/model/types.py index 8c182f4cca..f9e75a7d87 100644 --- a/hls4ml/model/types.py +++ b/hls4ml/model/types.py @@ -81,7 +81,6 @@ class IntegerPrecisionType(PrecisionType): def __init__(self, width=16, signed=True): super().__init__(width=width, signed=signed) - self.integer = width self.fractional = 0 def __str__(self): @@ -96,6 +95,22 @@ def __eq__(self, other): eq = eq and self.fractional == other.fractional return eq + @property + def integer(self): + return self.width + + @property + def rounding_mode(self): + return RoundingMode.TRN + + @property + def saturation_mode(self): + return SaturationMode.WRAP + + @property + def saturation_bits(self): + return None + class FixedPrecisionType(PrecisionType): """Arbitrary precision fixed-point data type. @@ -114,11 +129,15 @@ class FixedPrecisionType(PrecisionType): def __init__(self, width=16, integer=6, signed=True, rounding_mode=None, saturation_mode=None, saturation_bits=None): super().__init__(width=width, signed=signed) self.integer = integer - self.fractional = width - integer self.rounding_mode = rounding_mode self.saturation_mode = saturation_mode self.saturation_bits = saturation_bits + # make this a property to avoid inconsistencies + @property + def fractional(self): + self.width - self.integer + @property def rounding_mode(self): return self._rounding_mode From 81f3e53533984ca67e24a1bd485b3135910e9e2e Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 7 Feb 2024 19:44:00 -0600 Subject: [PATCH 049/272] bug fixes from attempting to run --- hls4ml/model/layers.py | 2 +- hls4ml/model/optimizer/__init__.py | 1 + .../model/optimizer/passes/batchnorm_opt.py | 29 +++++++++ hls4ml/model/optimizer/passes/bn_fuse.py | 65 +++++++++++++++++-- hls4ml/model/optimizer/passes/linear.py | 12 ++-- .../optimizer/passes/matmul_const_to_dense.py | 1 + hls4ml/model/optimizer/passes/merge_const.py | 13 ++-- hls4ml/model/optimizer/passes/move_scales.py | 4 +- hls4ml/model/quantizers.py | 6 +- hls4ml/model/types.py | 17 +++-- 10 files changed, 124 insertions(+), 26 deletions(-) diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index b5d9f492af..ebf7af2124 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -403,7 +403,7 @@ def initialize(self): # for QONNX, remove batch dimension # (onnx cleaning should have removed reshapes not on data path) if isinstance(shape_node, Constant): - target_shape = shape_node.value[1:] + target_shape = shape_node.attributes['value'][1:] else: raise RuntimeError("Reshape for ONNX requires the target shape to be a second input.") diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index ebe4934029..bd4da19071 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -69,6 +69,7 @@ 'eliminate_linear_activation', 'fuse_consecutive_batch_normalization', 'fuse_batch_normalization', + 'remove_nop_batch_normalization', 
'replace_multidimensional_dense_with_conv', 'infer_precision_types', 'set_precision_concat', diff --git a/hls4ml/model/optimizer/passes/batchnorm_opt.py b/hls4ml/model/optimizer/passes/batchnorm_opt.py index 3e0984dccb..f633d763c8 100644 --- a/hls4ml/model/optimizer/passes/batchnorm_opt.py +++ b/hls4ml/model/optimizer/passes/batchnorm_opt.py @@ -194,6 +194,15 @@ def transform(self, model, node): # if len(node_map[node.outputs[0]]) > 1: # return False + # only merge if the types are integer or fixed + if ( + not isinstance(prev_node.weights['scale'].type, (IntegerPrecisionType, FixedPrecisionType)) + or not isinstance(prev_node.weights['bias'].type, (IntegerPrecisionType, FixedPrecisionType)) + or not isinstance(node.weights['scale'].type, (IntegerPrecisionType, FixedPrecisionType)) + or not isinstance(node.weights['bias'].type, (IntegerPrecisionType, FixedPrecisionType)) + ): + return False + s0 = prev_node.weights['scale'].data_unquantized b0 = prev_node.weights['bias'].data_unquantized s1 = node.weights['scale'].data_unquantized @@ -239,3 +248,23 @@ def transform(self, model, node): model.remove_node(prev_node, rewire=True) return True + + +class RemoveNopBatchNormalization(OptimizerPass): + """ + OptimizerPass to remove batch normalizations that do nothing (scale 1, bias 0) + + Note: This optimizer may not be safe if weights are updateable. + """ + + def match(self, node): + if isinstance(node, BatchNormalization): + s0 = node.weights['scale'].data_unquantized + b0 = node.weights['bias'].data_unquantized + return (s0 == np.ones_like(s0)).all() and (b0 == np.zeros_like(b0)).all() + else: + return False + + def transform(self, model, node): + model.remove_node(node, rewire=True) + return True diff --git a/hls4ml/model/optimizer/passes/bn_fuse.py b/hls4ml/model/optimizer/passes/bn_fuse.py index 3d79de7dc8..a636af2f86 100644 --- a/hls4ml/model/optimizer/passes/bn_fuse.py +++ b/hls4ml/model/optimizer/passes/bn_fuse.py @@ -2,10 +2,19 @@ from hls4ml.model.layers import BatchNormalization, Conv1D, Conv2D, Dense from hls4ml.model.optimizer import OptimizerPass -from hls4ml.model.types import UnspecifiedPrecisionType +from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, UnspecifiedPrecisionType class FuseBatchNormalization(OptimizerPass): + """ + OptimizerPass to fuse a BatchNormalization layer into a preceding Dense, Conv1D, or Conv2D layer, + only if the preceding layer does not have quantization specified + + Note: Consider restricting this to ApplyAlpha. Batch Normalization quantization seems to be ignored. + + Note: This optimizer may not be safe if weights are updateable. May need to turn off.
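    The fusion computes scale * (W x + b) + bias in a single layer, i.e. fused_weight = scale * W and fused_bias = scale * b + bias, as computed in the transform below.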
+ """ + def match(self, node): prev_node = node.get_input_node(node.inputs[0]) basic_match = ( @@ -51,13 +60,59 @@ def transform(self, model, node): bn_scale = node.weights['scale'] bn_bias = node.weights['bias'] + # only merge if the types are integer or fixed + if ( + not isinstance(parent_weight.type, (IntegerPrecisionType, FixedPrecisionType)) + or not isinstance(parent_bias.type, (IntegerPrecisionType, FixedPrecisionType)) + or not isinstance(bn_scale.type, (IntegerPrecisionType, FixedPrecisionType)) + or not isinstance(bn_bias.type, (IntegerPrecisionType, FixedPrecisionType)) + ): + return False + fused_weight = bn_scale.data * parent_weight.data fused_bias = bn_scale.data * parent_bias.data + bn_bias.data + w_quantizer = ( + node.get_attr('scale_quantizer') + if (parent_weight.data == np.ones_like(parent_weight.data)).all() + else parent_node.get_attr('weight_quantizer') + ) + b_quantizer = ( + node.get_attr('bias_quantizer') + if (parent_bias.data == np.zeros_like(parent_bias.data)).all() + else parent_node.get_attr('bias_quantizer') + ) + + node.set_attr('weight_quantizer', w_quantizer) + node.set_attr('bias_quantizer', b_quantizer) + + # Not sure if this setting of this is useful + w_prec = None + if w_quantizer is None and (fused_weight == np.ones_like(fused_weight)).all(): + if ( + isinstance(parent_weight.type, IntegerPrecisionType) + and isinstance(bn_scale.type, IntegerPrecisionType) + and parent_weight.type.width == 1 + and bn_scale.type.width == 1 + ): + w_prec = node.weights['scale'].type + + b_prec = None + if b_quantizer is None and (fused_bias == np.zeros_like(fused_bias)).all(): + if ( + isinstance(parent_bias.type, IntegerPrecisionType) + and isinstance(bn_bias.type, IntegerPrecisionType) + and parent_bias.type.width == 1 + and bn_bias.type.width == 1 + ): + b_prec = node.weights['bias'].type + + # call function so that quantizer would be called if needed + node.add_weights_variable( + name='weight', var_name='w{index}', data=fused_weight, quantizer=w_quantizer, precision=w_prec + ) + node.add_weights_variable(name='bias', var_name='b{index}', data=fused_bias, quantizer=b_quantizer, precision=b_prec) + model.remove_node(node, rewire=True) - parent_weight.data = fused_weight - parent_bias.data = fused_bias - if not parent_node.get_attr('use_bias', True): - parent_bias.update_precision(bn_bias.type.precision) return True diff --git a/hls4ml/model/optimizer/passes/linear.py b/hls4ml/model/optimizer/passes/linear.py index 72d6dade9f..78a808b9a1 100644 --- a/hls4ml/model/optimizer/passes/linear.py +++ b/hls4ml/model/optimizer/passes/linear.py @@ -1,5 +1,6 @@ from hls4ml.model.layers import Activation, BatchNormalization, Conv1D, Conv2D, Dense from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import UnspecifiedPrecisionType class EliminateLinearActivation(OptimizerPass): @@ -14,7 +15,6 @@ def transform(self, model, node): return True -# TODO: Move migrate this to auto precisoin check from quant precision check class MergeLinearActivation(OptimizerPass): ''' For many objects it's safe to change the output precision independently of the calculation. 
@@ -27,16 +27,14 @@ def match(self, node): if isinstance(node, Activation) and node.get_attr('activation') == 'linear': parent = node.get_input_node(node.inputs[0]) safe_parent = isinstance(parent, (Dense, Conv1D, Conv2D, BatchNormalization)) - parent_type_fixed = parent.get_attr("quant_precision") - return safe_parent and not parent_type_fixed + return safe_parent and isinstance(parent.get_output_variable().type.precision, UnspecifiedPrecisionType) else: return False def transform(self, model, node): prev_node = node.get_input_node(node.inputs[0]) - quant_precision = node.get_attr("quant_precision") - prev_node.set_attr("quant_precision", quant_precision) - prev_node.set_attr("quantizer", node.get_attr("quantizer")) - prev_node.update_output_precision(quant_precision) + quantizer = node.get_attr("quantizer") + prev_node.set_attr("quantizer", quantizer) + prev_node.update_output_precision(quantizer.hls_type) model.remove_node(node) return True diff --git a/hls4ml/model/optimizer/passes/matmul_const_to_dense.py b/hls4ml/model/optimizer/passes/matmul_const_to_dense.py index 7eac0ccca3..889a376cee 100644 --- a/hls4ml/model/optimizer/passes/matmul_const_to_dense.py +++ b/hls4ml/model/optimizer/passes/matmul_const_to_dense.py @@ -54,6 +54,7 @@ def transform(self, model, node): # removing and replacing old nodes model.remove_node(const_node, rewire=False) + del node.inputs[1] model.replace_node(node, new_dense) return True diff --git a/hls4ml/model/optimizer/passes/merge_const.py b/hls4ml/model/optimizer/passes/merge_const.py index 11848c9081..8ffe053866 100644 --- a/hls4ml/model/optimizer/passes/merge_const.py +++ b/hls4ml/model/optimizer/passes/merge_const.py @@ -91,9 +91,11 @@ def transform(self, model, node): if node1const: const_node = node1 input_node_idx = 0 + const_node_idx = 1 else: const_node = node.get_input_node(node.inputs[0]) input_node_idx = 1 + const_node_idx = 0 input_shape = node.get_input_variable(node.inputs[input_node_idx]).shape n_in = np.prod(input_shape) @@ -108,14 +110,14 @@ def transform(self, model, node): if op in ('add', 'sum'): scale = np.array(1) scale_precision = IntegerPrecisionType(1, False) - bias = const_node.attribute['value'] + bias = const_node.attributes['value'] bias_quantizer = const_node.get_attr('quantizer') elif op == 'sub': bias_quantizer = const_node.get_attr('quantizer') if node1const: scale = np.array(1) scale_precision = IntegerPrecisionType(1, False) - bias = -const_node.attribute['value'] + bias = -const_node.attributes['value'] if ( bias_quantizer is not None and isinstance(bias_quantizer.hls_type, (IntegerPrecisionType, FixedPrecisionType)) @@ -134,10 +136,10 @@ def transform(self, model, node): else: scale = np.array(-1) scale_precision = IntegerPrecisionType(2, True) - bias = const_node.attribute['value'] + bias = const_node.attributes['value'] elif op == 'mul': - scale = const_node.attribute['value'] + scale = const_node.attributes['value'] scale_quantizer = const_node.get_attr('quantizer') bias = np.array(0) bias_precision = IntegerPrecisionType(1, False) @@ -168,6 +170,7 @@ def transform(self, model, node): ) model.remove_node(const_node, rewire=False) + del node.inputs[const_node_idx] model.replace_node(node, aa_layer) return True @@ -193,7 +196,7 @@ def transform(self, model, node): input_shape = node.get_input_variable().shape n_in = np.prod(input_shape) const_node = node.get_input_node(node.inputs[1]) - scale = 1 / const_node.attribute['value'] + scale = 1 / const_node.attributes['value'] scale_quantizer = 
const_node.get_attr('quantizer') if scale_quantizer: scale_precision = scale_quantizer.hls_type diff --git a/hls4ml/model/optimizer/passes/move_scales.py b/hls4ml/model/optimizer/passes/move_scales.py index e97fd89947..fe1acb7f94 100644 --- a/hls4ml/model/optimizer/passes/move_scales.py +++ b/hls4ml/model/optimizer/passes/move_scales.py @@ -16,7 +16,7 @@ class ScaleDownMatMul(OptimizerPass): def match(self, node): ''' Check to see if we have a MatMul with at least one input ApplyAlpha. - Note, if both are this optimition runs twice. + Note: if both inputs are ApplyAlpha, this optimizer runs twice. ''' is_match = ( isinstance(node, MatMul) @@ -85,7 +85,7 @@ def transform(self, model, node): try: np.broadcast_to(scale, output.shape) # check size compatibility newscale = scale - newbias = inp[other_idx].value * bias + newbias = inp[other_idx].attributes['value'] * bias np.broadcast_to(newbias, output.shape) can_propagate = True except ValueError: diff --git a/hls4ml/model/quantizers.py b/hls4ml/model/quantizers.py index cadcdbbc3d..daae66fe45 100644 --- a/hls4ml/model/quantizers.py +++ b/hls4ml/model/quantizers.py @@ -215,8 +215,10 @@ def _min_int(signed: bool, saturation_mode: str, bit_width: int) -> int: >>> min_int(signed=False, saturation_mode='AP_SAT_SYM', bit_width=8) int(0) """ - if saturation_mode not in (SaturationMode.SAT_SYM, SaturationMode.SAT): - raise ValueError(f'Saturation mode {saturation_mode} not supported. Only AP_SAT_SYM, AP_SAT supported') + if saturation_mode not in (SaturationMode.SAT_SYM, SaturationMode.SAT, SaturationMode.WRAP): + raise ValueError( + f'Saturation mode {saturation_mode} not supported. Only AP_SAT_SYM, AP_SAT supported, WRAP partially' + ) if signed and saturation_mode == SaturationMode.SAT_SYM: value = -(2 ** (bit_width - 1)) + 1 elif signed: diff --git a/hls4ml/model/types.py b/hls4ml/model/types.py index f9e75a7d87..9fe6867262 100644 --- a/hls4ml/model/types.py +++ b/hls4ml/model/types.py @@ -81,12 +81,12 @@ class IntegerPrecisionType(PrecisionType): def __init__(self, width=16, signed=True): super().__init__(width=width, signed=signed) - self.fractional = 0 def __str__(self): typestring = '{signed}int<{width}>'.format(signed='u' if not self.signed else '', width=self.width) return typestring + # Does this need to make sure other is also an IntegerPrecisionType?
I could see a match between Fixed and Integer def __eq__(self, other): eq = self.width == other.width eq = eq and self.signed == other.signed @@ -99,6 +99,10 @@ def __eq__(self, other): def integer(self): return self.width + @property + def fractional(self): + return 0 + @property def rounding_mode(self): return RoundingMode.TRN @@ -134,9 +138,10 @@ def __init__(self, width=16, integer=6, signed=True, rounding_mode=None, saturat self.saturation_bits = saturation_bits # make this a property to avoid inconsistencies + @property def fractional(self): - self.width - self.integer + return self.width - self.integer @property def rounding_mode(self): @@ -144,7 +149,9 @@ def rounding_mode(self): @rounding_mode.setter def rounding_mode(self, mode): - if isinstance(mode, str): + if mode is None: + self._rounding_mode = RoundingMode.TRN + elif isinstance(mode, str): self._rounding_mode = RoundingMode.from_string(mode) else: self._rounding_mode = mode @@ -155,7 +162,9 @@ def saturation_mode(self): @saturation_mode.setter def saturation_mode(self, mode): - if isinstance(mode, str): + if mode is None: + self._saturation_mode = SaturationMode.WRAP + elif isinstance(mode, str): self._saturation_mode = SaturationMode.from_string(mode) else: self._saturation_mode = mode From 9a74e46e33a715054496b408870675a35d4e19df Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Sun, 11 Feb 2024 18:07:24 -0600 Subject: [PATCH 050/272] fix some bugs from qonnx pytest --- .../model/optimizer/passes/batchnorm_opt.py | 19 ++++++++++--------- hls4ml/model/optimizer/passes/merge_const.py | 2 ++ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/hls4ml/model/optimizer/passes/batchnorm_opt.py b/hls4ml/model/optimizer/passes/batchnorm_opt.py index f633d763c8..ee00ecfa46 100644 --- a/hls4ml/model/optimizer/passes/batchnorm_opt.py +++ b/hls4ml/model/optimizer/passes/batchnorm_opt.py @@ -1,6 +1,6 @@ import numpy as np -from hls4ml.model.layers import ApplyAlpha, BatchNormalization, BatchNormOnnx, Constant +from hls4ml.model.layers import BatchNormalization, BatchNormOnnx, Constant from hls4ml.model.optimizer import OptimizerPass from hls4ml.model.quantizers import QuantNodeQuantizer from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, UnspecifiedPrecisionType @@ -33,7 +33,7 @@ def transform(self, model, node): raise TypeError('Only constant gammas supported') gamma = gamma_node.attributes['value'] attributes['gamma_data'] = gamma - attributes['gamma_quantizer'] = gamma_node.get_attr['quantizer'] + attributes['gamma_quantizer'] = gamma_node.get_attr('quantizer') node.inputs[1] = '' model.remove_node(gamma_node, rewire=False) @@ -43,7 +43,7 @@ def transform(self, model, node): raise TypeError('Only constant betas supported') beta = beta_node.attributes['value'] attributes['beta_data'] = beta - attributes['beta_quantizer'] = beta_node.get_attr['quantizer'] + attributes['beta_quantizer'] = beta_node.get_attr('quantizer') node.inputs[2] = '' model.remove_node(beta_node, rewire=False) @@ -52,7 +52,7 @@ def transform(self, model, node): raise TypeError('Only constant moving_means supported') moving_mean = moving_mean_node.attributes['value'] attributes['mean_data'] = moving_mean - attributes['mean_quantizer'] = moving_mean_node.get_attr['quantizer'] + attributes['mean_quantizer'] = moving_mean_node.get_attr('quantizer') node.inputs[3] = '' model.remove_node(moving_mean_node, rewire=False) @@ -61,13 +61,13 @@ def transform(self, model, node): raise TypeError('Only constant moving_variances supported')
moving_variance = moving_variance_node.attributes['value'] attributes['variance_data'] = moving_variance - attributes['variance_quantizer'] = moving_variance_node.get_attr['quantizer'] + attributes['variance_quantizer'] = moving_variance_node.get_attr('quantizer') node.inputs[4] = '' model.remove_node(moving_variance_node, rewire=False) node.inputs = [inp for inp in node.inputs if inp] if len(node.inputs) != 1: - raise RuntimeError('The QONNX batchnomr had unexpected inputs.') + raise RuntimeError('The QONNX batchnorm had unexpected inputs.') new_node = model.make_node(BatchNormalization, node.name, attributes, [node.inputs[0]], [x for x in node.outputs]) @@ -76,6 +76,7 @@ def transform(self, model, node): return True +# Most likely this case is removed by qonnx cleaning class ConstantBatchNormFusion(OptimizerPass): """ Merge BatchNorm into Const (after parameters have already been merged in BatchNormalization) @@ -149,7 +150,7 @@ class FuseConsecutiveBatchNormalization(OptimizerPass): OptimizerPass to merge consecutive BatchNormalization layers, only if the earlier one does not have quantization specified - Note: Consider restricting this to ApplyAlpha. Batch Normalization quantization seems to be ignored. + Note: Consider restricting this to ApplyAlpha. Batch Normalization-style quantization seems to be ignored. Note: This optimizer may not be safe if weights are updateable. May need to turn off. """ @@ -157,8 +158,8 @@ class FuseConsecutiveBatchNormalization(OptimizerPass): def match(self, node): prev_node = node.get_input_node(node.inputs[0]) basic_match = ( - isinstance(node, ApplyAlpha) - and isinstance(prev_node, ApplyAlpha) + isinstance(node, BatchNormalization) + and isinstance(prev_node, BatchNormalization) and isinstance(prev_node.get_output_variable().type.precision, UnspecifiedPrecisionType) ) diff --git a/hls4ml/model/optimizer/passes/merge_const.py b/hls4ml/model/optimizer/passes/merge_const.py index 8ffe053866..25bd59bda6 100644 --- a/hls4ml/model/optimizer/passes/merge_const.py +++ b/hls4ml/model/optimizer/passes/merge_const.py @@ -8,6 +8,7 @@ _base_attributes = ('Trace', 'reuse_factor', 'n_in') +# This should generally not happen because of qonnx cleaning class MergeTwoConstants(OptimizerPass): """Merge of two constants makes another constant""" @@ -237,6 +238,7 @@ def transform(self, model, node): bn_layer = model.make_node(ApplyAlpha, f'bn_{node.name}', attributes, [node.inputs[0]], [x for x in node.outputs]) model.remove_node(const_node, rewire=False) + del node.inputs[1] model.replace_node(node, bn_layer) return True From 60a74bb49e906149f64401678bcb7f0ba4e4eff4 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Mon, 12 Feb 2024 09:59:48 -0600 Subject: [PATCH 051/272] fix assertion of not matching the number of inputs when replacing node --- hls4ml/model/optimizer/passes/conv_to_convxd.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hls4ml/model/optimizer/passes/conv_to_convxd.py b/hls4ml/model/optimizer/passes/conv_to_convxd.py index b61b0340be..e54c98c1d7 100644 --- a/hls4ml/model/optimizer/passes/conv_to_convxd.py +++ b/hls4ml/model/optimizer/passes/conv_to_convxd.py @@ -81,9 +81,11 @@ def transform(self, model, node): ) # removing and replacing old nodes - model.remove_node(weight_node, rewire=False) if bias_node: model.remove_node(bias_node, rewire=False) + del node.inputs[2] + model.remove_node(weight_node, rewire=False) + del node.inputs[1] model.replace_node(node, new_node) return True From 88a8d351b158145ef2d1f6d0a9daed9b159a7241 Mon 
Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 29 Feb 2024 16:36:54 -0600 Subject: [PATCH 052/272] update some precisions inference --- .../model/optimizer/passes/infer_precision.py | 121 ++++++++++++++++-- 1 file changed, 109 insertions(+), 12 deletions(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index 6f6a72097f..c660647d3b 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -1,11 +1,12 @@ import math -from copy import deepcopy import numpy as np from hls4ml.model.optimizer import ConfigurableOptimizerPass from hls4ml.model.types import FixedPrecisionType, UnspecifiedPrecisionType +# TODO: The code assumes everything is Fixed or Integer precision. Need to add checks + class InferPrecisionTypes(ConfigurableOptimizerPass): def __init__(self): @@ -36,7 +37,7 @@ def _infer_precision(self, node, types_to_infer): if node_class in ['Dense']: return self._infer_dense_precision(node, types_to_infer) - if node_class in ['BatchNormalization']: + if node_class in ['BatchNormalization', 'ApplyAlpha']: return self._infer_bn_precision(node, types_to_infer) if node_class in ['Conv1D', 'Conv2D', 'PointwiseConv1D', 'PointwiseConv2D', 'Conv2DBatchnorm']: @@ -51,9 +52,15 @@ def _infer_precision(self, node, types_to_infer): if node_class in ['Clone', 'Reshape', 'Resize', 'Transpose', 'ZeroPadding1D', 'ZeroPadding2D']: return self._infer_output_matching_precision(node, types_to_infer) - if node_class in ['Concatenate', 'Merge']: + if node_class in ['Merge']: return self._infer_merge_precision(node, types_to_infer) + if node_class in ['Concatenate']: + return self._infer_cat_precision(node, types_to_infer) + + if node_class in ['Dot']: + return self._infer_dot_precision(node, types_to_infer) + # What about quantized activation layer? Setting it to 'auto' manually will break it here. We should prevent # this in config_from_* functions @@ -124,6 +131,7 @@ def _infer_common_precision(self, node, types_to_infer, n_ops): bitwidth = integers + max(frac, bias_width - bias_integers) signed = signed or bias_signed + # Note: this is guaranteed to not overflow or need rounding, so it's sufficient to use the simpler form. new_type = FixedPrecisionType(bitwidth, integers, signed) if 'accum_t' in types_to_infer: @@ -225,6 +233,11 @@ def _infer_sepconv_precision(self, node, types_to_infer): return inferred_types def _infer_bn_precision(self, node, types_to_infer): + """ + The batch normalization precision here is the more implementation-focused version. It propagates + precision from scale and bias, not mean, variance, etc.
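        Worked example of the rules below: with input fixed<16,6>, scale fixed<8,3>, and bias fixed<8,3>, all signed, the scaled product is 24 bits wide with 9 integer bits; adding the bias then gives integer = max(9, 3) + 1 = 10 and width = 10 + max(15, 5) = 25, i.e. fixed<25,10>.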
+ """ + inferred_types = [] if 'scale_t' in types_to_infer: @@ -238,16 +251,28 @@ def _infer_bn_precision(self, node, types_to_infer): inferred_types.append('bias_t') if 'result_t' in types_to_infer: + input_precision = node.get_input_variable().type.precision scale_precision = node.types['scale_t'].precision bias_precision = node.types['bias_t'].precision - out_precision = deepcopy(node.get_input_node().get_output_variable().type.precision) - out_precision.integer += scale_precision.integer - out_precision.fractional = max(out_precision.fractional, scale_precision.fractional) + after_scale_signed = scale_precision.signed or input_precision.signed + after_scale_width = input_precision.width + scale_precision.width + after_scale_integer = input_precision.integer + scale_precision.integer + + out_precision_signed = after_scale_signed or bias_precision.signed + out_precision_integer = ( + max( + after_scale_integer + (bias_precision.signed and not after_scale_signed), + bias_precision.integer + (after_scale_signed and not bias_precision.signed), + ) + + 1 + ) + out_precision_width = out_precision_integer + max( + after_scale_width - after_scale_integer, bias_precision.fractional + ) - out_precision.integer = max(out_precision.integer, bias_precision.integer) + 1 - out_precision.fractional = max(out_precision.fractional, bias_precision.fractional) - out_precision.width = out_precision.fractional + out_precision.integer + # Note: this is guaranteed to not overflow or need rounding, so it's sufficient to use the simpler form. + out_precision = FixedPrecisionType(out_precision_width, out_precision_integer, out_precision_signed) node.types['result_t'].name = node.name + '_result_t' node.types['result_t'].precision = out_precision @@ -288,10 +313,82 @@ def _infer_merge_precision(self, node, types_to_infer): input_1 = node.get_input_variable(node.inputs[0]).type.precision input_2 = node.get_input_variable(node.inputs[1]).type.precision - new_width = max(input_1.fractional, input_2.fractional) + max(input_1.integer, input_2.integer) - new_int = max(input_1.integer, input_2.integer) + op = node.get_attr('op').lower() + if op in ('add', 'subtract', 'average'): + new_signed = input_1.signed or input_2.signed or op == 'subtract' + new_int = ( + max( + input_1.integer + (input_2.signed and not input_1.signed), + input_2.integer + (input_1.signed and not input_2.signed), + ) + + 1 + ) + new_width = new_int + max(input_1.fractional, input_2.fractional) + + elif op == 'multiply': + new_signed = input_1.signed or input_2.signed + new_int = input_1.integer + input_2.integer + new_width = input_1.width + input_2.width + elif op in ('maximum', 'minimum'): + new_signed = input_1.signed or input_2.signed + + input_1_integer = input_1.integer + input_2_integer = input_2.integer + + # add one to integer if unsigned while new is signed + if new_signed and not input_1.signed: + input_1_integer += 1 + if new_signed and not input_2.signed: + input_2_integer += 1 + + new_width = max(input_1.fractional, input_2.fractional) + max(input_1_integer, input_2_integer) + new_int = max(input_1_integer, input_2_integer) + + out_precision = FixedPrecisionType(new_width, new_int, new_signed) + node.types['result_t'].name = node.name + '_result_t' + node.types['result_t'].precision = out_precision + + return ['result_t'] + + def _infer_cat_precision(self, node, types_to_infer): + assert 'result_t' in types_to_infer and len(types_to_infer) == 1 + + input_1 = node.get_input_variable(node.inputs[0]).type.precision + input_2 = 
node.get_input_variable(node.inputs[1]).type.precision + + new_signed = input_1.signed or input_2.signed + + input_1_integer = input_1.integer + input_2_integer = input_2.integer + + # add one to integer if unsigned while new is signed + if new_signed and not input_1.signed: + input_1_integer += 1 + if new_signed and not input_2.signed: + input_2_integer += 1 + + new_width = max(input_1.fractional, input_2.fractional) + max(input_1_integer, input_2_integer) + new_int = max(input_1_integer, input_2_integer) + + out_precision = FixedPrecisionType(new_width, new_int, new_signed) + node.types['result_t'].name = node.name + '_result_t' + node.types['result_t'].precision = out_precision + + return ['result_t'] + + def _infer_dot_precision(self, node, types_to_infer): + assert 'result_t' in types_to_infer and len(types_to_infer) == 1 + + input_1 = node.get_input_variable(node.inputs[0]).type.precision + input_2 = node.get_input_variable(node.inputs[1]).type.precision + + n_in = node.get_input_variable(node.inputs[0]).shape[0] + + new_signed = input_1.signed or input_2.signed + new_width = input_1.width + input_2.width + math.ceil(np.log2(n_in)) + new_int = input_1.integer + input_2.integer + math.ceil(np.log2(n_in)) - out_precision = FixedPrecisionType(new_width, new_int) + out_precision = FixedPrecisionType(new_width, new_int, new_signed) node.types['result_t'].name = node.name + '_result_t' node.types['result_t'].precision = out_precision From 10a3c500b79ad1b4fded24c860f41ad9732a4afb Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 29 Feb 2024 17:33:24 -0600 Subject: [PATCH 053/272] extract bitwidth from size 1 array in quant node --- hls4ml/model/optimizer/passes/quant_opt.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hls4ml/model/optimizer/passes/quant_opt.py b/hls4ml/model/optimizer/passes/quant_opt.py index 79d92ec4d1..0d02124bc6 100644 --- a/hls4ml/model/optimizer/passes/quant_opt.py +++ b/hls4ml/model/optimizer/passes/quant_opt.py @@ -13,6 +13,7 @@ and Linear nodes are immediately merged into the Constant. 
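For example, a Quant node with bitwidth = 8, scale = 2**-4, and zeropt = 0 presumably becomes a linear Activation whose output is a signed fixed-point type with 8 total bits and 4 integer bits, since the power-of-2 scale fixes the number of fractional bits.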
""" + import math # prefer to use math.ceil for scalar values import numpy as np @@ -67,7 +68,7 @@ def transform(self, model, node): bitwidth = bitwidth_node.get_attr('value') if bitwidth.size != 1: raise RuntimeError('Only scalar bitwidth values are supporeted by the Quant node') - node.set_attr('bitwidth', bitwidth) + node.set_attr('bitwidth', bitwidth[0]) node.inputs[3] = '' model.remove_node(bitwidth_node, rewire=False) From ab8d67b2ce9318106203d99dba12533570f0494d Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 1 Mar 2024 19:55:08 -0600 Subject: [PATCH 054/272] update automatic onnx configuration --- hls4ml/converters/__init__.py | 1 + hls4ml/converters/onnx_to_hls.py | 38 +++++++++++++----- hls4ml/model/graph.py | 3 +- hls4ml/utils/config.py | 66 ++++++++++++++++++++++++++++++-- test/pytest/test_qonnx.py | 6 +-- 5 files changed, 98 insertions(+), 16 deletions(-) diff --git a/hls4ml/converters/__init__.py b/hls4ml/converters/__init__.py index 0bc7ccdbe7..b7bcb05b9e 100644 --- a/hls4ml/converters/__init__.py +++ b/hls4ml/converters/__init__.py @@ -10,6 +10,7 @@ from hls4ml.converters.keras_to_hls import get_supported_keras_layers # noqa: F401 from hls4ml.converters.keras_to_hls import parse_keras_model # noqa: F401 from hls4ml.converters.keras_to_hls import keras_to_hls, register_keras_layer_handler +from hls4ml.converters.onnx_to_hls import parse_onnx_model # noqa: F401 from hls4ml.model import ModelGraph from hls4ml.utils.config import create_config from hls4ml.utils.symbolic_utils import LUTFunction diff --git a/hls4ml/converters/onnx_to_hls.py b/hls4ml/converters/onnx_to_hls.py index 8f6c7461fb..75850fa93e 100644 --- a/hls4ml/converters/onnx_to_hls.py +++ b/hls4ml/converters/onnx_to_hls.py @@ -162,26 +162,23 @@ def get_out_layer_name(graph): return [node.name for node in graph.node if node.output[0] in output_index_list] -def onnx_to_hls(config): - """Convert onnx model to hls model from configuration. +def parse_onnx_model(onnx_model): + """Parses the onnx model, both for configuration building and general processing. Args: - config (dict): ONNX configuration from yaml file or passed through API. + onnx_model: an ONNX model object. Raises: Exception: Raised if an unsupported operation is found in the ONNX model. Returns: - ModelGraph: hls4ml model object + layer_list (list): The onnx layers + input_layers (list): The input layers + output_layers (list): The output layers """ # This is a list of dictionaries to hold all the layer info we need to generate HLS layer_list = [] - # Extract model architecture - print('Interpreting Model ...') - - onnx_model = onnx.load(config['OnnxModel']) if isinstance(config['OnnxModel'], str) else config['OnnxModel'] - # We don't infer the shapes because the qonnx package preprocessing does it. # Obtain list of input/ouput layers @@ -257,6 +254,29 @@ def onnx_to_hls(config): print(f"Layer name: {layer['name']}, layer type: {layer['class_name']}, current shape: {input_shapes}") layer_list.append(layer) + return layer_list, input_layers, output_layers + + +def onnx_to_hls(config): + """Convert onnx model to hls model from configuration. + + Args: + config (dict): ONNX configuration from yaml file or passed through API. + + Raises: + Exception: Raised if an unsupported operation is found in the ONNX model. 
+ + Returns: + ModelGraph: hls4ml model object + """ + + # Extract model architecture + print('Interpreting Model ...') + + onnx_model = onnx.load(config['OnnxModel']) if isinstance(config['OnnxModel'], str) else config['OnnxModel'] + + layer_list, input_layers, output_layers = parse_onnx_model(onnx_model) + ################# # Generate HLS ################# diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index a6b5c29e89..f0d29237b7 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -99,7 +99,8 @@ def get_precision(self, layer, var='default'): type_name = layer.name.lower() + '_' + var + '_t' if precision is None: precision = self.layer_name_precision.get(layer.name.lower() + '_default') - type_name = layer.name.lower() + '_default_t' + # I think it is better to keep these unique still to avoid inadvertent updates + # type_name = layer.name.lower() + '_default_t' if precision is None: precision = self.layer_type_precision.get(layer.class_name.lower() + '_' + var) diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py index 5d7ca1ae72..5c41a52a82 100644 --- a/hls4ml/utils/config.py +++ b/hls4ml/utils/config.py @@ -321,7 +321,7 @@ def config_from_pytorch_model( def config_from_onnx_model( - model, granularity='model', backend=None, default_precision='ap_fixed<16,6>', default_reuse_factor=1 + model, granularity='name', backend=None, default_precision='ap_fixed<16,6>', default_reuse_factor=1 ): """Create an HLS conversion config given the ONNX model. @@ -331,8 +331,8 @@ def config_from_onnx_model( Args: model: ONNX model - granularity (str, optional): Granularity of the created config. Defaults to 'model'. - Can be set to 'model', 'type' and 'layer'. + granularity (str, optional): Granularity of the created config. Defaults to 'name'. + Can be set to 'model', 'type' and 'name'. Granularity can be used to generate a more verbose config that can be fine-tuned. The default granularity ('model') will generate config keys that apply to the whole @@ -351,6 +351,16 @@ def config_from_onnx_model( [dict]: The created config. """ + if granularity.lower() not in ['model', 'type', 'name']: + raise Exception( + f'Invalid configuration granularity specified, expected "model", "type" or "name" got "{granularity}"' + ) + + if backend is not None: + backend = hls4ml.backends.get_backend(backend) + elif granularity.lower() != 'model': + print('Warning: it is recommended to pass the backend to "config_from_onnx_model"') + config = {} model_config = {} @@ -360,4 +370,54 @@ def config_from_onnx_model( config['Model'] = model_config + layer_list, _, _ = hls4ml.converters.parse_onnx_model(model) + + def make_layer_config(layer): + cls_name = layer['class_name'] + + layer_cls = hls4ml.model.layers.layer_map[cls_name] + if backend is not None: + layer_cls = backend.create_layer_class(layer_cls) + + layer_config = {} + + # set the default precision of the layer to auto? 
+ # (not really necessary if we set the backend appropriately) + # layer_config['Precision'] = {'default': 'auto'} + + config_attrs = [a for a in layer_cls.expected_attributes if a.configurable] + for attr in config_attrs: + if isinstance(attr, hls4ml.model.attributes.TypeAttribute): + precision_cfg = layer_config.setdefault('Precision', {}) + name = attr.name + if name.endswith('_t'): + name = name[:-2] + if attr.default is None: + precision_cfg[name] = 'auto' + else: + precision_cfg[name] = str(attr.default) + else: + if attr.default is not None: + layer_config[attr.config_name] = attr.default + + return layer_config + + if granularity.lower() == 'type': + type_config = {} + for layer in layer_list: + if layer['class_name'] in type_config: + continue + layer_config = make_layer_config(layer) + type_config[layer['class_name']] = layer_config + + config['LayerType'] = type_config + + elif granularity.lower() == 'name': + name_config = {} + for layer in layer_list: + layer_config = make_layer_config(layer) + name_config[layer['name']] = layer_config + + config['LayerName'] = name_config + return config diff --git a/test/pytest/test_qonnx.py b/test/pytest/test_qonnx.py index 2c314c13ca..529a5adebc 100644 --- a/test/pytest/test_qonnx.py +++ b/test/pytest/test_qonnx.py @@ -93,7 +93,7 @@ def test_tfc_2w2a(tfc_2w2a_model, backend): y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] # Convert QONNX model, compile, and run inference - config = hls4ml.utils.config_from_onnx_model(model) + config = hls4ml.utils.config_from_onnx_model(model, backend=backend) # Some hand-derived config config['LayerName'] = {} config['LayerName']['global_in'] = {'Precision': 'ap_fixed<16,2>'} @@ -116,7 +116,7 @@ def test_cnv_2w2a(cnv_2w2a_model, backend): y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] # Convert QONNX model, compile, and run inference - config = hls4ml.utils.config_from_onnx_model(model, default_precision='fixed<32,16>') + config = hls4ml.utils.config_from_onnx_model(model, backend=backend, default_precision='fixed<32,16>') hls_model = hls4ml.converters.convert_from_onnx_model( model, output_dir=str(test_root_path / f'hls4mlprj_qonnx_cnv-2w2a_{backend}'), @@ -142,7 +142,7 @@ def test_jet_tagging(jettagging_model, backend): y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] # Convert QONNX model, compile, and run inference - config = hls4ml.utils.config_from_onnx_model(model) + config = hls4ml.utils.config_from_onnx_model(model, backend=backend) hls_model = hls4ml.converters.convert_from_onnx_model( model, output_dir=str(test_root_path / f'hls4mlprj_qonnx_jettag_{backend}'), backend=backend, hls_config=config From 0a863adcc25b5facae2b9b375bf4c7fa1bc41ecc Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Sat, 2 Mar 2024 12:49:43 -0600 Subject: [PATCH 055/272] standardize on merge operators --- hls4ml/converters/onnx/merge.py | 14 +++++++++++- .../model/optimizer/passes/infer_precision.py | 12 ++++++++-- hls4ml/model/optimizer/passes/merge_const.py | 22 +++++++++---------- 3 files changed, 34 insertions(+), 14 deletions(-) diff --git a/hls4ml/converters/onnx/merge.py b/hls4ml/converters/onnx/merge.py index 2309cc213f..420f077ec2 100644 --- a/hls4ml/converters/onnx/merge.py +++ b/hls4ml/converters/onnx/merge.py @@ -2,13 +2,25 @@ merge_layers = ['Add', 'Sub', 'Mul', 'Div', 'Average', 'Max', 'Min', 'Concat', 'Sum'] +op_map = { + 'Add': 'add', + 'Sub': 'subtract', + 'Mul': 'multiply', + 'Div': 'divide', + 'Average': 'average', + 'Max': 'maximum', + 
'Min': 'minimum', + 'Sum': 'add', + 'Concat': 'concat', +} + @onnx_handler(*merge_layers) def parse_merge_layer(node, input_names, input_shapes, graph): layer = {} layer['class_name'] = node.op_type layer['name'] = node.name - layer['op'] = layer['class_name'].lower() + layer['op'] = op_map[node.op_type] layer['inputs'] = input_names layer['outputs'] = list(node.output) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index c660647d3b..d1629a7a13 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -66,6 +66,10 @@ def _infer_precision(self, node, types_to_infer): return [] + def _get_default_precision(self, node): + model_config = node.model.config + return model_config.backend.convert_precision_string(model_config.model_precision['default']) + def _infer_default_type(self, node, type_name): model_config = node.model.config default_precision = model_config.backend.convert_precision_string(model_config.model_precision['default']) @@ -324,11 +328,12 @@ def _infer_merge_precision(self, node, types_to_infer): + 1 ) new_width = new_int + max(input_1.fractional, input_2.fractional) - + out_precision = FixedPrecisionType(new_width, new_int, new_signed) elif op == 'multiply': new_signed = input_1.signed or input_2.signed new_int = input_1.integer + input_2.integer new_width = input_1.width + input_2.width + out_precision = FixedPrecisionType(new_width, new_int, new_signed) elif op in ('maximum', 'minimum'): new_signed = input_1.signed or input_2.signed @@ -343,8 +348,11 @@ def _infer_merge_precision(self, node, types_to_infer): new_width = max(input_1.fractional, input_2.fractional) + max(input_1_integer, input_2_integer) new_int = max(input_1_integer, input_2_integer) + out_precision = FixedPrecisionType(new_width, new_int, new_signed) + else: + print(f'Warning: not propagating weights for type {op}') + out_precision = self._get_default_precision(node) - out_precision = FixedPrecisionType(new_width, new_int, new_signed) node.types['result_t'].name = node.name + '_result_t' node.types['result_t'].precision = out_precision diff --git a/hls4ml/model/optimizer/passes/merge_const.py b/hls4ml/model/optimizer/passes/merge_const.py index 25bd59bda6..54f275d9ec 100644 --- a/hls4ml/model/optimizer/passes/merge_const.py +++ b/hls4ml/model/optimizer/passes/merge_const.py @@ -35,19 +35,19 @@ def transform(self, model, node): val1 = const_node1.attributes['value'] op = node.attributes['op'] - if op in ('add', 'sum'): + if op == 'add': new_val = val0 + val1 - elif op == 'sub': + elif op == 'subtract': new_val = val0 - val1 - elif op == 'mul': + elif op == 'multiply': new_val = val0 * val1 - elif op == 'div': + elif op == 'divide': new_val = val0 / val1 elif op == 'average': new_val = np.mean(np.array([val0, val1]), axis=0) - elif op == 'max': + elif op == 'maximum': new_val = np.maximum(val0, val1) - elif op == 'min': + elif op == 'minimum': new_val = np.minimum(val0, val1) else: raise RuntimeError(f'Unexpected op_type: {op}') @@ -76,7 +76,7 @@ class MergeToApplyAlpha(OptimizerPass): def match(self, node): is_match = ( isinstance(node, Merge) - and node.attributes['op'] in ('add', 'sum', 'sub', 'mul') # Div is separate + and node.attributes['op'] in ('add', 'subtract', 'multiply') # Div is separate and ( isinstance(node.get_input_node(node.inputs[0]), Constant) != isinstance(node.get_input_node(node.inputs[1]), Constant) @@ -108,12 +108,12 @@ def transform(self, model, node): 
bias_quantizer = None op = node.attributes['op'] - if op in ('add', 'sum'): + if op == 'add': scale = np.array(1) scale_precision = IntegerPrecisionType(1, False) bias = const_node.attributes['value'] bias_quantizer = const_node.get_attr('quantizer') - elif op == 'sub': + elif op == 'subtract': bias_quantizer = const_node.get_attr('quantizer') if node1const: scale = np.array(1) @@ -139,7 +139,7 @@ def transform(self, model, node): scale_precision = IntegerPrecisionType(2, True) bias = const_node.attributes['value'] - elif op == 'mul': + elif op == 'multiply': scale = const_node.attributes['value'] scale_quantizer = const_node.get_attr('quantizer') bias = np.array(0) @@ -187,7 +187,7 @@ class MergeToApplyAlphaDiv(OptimizerPass): def match(self, node): is_match = ( isinstance(node, Merge) - and node.attributes['op'] == 'div' + and node.attributes['op'] == 'divide' and isinstance(node.get_input_node(node.inputs[1]), Constant) ) # only second can be const From bfe6a3f6650705ac2a845949b654f48fdb86acfa Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 7 Mar 2024 19:52:01 -0600 Subject: [PATCH 056/272] snapshot of current work --- hls4ml/model/graph.py | 61 +++++++++------- hls4ml/model/layers.py | 5 -- hls4ml/model/optimizer/__init__.py | 17 +++-- .../model/optimizer/passes/batchnorm_opt.py | 2 +- hls4ml/model/optimizer/passes/bn_fuse.py | 49 ++++--------- .../model/optimizer/passes/conv_to_convxd.py | 20 +++--- .../model/optimizer/passes/infer_precision.py | 12 +++- hls4ml/model/optimizer/passes/linear.py | 8 ++- .../optimizer/passes/matmul_const_to_dense.py | 32 ++++----- hls4ml/model/optimizer/passes/merge_const.py | 72 ++++++++++--------- hls4ml/model/optimizer/passes/quant_opt.py | 65 ++++++++++++----- hls4ml/model/types.py | 26 +------ 12 files changed, 185 insertions(+), 184 deletions(-) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index f0d29237b7..33b367a929 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -94,6 +94,11 @@ def get_layer_config(self, layer): return layer_config + def set_name_config(self, name, config): + hls_config = self.config['HLSConfig'] + layer_config = hls_config.setdefault('LayerName', {}) + layer_config[name] = config + def get_precision(self, layer, var='default'): precision = self.layer_name_precision.get(layer.name.lower() + '_' + var) type_name = layer.name.lower() + '_' + var + '_t' @@ -178,6 +183,35 @@ def get_compression(self, layer): return compression + def parse_name_config(self, layer_name, layer_cfg): + """This is used by _parse_hls_config below, but also in optimizers when a new layer config is created""" + precision_cfg = layer_cfg.get('Precision') + if isinstance(precision_cfg, dict): + for var, precision in precision_cfg.items(): + self.layer_name_precision[layer_name.lower() + '_' + var] = precision + else: + self.layer_name_precision[layer_name.lower() + '_default'] = precision_cfg + + rf = layer_cfg.get('ReuseFactor') + if rf is not None: + self.layer_name_rf[layer_name.lower()] = rf + + targ_cycles = layer_cfg.get('TargetCycles') + if targ_cycles is not None: + self.layer_name_targ_cycles[layer_name.lower()] = targ_cycles + + strategy = layer_cfg.get('Strategy') + if strategy is not None: + self.layer_name_strategy[layer_name.lower()] = strategy + + conv_implementation = layer_cfg.get('ConvImplementation') + if conv_implementation is not None: + self.layer_name_conv_implementation[layer_name.lower()] = conv_implementation + + compression = layer_cfg.get('Compression') + if compression is not None: + 
self.layer_name_compression[layer_name.lower()] = bool(compression) + def _parse_hls_config(self): hls_config = self.config['HLSConfig'] @@ -250,32 +284,7 @@ def _parse_hls_config(self): layer_name_cfg = hls_config.get('LayerName') if layer_name_cfg is not None: for layer_name, layer_cfg in layer_name_cfg.items(): - precision_cfg = layer_cfg.get('Precision') - if isinstance(precision_cfg, dict): - for var, precision in precision_cfg.items(): - self.layer_name_precision[layer_name.lower() + '_' + var] = precision - else: - self.layer_name_precision[layer_name.lower() + '_default'] = precision_cfg - - rf = layer_cfg.get('ReuseFactor') - if rf is not None: - self.layer_name_rf[layer_name.lower()] = rf - - targ_cycles = layer_cfg.get('TargetCycles') - if targ_cycles is not None: - self.layer_name_targ_cycles[layer_name.lower()] = targ_cycles - - strategy = layer_cfg.get('Strategy') - if strategy is not None: - self.layer_name_strategy[layer_name.lower()] = strategy - - conv_implementation = layer_cfg.get('ConvImplementation') - if conv_implementation is not None: - self.layer_name_conv_implementation[layer_name.lower()] = conv_implementation - - compression = layer_cfg.get('Compression') - if compression is not None: - self.layer_name_compression[layer_name.lower()] = bool(compression) + self.parse_name_config(layer_name, layer_cfg) def _validate_hls_config(self): use_dataflow = False diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index d0ac7e5561..a5130fa7bb 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -249,11 +249,6 @@ def add_output_variable( self.set_attr(out_name, out) - def update_output_precision(self, precision, output_name=None): - if output_name is None: - output_name = self.outputs[0] - self.variables[output_name].type.precision = precision - def add_weights(self, quantizer=None, compression=False): self.add_weights_variable( name='weight', var_name='w{index}', data='weight', quantizer=quantizer, compression=compression diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index bd4da19071..f05f8e3e04 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -33,11 +33,6 @@ register_flow( 'convert', [ - 'infer_precision_types', - 'channels_last_converter', - 'fuse_bias_add', - 'remove_useless_transpose', - 'expand_layer_group', 'reshape_constant', 'quant_constant_parameters', 'quant_to_activation', @@ -56,10 +51,17 @@ 'merge_to_apply_alpha_div', 'matmul_const_to_dense', 'conv_to_conv_x_d', + 'fuse_consecutive_batch_normalization', # needs to be before infer_precision_types + 'merge_linear_activation', # needs to be before infer_precision_types + 'fuse_batch_normalization', # needs to be before infer_precision_types + 'infer_precision_types', + 'channels_last_converter', + 'fuse_bias_add', + 'remove_useless_transpose', + 'expand_layer_group', 'output_rounding_saturation_mode', 'qkeras_factorize_alpha', 'extract_ternary_threshold', - 'fuse_consecutive_batch_normalization', ], ) # TODO Maybe not all QKeras optmizers belong here? 
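
The reordering above encodes an ordering contract: passes that rewrite weights, such as fuse_batch_normalization and merge_linear_activation, must now run before infer_precision_types so that widths are inferred from the fused weights rather than the originals. A hedged sketch of the same registration mechanism, assuming register_flow is importable from hls4ml.model.flow as it is elsewhere in the codebase; the flow name is illustrative:

    from hls4ml.model.flow import register_flow

    # list order is execution order; requires=['convert'] runs the convert flow first
    register_flow(
        'my_extension',
        ['fuse_batch_normalization', 'infer_precision_types'],
        requires=['convert'],
    )
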
@@ -67,13 +69,10 @@ 'optimize', [ 'eliminate_linear_activation', - 'fuse_consecutive_batch_normalization', - 'fuse_batch_normalization', 'remove_nop_batch_normalization', 'replace_multidimensional_dense_with_conv', 'infer_precision_types', 'set_precision_concat', - 'merge_linear_activation', ], requires=['convert'], ) diff --git a/hls4ml/model/optimizer/passes/batchnorm_opt.py b/hls4ml/model/optimizer/passes/batchnorm_opt.py index ee00ecfa46..1800b33056 100644 --- a/hls4ml/model/optimizer/passes/batchnorm_opt.py +++ b/hls4ml/model/optimizer/passes/batchnorm_opt.py @@ -5,7 +5,7 @@ from hls4ml.model.quantizers import QuantNodeQuantizer from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, UnspecifiedPrecisionType -_base_attributes = ('Trace', 'reuse_factor', 'epsilon', 'n_in', 'n_filt') +_base_attributes = ('epsilon', 'n_in', 'n_filt') class BatchNormOnnxConstantParameters(OptimizerPass): diff --git a/hls4ml/model/optimizer/passes/bn_fuse.py b/hls4ml/model/optimizer/passes/bn_fuse.py index a636af2f86..c84430f13f 100644 --- a/hls4ml/model/optimizer/passes/bn_fuse.py +++ b/hls4ml/model/optimizer/passes/bn_fuse.py @@ -29,13 +29,13 @@ def match(self, node): b1 = node.weights['bias'].data_unquantized scale_compatible = ( (prev_node.get_attr('weight_quantizer') is None and node.get_attr('scale_quantizer') is None) - or (s0 == np.ones_like(s0)).all() - or (s1 == np.ones_like(s1)).all() + or ((s0 == np.ones_like(s0)).all() and prev_node.get_attr('weight_quantizer') is None) + or ((s1 == np.ones_like(s1)).all() and node.get_attr('scale_quantizer') is None) ) bias_compatible = ( (prev_node.get_attr('bias_quantizer') is None and node.get_attr('bias_quantizer') is None) - or (b0 == np.zeros_like(b0)).all() - or (b1 == np.zeros_like(b1)).all() + or ((b0 == np.zeros_like(b0)).all() and prev_node.get_attr('bias_quantizer') is None) + or ((b1 == np.zeros_like(b1)).all() and node.get_attr('bias_quantizer') is None) ) return scale_compatible and bias_compatible @@ -60,12 +60,14 @@ def transform(self, model, node): bn_scale = node.weights['scale'] bn_bias = node.weights['bias'] + allowed_precisions = (IntegerPrecisionType, FixedPrecisionType, UnspecifiedPrecisionType) + # only merge if the types are integer or fixed if ( - not isinstance(parent_weight.type, (IntegerPrecisionType, FixedPrecisionType)) - or not isinstance(parent_bias.type, (IntegerPrecisionType, FixedPrecisionType)) - or not isinstance(bn_scale.type, (IntegerPrecisionType, FixedPrecisionType)) - or not isinstance(bn_bias.type, (IntegerPrecisionType, FixedPrecisionType)) + not isinstance(parent_weight.type.precision, allowed_precisions) + or not isinstance(parent_bias.type.precision, allowed_precisions) + or not isinstance(bn_scale.type.precision, allowed_precisions) + or not isinstance(bn_bias.type.precision, allowed_precisions) ): return False @@ -74,44 +76,21 @@ def transform(self, model, node): w_quantizer = ( node.get_attr('scale_quantizer') - if (parent_weight.data == np.ones_like(parent_weight.data)).all() + if node.get_attr('scale_quantizer') is not None else parent_node.get_attr('weight_quantizer') ) b_quantizer = ( node.get_attr('bias_quantizer') - if (parent_bias.data == np.zeros_like(parent_bias.data)).all() + if node.get_attr('bias_quantizer') is not None else parent_node.get_attr('bias_quantizer') ) node.set_attr('weight_quantizer', w_quantizer) node.set_attr('bias_quantizer', b_quantizer) - # Not sure if this setting of this is useful - w_prec = None - if w_quantizer is None and (fused_weight == 
np.ones_like(fused_weight)).all(): - if ( - isinstance(parent_weight.type, IntegerPrecisionType) - and isinstance(bn_scale.type, IntegerPrecisionType) - and parent_weight.type.width == 1 - and bn_scale.type.width == 1 - ): - w_prec = node.weights['scale'].type - - b_prec = None - if b_quantizer is None and (fused_bias == np.zeros_like(fused_bias)).all(): - if ( - isinstance(parent_bias.type, IntegerPrecisionType) - and isinstance(bn_bias.type, IntegerPrecisionType) - and parent_bias.type.width == 1 - and bn_bias.type.width == 1 - ): - b_prec = node.weights['bias'].type - # call function so that quantizer would be called if needed - node.add_weights_variable( - name='weight', var_name='w{index}', data=fused_weight, quantizer=w_quantizer, precision=w_prec - ) - node.add_weights_variable(name='bias', var_name='b{index}', data=fused_bias, quantizer=b_quantizer, precision=b_prec) + node.add_weights_variable(name='weight', var_name='w{index}', data=fused_weight, quantizer=w_quantizer) + node.add_weights_variable(name='bias', var_name='b{index}', data=fused_bias, quantizer=b_quantizer) model.remove_node(node, rewire=True) diff --git a/hls4ml/model/optimizer/passes/conv_to_convxd.py b/hls4ml/model/optimizer/passes/conv_to_convxd.py index e54c98c1d7..6fb88ad0d0 100644 --- a/hls4ml/model/optimizer/passes/conv_to_convxd.py +++ b/hls4ml/model/optimizer/passes/conv_to_convxd.py @@ -2,13 +2,9 @@ from hls4ml.model.layers import Constant, Conv, Conv1D, Conv2D from hls4ml.model.optimizer import OptimizerPass -from hls4ml.model.quantizers import QuantNodeQuantizer -from hls4ml.model.types import IntegerPrecisionType # these are attributes to copy _base_attributes = ( - 'Trace', - 'reuse_factor', 'in_width', 'out_width', 'n_chan', @@ -25,7 +21,6 @@ 'filt_height', 'stride_height', 'dilation_height', - 'strategy', 'data_format', ) @@ -69,16 +64,19 @@ def transform(self, model, node): if bias_node: attributes['bias_data'] = bias_node.attributes['value'] attributes['bias_quantizer'] = bias_node.get_attr('quantizer') - attributes['have_bias'] = True + attributes['use_bias'] = True else: attributes['bias_data'] = np.zeros(attributes['n_filt']) - attributes['bias_quantizer'] = QuantNodeQuantizer(IntegerPrecisionType(1, False)) - attributes['have_bias'] = False + attributes['use_bias'] = False + + # get the configuration name + config = model.config.get_layer_config(node) + new_name = f'{newtype.__name__}_{node.name}' + model.config.set_name_config(new_name, config) + model.config.parse_name_config(new_name, config) # making new node - new_node = model.make_node( - newtype, f'{newtype.__name__}_{node.name}', attributes, [node.inputs[0]], [x for x in node.outputs] - ) + new_node = model.make_node(newtype, new_name, attributes, [node.inputs[0]], [x for x in node.outputs]) # removing and replacing old nodes if bias_node: diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index d1629a7a13..4de58a18c2 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -12,8 +12,18 @@ class InferPrecisionTypes(ConfigurableOptimizerPass): def __init__(self): # The option, infer_no_bias, allows you to tailor for the given weights, in particular, zero bias self.infer_no_bias = False + self.count = 0 + self.MAX_COUNT = 1000 def match(self, node): + input_var = node.get_input_variable() + if input_var is not None and isinstance(input_var.type, UnspecifiedPrecisionType): + # need to wait for the input to update + # 
but check for infinite loops + self.count += 1 + if self.count == self.MAX_COUNT: + raise RuntimeError("There is an infinite loop in the precision inference.") + return False for layer_type in node.types.values(): if isinstance(layer_type.precision, UnspecifiedPrecisionType): return True @@ -30,7 +40,7 @@ def transform(self, model, node): if type_name not in inferred_types: self._infer_default_type(node, type_name) - return False # No model graph changes made + return True # May need to rerun def _infer_precision(self, node, types_to_infer): node_class = node.class_name diff --git a/hls4ml/model/optimizer/passes/linear.py b/hls4ml/model/optimizer/passes/linear.py index 78a808b9a1..1b8e3d9686 100644 --- a/hls4ml/model/optimizer/passes/linear.py +++ b/hls4ml/model/optimizer/passes/linear.py @@ -15,6 +15,9 @@ def transform(self, model, node): return True +_safe_parents = (Dense, Conv1D, Conv2D, BatchNormalization, Activation) + + class MergeLinearActivation(OptimizerPass): ''' For many objects it's safe to change the output precision independently of the calculation. @@ -26,7 +29,7 @@ def match(self, node): ''' if isinstance(node, Activation) and node.get_attr('activation') == 'linear': parent = node.get_input_node(node.inputs[0]) - safe_parent = isinstance(parent, (Dense, Conv1D, Conv2D, BatchNormalization)) + safe_parent = isinstance(parent, _safe_parents) return safe_parent and isinstance(parent.get_output_variable().type.precision, UnspecifiedPrecisionType) else: return False @@ -35,6 +38,7 @@ def transform(self, model, node): prev_node = node.get_input_node(node.inputs[0]) quantizer = node.get_attr("quantizer") prev_node.set_attr("quantizer", quantizer) - prev_node.update_output_precision(quantizer.hls_type) + prev_node.types['result_t'] = quantizer.hls_type + prev_node.get_output_variable().type.precision = quantizer.hls_type model.remove_node(node) return True diff --git a/hls4ml/model/optimizer/passes/matmul_const_to_dense.py b/hls4ml/model/optimizer/passes/matmul_const_to_dense.py index 889a376cee..4c48944eb3 100644 --- a/hls4ml/model/optimizer/passes/matmul_const_to_dense.py +++ b/hls4ml/model/optimizer/passes/matmul_const_to_dense.py @@ -2,10 +2,6 @@ from hls4ml.model.layers import Constant, Dense, MatMul from hls4ml.model.optimizer import OptimizerPass -from hls4ml.model.quantizers import QuantNodeQuantizer -from hls4ml.model.types import IntegerPrecisionType - -_base_attributes = ('Trace', 'reuse_factor', 'weight', 'weight_t', 'bias', 'bias_t') class MatmulConstToDense(OptimizerPass): @@ -30,27 +26,29 @@ def transform(self, model, node): weight_data = const_node.attributes['value'] weight_quantizer = const_node.get_attr('quantizer') + # get the configuration name + config = model.config.get_layer_config(node) + new_name = f'Dense_{node.name}' + model.config.set_name_config(new_name, config) + model.config.parse_name_config(new_name, config) + in_shape = other_var.shape n_in = np.prod(in_shape) out_shape = list(in_shape[:-1]) + [weight_data.shape[-1]] n_out = np.prod(out_shape) # creating the attributes - attributes = {k: node.attributes.get(k, None) for k in _base_attributes} - attributes.update( - { - 'weight_data': weight_data, - 'weight_quantizer': weight_quantizer, - 'bias_data': np.zeros(out_shape), - 'bias_quantizer': QuantNodeQuantizer(IntegerPrecisionType(1, False)), - 'have_bias': False, - 'n_in': n_in, - 'n_out': n_out, - } - ) + attributes = { + 'weight_data': weight_data, + 'weight_quantizer': weight_quantizer, + 'bias_data': np.zeros(out_shape), + 'use_bias': False, 
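            # Hedged reading of this hunk: the new Dense node still carries an
            # all-zeros bias_data so the backend has a bias tensor to emit, while
            # 'use_bias': False replaces the earlier pattern of tagging that dummy
            # bias with a 1-bit unsigned quantizer.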
+ 'n_in': n_in, + 'n_out': n_out, + } # making new node - new_dense = model.make_node(Dense, f'Dense_{node.name}', attributes, [node.inputs[0]], [x for x in node.outputs]) + new_dense = model.make_node(Dense, new_name, attributes, [node.inputs[0]], [x for x in node.outputs]) # removing and replacing old nodes model.remove_node(const_node, rewire=False) diff --git a/hls4ml/model/optimizer/passes/merge_const.py b/hls4ml/model/optimizer/passes/merge_const.py index 54f275d9ec..78591d203c 100644 --- a/hls4ml/model/optimizer/passes/merge_const.py +++ b/hls4ml/model/optimizer/passes/merge_const.py @@ -5,8 +5,6 @@ from hls4ml.model.quantizers import QuantNodeQuantizer from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType -_base_attributes = ('Trace', 'reuse_factor', 'n_in') - # This should generally not happen because of qonnx cleaning class MergeTwoConstants(OptimizerPass): @@ -56,12 +54,10 @@ def transform(self, model, node): const_node0.set_attr('quantizer', quantizer) # overwrite the quantizer if quantizer: const_node0.set_attr('quantizer', quantizer) - + const_node0.types['result_t'] = quantizer.hls_type + const_node0.get_output_variable().type.precision = quantizer.hls_type const_node0.set_attr('value', new_val) - # reinitialize (which also runs quantization if quantizer exists) - const_node0.initialize() - model.remove_node(const_node1, rewire=False) # remove the batch norm node @@ -151,23 +147,26 @@ def transform(self, model, node): if bias.shape != tuple(input_shape) and np.squeeze(bias).shape != tuple(input_shape): bias = np.broadcast_to(bias, input_shape) - attributes = {k: node.attributes.get(k, None) for k in _base_attributes} - attributes.update( - { - 'scale_data': scale, - 'bias_data': bias, - 'n_in': n_in, - 'n_out': n_in, - 'n_filt': -1, - 'scale_precision': scale_precision, - 'scale_quantizer': scale_quantizer, - 'bias_precision': bias_precision, - 'bias_quantizer': bias_quantizer, - } - ) + attributes = { + 'scale_data': scale, + 'bias_data': bias, + 'n_in': n_in, + 'n_out': n_in, + 'n_filt': -1, + 'scale_precision': scale_precision, + 'scale_quantizer': scale_quantizer, + 'bias_precision': bias_precision, + 'bias_quantizer': bias_quantizer, + } + + # get the configuration name + config = model.config.get_layer_config(node) + new_name = f'bn_{node.name}' + model.config.set_name_config(new_name, config) + model.config.parse_name_config(new_name, config) aa_layer = model.make_node( - ApplyAlpha, f'bn_{node.name}', attributes, [node.inputs[input_node_idx]], [x for x in node.outputs] + ApplyAlpha, new_name, attributes, [node.inputs[input_node_idx]], [x for x in node.outputs] ) model.remove_node(const_node, rewire=False) @@ -222,20 +221,23 @@ def transform(self, model, node): if bias.shape != tuple(input_shape) and np.squeeze(bias).shape != tuple(input_shape): bias = np.broadcast_to(bias, input_shape) - attributes = {k: node.attributes.get(k, None) for k in _base_attributes} - attributes.update( - { - 'scale_data': scale, - 'bias_data': bias, - 'scale_quantizer': scale_quantizer, - 'bias_precision': bias_precision, - 'n_in': n_in, - 'n_out': n_in, - 'n_filt': -1, - } - ) - - bn_layer = model.make_node(ApplyAlpha, f'bn_{node.name}', attributes, [node.inputs[0]], [x for x in node.outputs]) + attributes = { + 'scale_data': scale, + 'bias_data': bias, + 'scale_quantizer': scale_quantizer, + 'bias_precision': bias_precision, + 'n_in': n_in, + 'n_out': n_in, + 'n_filt': -1, + } + + # get the configuration name + config = model.config.get_layer_config(node) + new_name = 
f'bn_{node.name}' + model.config.set_name_config(new_name, config) + model.config.parse_name_config(new_name, config) + + bn_layer = model.make_node(ApplyAlpha, new_name, attributes, [node.inputs[0]], [x for x in node.outputs]) model.remove_node(const_node, rewire=False) del node.inputs[1] diff --git a/hls4ml/model/optimizer/passes/quant_opt.py b/hls4ml/model/optimizer/passes/quant_opt.py index 0d02124bc6..ed7f9701a2 100644 --- a/hls4ml/model/optimizer/passes/quant_opt.py +++ b/hls4ml/model/optimizer/passes/quant_opt.py @@ -14,6 +14,7 @@ """ +import copy import math # prefer to use math.ceil for scalar values import numpy as np @@ -25,8 +26,6 @@ _ALSO_MATCH_PO2 = True -_base_attributes = ('Trace', 'reuse_factor') - class QuantConstantParameters(OptimizerPass): """Remove Constant from the Qaunt node parameters (but not input[0])""" @@ -131,11 +130,17 @@ def transform(self, model, node): precision, quantizer = _calculate_precision_quantizer(bitwidth, integer, signed, narrow, rounding_mode) - attributes = {k: node.attributes.get(k, None) for k in _base_attributes} - attributes.update({'activation': 'linear', 'quantizer': quantizer}) + attributes = {'activation': 'linear', 'quantizer': quantizer} + + # update the configuration + config = model.config.get_layer_config(node) + prec_config = config.setdefault('Precision', {}) + prec_config['result'] = str(precision) + new_name = f'{node.name}_act' + model.config.set_name_config(new_name, config) + model.config.parse_name_config(new_name, config) - new_node = model.make_node(Activation, f'{node.name}_act', attributes, [node.inputs[0]], [x for x in node.outputs]) - new_node.get_output_variable().type.precision = precision + new_node = model.make_node(Activation, new_name, attributes, [node.inputs[0]], [x for x in node.outputs]) model.replace_node(node, new_node) return True @@ -189,8 +194,11 @@ def transform(self, model, node): const_node = node.get_input_node(node.inputs[0]) const_node.set_attr('quantizer', quantizer) + const_node.set_attr('result_t', precision) const_node.get_output_variable().type.precision = precision + # Should we update the configuration to reflect the new precision? 
I don't think it's necessary + # remove the Quant node model.remove_node(node, rewire=True) @@ -228,11 +236,18 @@ def transform(self, model, node): precision, quantizer = _calculate_precision_quantizer(bitwidth, bitwidth, signed, narrow, rounding_mode) - attributes = {k: node.attributes.get(k, None) for k in _base_attributes} - attributes.update({'activation': 'linear', 'quantizer': quantizer}) + activation_attributes = {'activation': 'linear', 'quantizer': quantizer} + + # update the configuration + config = model.config.get_layer_config(node) + act_config = copy.deepcopy(config) + prec_config = act_config.setdefault('Precision', {}) + prec_config['result'] = str(precision) + act_name = f'{node.name}_act' + model.config.set_name_config(act_name, act_config) + model.config.parse_name_config(act_name, act_config) - new_node = model.make_node(Activation, f'{node.name}_act', attributes, [node.inputs[0]], [x for x in node.outputs]) - new_node.get_output_variable().type.precision = precision + new_node = model.make_node(Activation, act_name, activation_attributes, [node.inputs[0]], [x for x in node.outputs]) model.replace_node(node, new_node) # but now add the ApplyAlhpas before and after @@ -240,16 +255,25 @@ def transform(self, model, node): scale = node.get_attr('scale') bias = node.get_attr('zeropt') - attributes_scale = {k: node.attributes.get(k, None) for k in _base_attributes} + attributes_scale = {} + attributes_rescale = {} - attributes_rescale = {k: node.attributes.get(k, None) for k in _base_attributes} + scale_config = copy.deepcopy(config) + scale_name = f'{node.name}_scale' + model.config.set_name_config(scale_name, scale_config) + model.config.parse_name_config(scale_name, scale_config) + + rescale_config = config # no need to deep copy the last + rescale_name = f'{node.name}_rescale' + model.config.set_name_config(rescale_name, rescale_config) + model.config.parse_name_config(rescale_name, rescale_config) firstscale = 1 / scale firstbias = bias attributes_scale['scale_data'] = firstscale attributes_scale['bias_data'] = firstbias - scale_node = model.make_node(ApplyAlpha, node.name + '_scale', attributes_scale, [node.inputs[0]]) + scale_node = model.make_node(ApplyAlpha, scale_name, attributes_scale, [node.inputs[0]]) model.insert_node(scale_node) rescale = scale @@ -257,7 +281,7 @@ def transform(self, model, node): attributes_rescale['scale_data'] = rescale attributes_rescale['bias_data'] = rebias - rescale_node = model.make_node(ApplyAlpha, node.name + '_rescale', attributes_rescale, [new_node.outputs[0]]) + rescale_node = model.make_node(ApplyAlpha, rescale_name, attributes_rescale, [new_node.outputs[0]]) model.insert_node(rescale_node) return True @@ -305,10 +329,15 @@ def transform(self, model, node): const_node.set_attr('value', new_val) const_node.set_attr('quantizer', quantizer) - # reinitialize (which also runs quantization if quantizer exists) - const_node.initialize() + const_node.types['result_t'].precision = precision + const_node.get_output_variable().type.precision = precision + + attributes_rescale = {} - attributes_rescale = {k: node.attributes.get(k, None) for k in _base_attributes} + rescale_config = copy.deepcopy(model.config.get_layer_config(node)) + rescale_name = f'{node.name}_rescale' + model.config.set_name_config(rescale_name, rescale_config) + model.config.parse_name_config(rescale_name, rescale_config) rescale = scale rebias = -bias * scale @@ -316,7 +345,7 @@ def transform(self, model, node): attributes_rescale['bias_data'] = rebias rescale_node = 
model.make_node( - ApplyAlpha, node.name + '_rescale', attributes_rescale, [x for x in node.inputs], [x for x in node.outputs] + ApplyAlpha, rescale_name, attributes_rescale, [x for x in node.inputs], [x for x in node.outputs] ) model.replace_node(node, rescale_node) diff --git a/hls4ml/model/types.py b/hls4ml/model/types.py index 05617ba124..9fb257a1ef 100644 --- a/hls4ml/model/types.py +++ b/hls4ml/model/types.py @@ -115,26 +115,6 @@ def saturation_mode(self): def saturation_bits(self): return 0 - @property - def integer(self): - return self.width - - @property - def fractional(self): - return 0 - - @property - def rounding_mode(self): - return RoundingMode.TRN - - @property - def saturation_mode(self): - return SaturationMode.WRAP - - @property - def saturation_bits(self): - return None - class FixedPrecisionType(PrecisionType): """Arbitrary precision fixed-point data type. @@ -159,10 +139,6 @@ def __init__(self, width=16, integer=6, signed=True, rounding_mode=None, saturat # make this a property to avoid inconsistencies - @property - def fractional(self): - return self.width - self.integer - @property def fractional(self): return self.width - self.integer @@ -231,6 +207,7 @@ def __init__(self): super().__init__(width=1, signed=False) self.integer = 1 + # TODO: this should really be a specific type def __str__(self): typestring = 'uint<1>' return typestring @@ -245,6 +222,7 @@ class ExponentPrecisionType(PrecisionType): def __init__(self, width=16, signed=True): super().__init__(width=width, signed=signed) + # TODO: this should really be a specific type, not int def __str__(self): typestring = '{signed}int<{width}>'.format(signed='u' if not self.signed else '', width=self.width) return typestring From 25849ef435731679961b5ba7068abbede1d9f02a Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Sun, 10 Mar 2024 15:30:36 -0500 Subject: [PATCH 057/272] Fix bug in FuseBatchNormalization --- hls4ml/model/optimizer/passes/bn_fuse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hls4ml/model/optimizer/passes/bn_fuse.py b/hls4ml/model/optimizer/passes/bn_fuse.py index c84430f13f..b3e8e454c8 100644 --- a/hls4ml/model/optimizer/passes/bn_fuse.py +++ b/hls4ml/model/optimizer/passes/bn_fuse.py @@ -89,8 +89,8 @@ def transform(self, model, node): node.set_attr('bias_quantizer', b_quantizer) # call function so that quantizer would be called if needed - node.add_weights_variable(name='weight', var_name='w{index}', data=fused_weight, quantizer=w_quantizer) - node.add_weights_variable(name='bias', var_name='b{index}', data=fused_bias, quantizer=b_quantizer) + parent_node.add_weights_variable(name='weight', var_name='w{index}', data=fused_weight, quantizer=w_quantizer) + parent_node.add_weights_variable(name='bias', var_name='b{index}', data=fused_bias, quantizer=b_quantizer) model.remove_node(node, rewire=True) From 4485bf3154ed5f4fbdabea1888b122ba84d2df80 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Sun, 10 Mar 2024 20:23:58 -0500 Subject: [PATCH 058/272] fix issue with configuration setup of test --- test/pytest/test_qonnx.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/test/pytest/test_qonnx.py b/test/pytest/test_qonnx.py index 529a5adebc..426df8f2e0 100644 --- a/test/pytest/test_qonnx.py +++ b/test/pytest/test_qonnx.py @@ -93,10 +93,7 @@ def test_tfc_2w2a(tfc_2w2a_model, backend): y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] # Convert QONNX model, compile, and run inference - config = 
hls4ml.utils.config_from_onnx_model(model, backend=backend) - # Some hand-derived config - config['LayerName'] = {} - config['LayerName']['global_in'] = {'Precision': 'ap_fixed<16,2>'} + config = hls4ml.utils.config_from_onnx_model(model, backend=backend, default_precision='fixed<32,16>') hls_model = hls4ml.converters.convert_from_onnx_model( model, output_dir=str(test_root_path / f'hls4mlprj_qonnx_tfc-2w2a_{backend}'), backend=backend, hls_config=config ) From 52067c32e5de07af94322815b3fd02db5f8a5efa Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Sun, 10 Mar 2024 22:25:15 -0500 Subject: [PATCH 059/272] fix bug in FuseConsecutiveBatchNormalization --- hls4ml/model/optimizer/passes/batchnorm_opt.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hls4ml/model/optimizer/passes/batchnorm_opt.py b/hls4ml/model/optimizer/passes/batchnorm_opt.py index 1800b33056..26292d7e2a 100644 --- a/hls4ml/model/optimizer/passes/batchnorm_opt.py +++ b/hls4ml/model/optimizer/passes/batchnorm_opt.py @@ -197,10 +197,10 @@ def transform(self, model, node): # only merge if the types are integer or fixed if ( - not isinstance(prev_node.weights['scale'].type, (IntegerPrecisionType, FixedPrecisionType)) - or not isinstance(prev_node.weights['bias'].type, (IntegerPrecisionType, FixedPrecisionType)) - or not isinstance(node.weights['scale'].type, (IntegerPrecisionType, FixedPrecisionType)) - or not isinstance(node.weights['bias'].type, (IntegerPrecisionType, FixedPrecisionType)) + not isinstance(prev_node.weights['scale'].type.precision, (IntegerPrecisionType, FixedPrecisionType)) + or not isinstance(prev_node.weights['bias'].type.precision, (IntegerPrecisionType, FixedPrecisionType)) + or not isinstance(node.weights['scale'].type.precision, (IntegerPrecisionType, FixedPrecisionType)) + or not isinstance(node.weights['bias'].type.precision, (IntegerPrecisionType, FixedPrecisionType)) ): return False From 24d6245660d2b601301dc800e4401a5098b39c2a Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Mon, 11 Mar 2024 18:54:41 -0500 Subject: [PATCH 060/272] add missing header --- .../quartus/firmware/nnet_utils/nnet_conv2d_resource.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv2d_resource.h index 73ad45592f..f5ce781739 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv2d_resource.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv2d_resource.h @@ -1,6 +1,8 @@ #ifndef NNET_CONV2D_RESOURCE_H_ #define NNET_CONV2D_RESOURCE_H_ +#include + #include "nnet_common.h" #include "nnet_dense.h" #include "nnet_helpers.h" From 835af4e0a2c1ce403c74342f873fe727f01d99c0 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Mon, 11 Mar 2024 18:55:41 -0500 Subject: [PATCH 061/272] attempt to make qonnx tests match better --- test/pytest/test_qonnx.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/test/pytest/test_qonnx.py b/test/pytest/test_qonnx.py index 426df8f2e0..b955608b88 100644 --- a/test/pytest/test_qonnx.py +++ b/test/pytest/test_qonnx.py @@ -88,7 +88,9 @@ def test_tfc_2w2a(tfc_2w2a_model, backend): model = tfc_2w2a_model ishape = (1, 1, 28, 28) - X = np.random.uniform(low=-1, high=+1, size=np.prod(ishape)).reshape(ishape).astype(np.float32) + X = np.random.uniform(low=-1, high=+1, size=np.prod(ishape)).reshape(ishape) + X = (np.round(X * 2**16) * 2**-16).astype(np.float32) + idict = 
{model.graph.input[0].name: X}

     y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name]

@@ -105,15 +107,20 @@ def test_tfc_2w2a(tfc_2w2a_model, backend):
 
 @pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus'])
 def test_cnv_2w2a(cnv_2w2a_model, backend):
+    """
+    This tests a convolution model. Note: the batch normalization weights are not quantized, so it
+    is difficult to make this match perfectly. It is also a slow test.
+    """
     model = cnv_2w2a_model
 
     ishape = (1, 32, 32, 3)
-    X = np.random.uniform(low=-1, high=+1, size=np.prod(ishape)).reshape(ishape).astype(np.float32)
+    X = np.random.uniform(low=-1, high=+1, size=np.prod(ishape)).reshape(ishape)
+    X = (np.round(X * 2**6) * 2**-6).astype(np.float32)
     idict = {model.graph.input[0].name: X}
     y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name]
 
     # Convert QONNX model, compile, and run inference
-    config = hls4ml.utils.config_from_onnx_model(model, backend=backend, default_precision='fixed<32,16>')
+    config = hls4ml.utils.config_from_onnx_model(model, backend=backend, default_precision='fixed<32,6>')
     hls_model = hls4ml.converters.convert_from_onnx_model(
         model,
         output_dir=str(test_root_path / f'hls4mlprj_qonnx_cnv-2w2a_{backend}'),
@@ -134,12 +141,13 @@ def test_jet_tagging(jettagging_model, backend):
     # Execute QONNX model inference
     # TODO make the test bigger
     ishape = (1, 16)
-    X = np.random.uniform(low=-1, high=+1, size=np.prod(ishape)).reshape(ishape).astype(np.float32)
+    X = np.random.uniform(low=-1, high=+1, size=np.prod(ishape)).reshape(ishape)
+    X = (np.round(X * 2**16) * 2**-16).astype(np.float32)
     idict = {model.graph.input[0].name: X}
     y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name]
 
     # Convert QONNX model, compile, and run inference
-    config = hls4ml.utils.config_from_onnx_model(model, backend=backend)
+    config = hls4ml.utils.config_from_onnx_model(model, backend=backend, default_precision='fixed<32,16>')
     hls_model = hls4ml.converters.convert_from_onnx_model(
         model, output_dir=str(test_root_path / f'hls4mlprj_qonnx_jettag_{backend}'), backend=backend, hls_config=config

From 2bcec04b12ad10dcb689536cada563caafea5faf Mon Sep 17 00:00:00 2001
From: Jovan Mitrevski
Date: Tue, 12 Mar 2024 09:08:35 -0500
Subject: [PATCH 062/272] fix pre-commit

---
 hls4ml/model/optimizer/passes/move_scales.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hls4ml/model/optimizer/passes/move_scales.py b/hls4ml/model/optimizer/passes/move_scales.py
index fe1acb7f94..cec69af5e8 100644
--- a/hls4ml/model/optimizer/passes/move_scales.py
+++ b/hls4ml/model/optimizer/passes/move_scales.py
@@ -4,6 +4,7 @@
 
 TODO: Check that biases are properly handled. 
(Attempt to do it via Merge) ''' + import numpy as np from hls4ml.model.layers import ApplyAlpha, Constant, Conv, MatMul, Merge From 01c436be0b724336edf9d5b2dee125c43591cfd9 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 28 Mar 2024 21:47:00 +0100 Subject: [PATCH 063/272] Add ability to configure writer in Vitis/Vivado, to optionally write namespace, tar file and weight txt files --- hls4ml/backends/vivado/vivado_backend.py | 30 ++- hls4ml/model/graph.py | 12 ++ hls4ml/templates/vivado/firmware/defines.h | 4 + .../templates/vivado/firmware/myproject.cpp | 12 +- hls4ml/templates/vivado/firmware/myproject.h | 4 + hls4ml/templates/vivado/firmware/parameters.h | 4 + hls4ml/templates/vivado/myproject_bridge.cpp | 3 + hls4ml/templates/vivado/myproject_test.cpp | 2 + hls4ml/writer/vitis_writer.py | 1 - hls4ml/writer/vivado_writer.py | 194 ++++++++++++++---- test/pytest/test_writer_config.py | 71 +++++++ 11 files changed, 292 insertions(+), 45 deletions(-) create mode 100644 test/pytest/test_writer_config.py diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 0c056a0c5c..22e31e5ec6 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -175,13 +175,41 @@ def get_default_flow(self): def get_writer_flow(self): return self._writer_flow - def create_initial_config(self, part='xcvu13p-flga2577-2-e', clock_period=5, io_type='io_parallel'): + def create_initial_config( + self, + part='xcvu13p-flga2577-2-e', + clock_period=5, + io_type='io_parallel', + namespace=None, + write_weights_txt=False, + write_tar=True, + ): + """Create initial configuration of the Vivado backend. + + Args: + part (str, optional): The FPGA part to be used. Defaults to 'xcvu13p-flga2577-2-e'. + clock_period (int, optional): The clock period. Defaults to 5. + io_type (str, optional): Type of implementation used. One of + 'io_parallel' or 'io_stream'. Defaults to 'io_parallel'. + namespace (str, optional): If defined, place all generated code within a namespace. Defaults to None. + write_weights_txt (bool, optional): If True, writes weights to .txt files which speeds up compilation. + Defaults to False. + write_tar (bool, optional): If True, compresses the output directory into a .tar.gz file. Defaults to True. + + Returns: + dict: initial configuration. 
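
A short sketch of the call and of the section it now adds to the returned dict; 'mynet' is an illustrative namespace:

    from hls4ml.backends import get_backend

    backend = get_backend('Vivado')
    config = backend.create_initial_config(
        clock_period=5,
        io_type='io_stream',
        namespace='mynet',
        write_weights_txt=False,
        write_tar=True,
    )
    assert config['WriterConfig'] == {
        'Namespace': 'mynet',
        'WriteWeightsTxt': False,
        'WriteTar': True,
    }
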
+ """ config = {} config['Part'] = part if part is not None else 'xcvu13p-flga2577-2-e' config['ClockPeriod'] = clock_period config['IOType'] = io_type config['HLSConfig'] = {} + config['WriterConfig'] = { + 'Namespace': namespace, + 'WriteWeightsTxt': write_weights_txt, + 'WriteTar': write_tar, + } return config diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index a6b5c29e89..4368df2f07 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -51,6 +51,15 @@ def __init__(self, config): self.pipeline_style = 'pipeline' + if 'WriterConfig' in self.config: + self.writer_config = self.config['WriterConfig'] + else: + self.writer_config = { + 'Namespace': None, + 'WriteWeightsTxt': True, + 'WriteTar': False, + } + self._parse_hls_config() self._validate_hls_config() @@ -177,6 +186,9 @@ def get_compression(self, layer): return compression + def get_writer_config(self): + return self.writer_config + def _parse_hls_config(self): hls_config = self.config['HLSConfig'] diff --git a/hls4ml/templates/vivado/firmware/defines.h b/hls4ml/templates/vivado/firmware/defines.h index 1f11b02095..ce7042c809 100644 --- a/hls4ml/templates/vivado/firmware/defines.h +++ b/hls4ml/templates/vivado/firmware/defines.h @@ -9,6 +9,10 @@ // hls-fpga-machine-learning insert numbers +// hls-fpga-machine-learning insert namespace-start + // hls-fpga-machine-learning insert layer-precision +// hls-fpga-machine-learning insert namespace-end + #endif diff --git a/hls4ml/templates/vivado/firmware/myproject.cpp b/hls4ml/templates/vivado/firmware/myproject.cpp index 133c62ceb1..5ba7f118ba 100644 --- a/hls4ml/templates/vivado/firmware/myproject.cpp +++ b/hls4ml/templates/vivado/firmware/myproject.cpp @@ -3,19 +3,15 @@ #include "myproject.h" #include "parameters.h" +// hls-fpga-machine-learning insert namespace-start + void myproject( // hls-fpga-machine-learning insert header ) { // hls-fpga-machine-learning insert IO -#ifndef __SYNTHESIS__ - static bool loaded_weights = false; - if (!loaded_weights) { - // hls-fpga-machine-learning insert load weights - loaded_weights = true; - } -#endif + // hls-fpga-machine-learning insert load weights // **************************************** // NETWORK INSTANTIATION @@ -23,3 +19,5 @@ void myproject( // hls-fpga-machine-learning insert layers } + +// hls-fpga-machine-learning insert namespace-end diff --git a/hls4ml/templates/vivado/firmware/myproject.h b/hls4ml/templates/vivado/firmware/myproject.h index 4900e9754c..5b34ae4c02 100644 --- a/hls4ml/templates/vivado/firmware/myproject.h +++ b/hls4ml/templates/vivado/firmware/myproject.h @@ -7,9 +7,13 @@ #include "defines.h" +// hls-fpga-machine-learning insert namespace-start + // Prototype of top level function for C-synthesis void myproject( // hls-fpga-machine-learning insert header ); +// hls-fpga-machine-learning insert namespace-end + #endif diff --git a/hls4ml/templates/vivado/firmware/parameters.h b/hls4ml/templates/vivado/firmware/parameters.h index 2d9ddedb3e..614020ddea 100644 --- a/hls4ml/templates/vivado/firmware/parameters.h +++ b/hls4ml/templates/vivado/firmware/parameters.h @@ -10,6 +10,10 @@ // hls-fpga-machine-learning insert weights +// hls-fpga-machine-learning insert namespace-start + // hls-fpga-machine-learning insert layer-config +// hls-fpga-machine-learning insert namespace-end + #endif diff --git a/hls4ml/templates/vivado/myproject_bridge.cpp b/hls4ml/templates/vivado/myproject_bridge.cpp index 35c1997f62..b1822a5ff6 100644 --- a/hls4ml/templates/vivado/myproject_bridge.cpp +++ 
b/hls4ml/templates/vivado/myproject_bridge.cpp @@ -52,6 +52,7 @@ void collect_trace_output(struct trace_data *c_trace_outputs) { void myproject_float( // hls-fpga-machine-learning insert header #float ) { + // hls-fpga-machine-learning insert namespace // hls-fpga-machine-learning insert wrapper #float } @@ -59,6 +60,8 @@ void myproject_float( void myproject_double( // hls-fpga-machine-learning insert header #double ) { + // hls-fpga-machine-learning insert namespace + // hls-fpga-machine-learning insert wrapper #double } } diff --git a/hls4ml/templates/vivado/myproject_test.cpp b/hls4ml/templates/vivado/myproject_test.cpp index 2fd9747cae..814bb1f3e6 100644 --- a/hls4ml/templates/vivado/myproject_test.cpp +++ b/hls4ml/templates/vivado/myproject_test.cpp @@ -21,6 +21,8 @@ size_t trace_type_size = sizeof(double); } // namespace nnet int main(int argc, char **argv) { + // hls-fpga-machine-learning insert namespace + // load input data from text file std::ifstream fin("tb_data/tb_input_features.dat"); // load predictions from text file diff --git a/hls4ml/writer/vitis_writer.py b/hls4ml/writer/vitis_writer.py index cfb4c0cb27..a2cca7f414 100644 --- a/hls4ml/writer/vitis_writer.py +++ b/hls4ml/writer/vitis_writer.py @@ -30,5 +30,4 @@ def write_hls(self, model): """ super().write_hls(model) self.write_nnet_utils_overrides(model) - os.remove(model.config.get_output_dir() + '.tar.gz') self.write_tar(model) diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 412bb8d667..20928536f1 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -13,36 +13,40 @@ class VivadoWriter(Writer): - def print_array_to_cpp(self, var, odir, write_txt_file=True): + def print_array_to_cpp(self, var, odir, namespace=None, write_txt_file=True): """Write a weights array to C++ header files. Args: var (WeightVariable): Weight to write odir (str): Output directory + namespace (str, optional): Writes a namespace for the weights to avoid clashes with global variables. write_txt_file (bool, optional): Write txt files in addition to .h files. Defaults to True. """ - h_file = open(f"{odir}/firmware/weights/{var.name}.h", "w") + h_file = open(f'{odir}/firmware/weights/{var.name}.h', 'w') if write_txt_file: - txt_file = open(f"{odir}/firmware/weights/{var.name}.txt", "w") + txt_file = open(f'{odir}/firmware/weights/{var.name}.txt', 'w') # meta data - h_file.write(f"//Numpy array shape {var.shape}\n") - h_file.write(f"//Min {np.min(var.min):.12f}\n") - h_file.write(f"//Max {np.max(var.max):.12f}\n") - h_file.write(f"//Number of zeros {var.nzeros}\n") - h_file.write("\n") + h_file.write(f'//Numpy array shape {var.shape}\n') + h_file.write(f'//Min {np.min(var.min):.12f}\n') + h_file.write(f'//Max {np.max(var.max):.12f}\n') + h_file.write(f'//Number of zeros {var.nzeros}\n') + h_file.write('\n') - h_file.write(f"#ifndef {var.name.upper()}_H_\n") - h_file.write(f"#define {var.name.upper()}_H_\n") - h_file.write("\n") + h_file.write(f'#ifndef {var.name.upper()}_H_\n') + h_file.write(f'#define {var.name.upper()}_H_\n') + h_file.write('\n') + + if namespace is not None: + h_file.write(f'namespace {namespace} {{\n\n') if write_txt_file: - h_file.write("#ifndef __SYNTHESIS__\n") - h_file.write(var.definition_cpp() + ";\n") - h_file.write("#else\n") + h_file.write('#ifndef __SYNTHESIS__\n') + h_file.write(var.definition_cpp() + ';\n') + h_file.write('#else\n') - h_file.write(var.definition_cpp() + " = {") + h_file.write(var.definition_cpp() + ' = {') # fill c++ array. 
# not including internal brackets for multidimensional case @@ -51,12 +55,17 @@ def print_array_to_cpp(self, var, odir, write_txt_file=True): h_file.write(sep + x) if write_txt_file: txt_file.write(sep + x) - sep = ", " - h_file.write("};\n") + sep = ', ' + h_file.write('};\n\n') + if write_txt_file: - h_file.write("#endif\n") + h_file.write('#endif\n') txt_file.close() - h_file.write("\n#endif\n") + + if namespace is not None: + h_file.write('}\n\n') + + h_file.write('\n#endif\n') h_file.close() def write_project_dir(self, model): @@ -124,6 +133,7 @@ def write_project_cpp(self, model): # Add headers to weights and biases if 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert header' in line: inputs_str = ', '.join([i.definition_cpp(as_reference=True) for i in model_inputs]) outputs_str = ', '.join([o.definition_cpp(as_reference=True) for o in model_outputs]) @@ -136,22 +146,52 @@ def write_project_cpp(self, model): newline += ',\n' + brams_str newline += '\n' + elif '// hls-fpga-machine-learning insert namespace-start' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += f'namespace {namespace} {{\n' + + elif '// hls-fpga-machine-learning insert namespace-end' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += '}\n' + elif '// hls-fpga-machine-learning insert load weights' in line: newline = line - for layer in model.get_layers(): - for w in layer.get_weights(): - if w.weight_class == 'CompressedWeightVariable': - newline += indent + ' nnet::load_compressed_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( - w.type.name, w.nonzeros, w.name, w.name - ) - elif w.weight_class == 'ExponentWeightVariable': - newline += indent + ' nnet::load_exponent_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( - w.type.name, w.data_length, w.name, w.name - ) - else: - newline += indent + ' nnet::load_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( - w.type.name, w.data_length, w.name, w.name - ) + if model.config.get_writer_config()['WriteWeightsTxt']: + + newline += '#ifndef __SYNTHESIS__\n' + newline += ' static bool loaded_weights = false;\n' + newline += ' if (!loaded_weights) {\n' + + for layer in model.get_layers(): + for w in layer.get_weights(): + if w.weight_class == 'CompressedWeightVariable': + newline += ( + indent + + ' nnet::load_compressed_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.nonzeros, w.name, w.name + ) + ) + elif w.weight_class == 'ExponentWeightVariable': + newline += ( + indent + + ' nnet::load_exponent_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.data_length, w.name, w.name + ) + ) + else: + newline += indent + ' nnet::load_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.data_length, w.name, w.name + ) + + newline += ' loaded_weights = true;' + newline += ' }\n' + newline += '#endif' # Add input/output type elif '// hls-fpga-machine-learning insert IO' in line: @@ -242,8 +282,10 @@ def write_project_header(self, model): for line in f.readlines(): if 'MYPROJECT' in line: newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) + elif 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert header' in line: inputs_str = ', 
'.join([i.definition_cpp(as_reference=True) for i in model_inputs]) outputs_str = ', '.join([o.definition_cpp(as_reference=True) for o in model_outputs]) @@ -255,6 +297,21 @@ def write_project_header(self, model): if len(model_brams) > 0: newline += ',\n' + brams_str newline += '\n' + + elif '// hls-fpga-machine-learning insert namespace-start' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += f'namespace {namespace} {{\n' + + elif '// hls-fpga-machine-learning insert namespace-end' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += '}\n' + else: newline = line fout.write(newline) @@ -300,6 +357,20 @@ def write_defines(self, model): for used_type in all_precision.values(): newline += used_type.definition_cpp() + elif '// hls-fpga-machine-learning insert namespace-start' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += f'namespace {namespace} {{\n' + + elif '// hls-fpga-machine-learning insert namespace-end' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += '}\n' + else: newline = line fout.write(newline) @@ -336,6 +407,21 @@ def write_parameters(self, model): if config: newline += '// ' + layer.name + '\n' newline += config + '\n' + + elif '// hls-fpga-machine-learning insert namespace-start' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += f'namespace {namespace} {{\n' + + elif '// hls-fpga-machine-learning insert namespace-end' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += '}\n' + else: newline = line fout.write(newline) @@ -348,9 +434,13 @@ def write_weights(self, model): Args: model (ModelGraph): the hls4ml model. 
""" + namespace = model.config.get_writer_config().get('Namespace', None) + write_txt = model.config.get_writer_config().get('WriteWeightsTxt', True) for layer in model.get_layers(): for weights in layer.get_weights(): - self.print_array_to_cpp(weights, model.config.get_output_dir()) + self.print_array_to_cpp( + weights, model.config.get_output_dir(), namespace=namespace, write_txt_file=write_txt + ) def __make_dat_file(self, original_path, project_path): """ @@ -420,10 +510,12 @@ def write_test_bench(self, model): # Insert numbers if 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert bram' in line: newline = line for bram in model_brams: newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + elif '// hls-fpga-machine-learning insert data' in line: newline = line offset = 0 @@ -435,6 +527,7 @@ def write_test_bench(self, model): offset += inp.size() for out in model_outputs: newline += ' ' + out.definition_cpp() + ';\n' + elif '// hls-fpga-machine-learning insert zero' in line: newline = line for inp in model_inputs: @@ -442,6 +535,7 @@ def write_test_bench(self, model): newline += f' nnet::fill_zero<{inp.type.name}, {inp.size_cpp()}>({inp.name});\n' for out in model_outputs: newline += ' ' + out.definition_cpp() + ';\n' + elif '// hls-fpga-machine-learning insert top-level-function' in line: newline = line @@ -455,6 +549,7 @@ def write_test_bench(self, model): top_level = indent + f'{model.config.get_project_name()}({all_vars});\n' newline += top_level + elif '// hls-fpga-machine-learning insert predictions' in line: newline = line for out in model_outputs: @@ -462,12 +557,14 @@ def write_test_bench(self, model): newline += indent + ' std::cout << pr[i] << " ";\n' newline += indent + '}\n' newline += indent + 'std::cout << std::endl;\n' + elif '// hls-fpga-machine-learning insert tb-output' in line: newline = line for out in model_outputs: newline += indent + 'nnet::print_result<{}, {}>({}, fout);\n'.format( out.type.name, out.size_cpp(), out.name ) # TODO enable this + elif ( '// hls-fpga-machine-learning insert output' in line or '// hls-fpga-machine-learning insert quantized' in line @@ -477,6 +574,14 @@ def write_test_bench(self, model): newline += indent + 'nnet::print_result<{}, {}>({}, std::cout, true);\n'.format( out.type.name, out.size_cpp(), out.name ) + + elif '// hls-fpga-machine-learning insert namespace' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += indent + f'using namespace {namespace};\n' + else: newline = line fout.write(newline) @@ -503,12 +608,15 @@ def write_bridge(self, model): for line in f.readlines(): if 'MYPROJECT' in line: newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) + elif 'myproject' in line: newline = line.replace('myproject', format(model.config.get_project_name())) + elif '// hls-fpga-machine-learning insert bram' in line: newline = line for bram in model_brams: newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + elif '// hls-fpga-machine-learning insert header' in line: dtype = line.split('#', 1)[1].strip() inputs_str = ', '.join([f'{dtype} {i.name}[{i.size_cpp()}]' for i in model_inputs]) @@ -517,6 +625,7 @@ def write_bridge(self, model): newline = '' newline += indent + inputs_str + ',\n' newline += indent + outputs_str + '\n' + elif '// hls-fpga-machine-learning insert wrapper' in line: dtype = line.split('#', 1)[1].strip() 
newline = '' @@ -548,6 +657,7 @@ def write_bridge(self, model): newline += indent + 'nnet::convert_data<{}, {}, {}>({}_ap, {});\n'.format( o.type.name, dtype, o.size_cpp(), o.name, o.name ) + elif '// hls-fpga-machine-learning insert trace_outputs' in line: newline = '' for layer in model.get_layers(): @@ -561,6 +671,13 @@ def write_bridge(self, model): + f'"{layer.name}", (void *) malloc({var.size_cpp()} * element_size)));\n' ) + elif '// hls-fpga-machine-learning insert namespace' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += indent + f'using namespace {namespace};\n' + else: newline = line fout.write(newline) @@ -707,8 +824,13 @@ def write_tar(self, model): model (ModelGraph): the hls4ml model. """ - with tarfile.open(model.config.get_output_dir() + '.tar.gz', mode='w:gz') as archive: - archive.add(model.config.get_output_dir(), recursive=True) + write_tar = model.config.get_writer_config().get('WriteTar', False) + if write_tar: + tar_path = model.config.get_output_dir() + '.tar.gz' + if os.path.exists(tar_path): + os.remove(tar_path) + with tarfile.open(tar_path, mode='w:gz') as archive: + archive.add(model.config.get_output_dir(), recursive=True) def write_hls(self, model): print('Writing HLS project') diff --git a/test/pytest/test_writer_config.py b/test/pytest/test_writer_config.py new file mode 100644 index 0000000000..0c537d2f63 --- /dev/null +++ b/test/pytest/test_writer_config.py @@ -0,0 +1,71 @@ +import os +import shutil +from pathlib import Path + +import pytest +from tensorflow.keras.layers import Dense +from tensorflow.keras.models import Sequential + +import hls4ml + +test_root_path = Path(__file__).parent + + +@pytest.fixture(scope='module') +def keras_model(): + model = Sequential() + model.add(Dense(10, activation='softmax', input_shape=(15,))) + model.compile() + return model + + +@pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) # No Quartus for now +@pytest.mark.parametrize('namespace', [None, 'test_namespace']) +def test_namespace(keras_model, namespace, io_type, backend): + + use_namespace = namespace is not None + config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name') + odir = str(test_root_path / f'hls4mlprj_namespace_{use_namespace}_{backend}_{io_type}') + hls_model = hls4ml.converters.convert_from_keras_model( + keras_model, hls_config=config, io_type=io_type, output_dir=odir, backend=backend, namespace=namespace + ) + hls_model.compile() # It's enough that the model compiles + + +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) # No Quartus for now +@pytest.mark.parametrize('write_tar', [True, False]) +def test_write_tar(keras_model, write_tar, backend): + + config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name') + odir = str(test_root_path / f'hls4mlprj_write_tar_{write_tar}_{backend}') + + if os.path.exists(odir + '.tar.gz'): + os.remove(odir + '.tar.gz') + + hls_model = hls4ml.converters.convert_from_keras_model( + keras_model, hls_config=config, output_dir=odir, backend=backend, write_tar=write_tar + ) + hls_model.write() + + tar_written = os.path.exists(odir + '.tar.gz') + assert tar_written == write_tar + + +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) # No Quartus for now +@pytest.mark.parametrize('write_weights_txt', [True, False]) +def test_write_weights_txt(keras_model, write_weights_txt, backend): + + config = 
hls4ml.utils.config_from_keras_model(keras_model, granularity='name') + odir = str(test_root_path / f'hls4mlprj_write_weights_txt_{write_weights_txt}_{backend}') + + if os.path.exists(odir): + shutil.rmtree(odir) + + hls_model = hls4ml.converters.convert_from_keras_model( + keras_model, hls_config=config, output_dir=odir, backend=backend, write_weights_txt=write_weights_txt + ) + hls_model.write() + + txt_written = os.path.exists(odir + '/firmware/weights/w2.txt') + assert txt_written == write_weights_txt From b3facd25975ac61b02270d04b60efb1fe3e455de Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 17 Apr 2024 09:59:54 -0500 Subject: [PATCH 064/272] remove count, become more selective on when True is returned --- hls4ml/model/optimizer/passes/infer_precision.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index 4de58a18c2..ee585c42d6 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -12,17 +12,11 @@ class InferPrecisionTypes(ConfigurableOptimizerPass): def __init__(self): # The option, infer_no_bias, allows you to tailor for the given weights, in particular, zero bias self.infer_no_bias = False - self.count = 0 - self.MAX_COUNT = 1000 def match(self, node): input_var = node.get_input_variable() if input_var is not None and isinstance(input_var.type, UnspecifiedPrecisionType): - # need to wait for the input to update - # but check for infinite loops - self.count += 1 - if self.count == self.MAX_COUNT: - raise RuntimeError("There is an infinite loop in the precision inference.") + # only infer types if the input type is known return False for layer_type in node.types.values(): if isinstance(layer_type.precision, UnspecifiedPrecisionType): @@ -40,7 +34,9 @@ def transform(self, model, node): if type_name not in inferred_types: self._infer_default_type(node, type_name) - return True # May need to rerun + # if the return type was set, this may allow InferPrecisionTypes to be run + # on layers it was not previously able to + return 'result_t' in types_to_infer def _infer_precision(self, node, types_to_infer): node_class = node.class_name From 0d8108eaeacc504a213a3795b1d1482cf621c4cf Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 19 Apr 2024 14:59:03 -0500 Subject: [PATCH 065/272] fix optimizer issue when quantizer is None --- hls4ml/model/optimizer/passes/linear.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/hls4ml/model/optimizer/passes/linear.py b/hls4ml/model/optimizer/passes/linear.py index 1b8e3d9686..b1aee7adc7 100644 --- a/hls4ml/model/optimizer/passes/linear.py +++ b/hls4ml/model/optimizer/passes/linear.py @@ -37,8 +37,10 @@ def match(self, node): def transform(self, model, node): prev_node = node.get_input_node(node.inputs[0]) quantizer = node.get_attr("quantizer") - prev_node.set_attr("quantizer", quantizer) - prev_node.types['result_t'] = quantizer.hls_type - prev_node.get_output_variable().type.precision = quantizer.hls_type + # if the activation has a quantizer (usually from a QONNX Quant node), set the previous node's output precision + if quantizer is not None: + prev_node.set_attr("quantizer", quantizer) + prev_node.types['result_t'] = quantizer.hls_type + prev_node.get_output_variable().type.precision = quantizer.hls_type model.remove_node(node) return True From fc2c68a4a69c024a8f56fb1f270383e5acd1bcb9 Mon Sep 17 00:00:00 2001 
From: Jan-Frederik Schulte Date: Fri, 3 May 2024 16:23:14 -0400 Subject: [PATCH 066/272] fix most pytest issues --- .../firmware/nnet_utils/nnet_conv1d_resource.h | 1 + .../firmware/nnet_utils/nnet_conv2d_resource.h | 1 + test/pytest/test_pytorch_api.py | 16 +++++++--------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d_resource.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d_resource.h index a110d6d424..974bb95807 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d_resource.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d_resource.h @@ -3,6 +3,7 @@ #include "nnet_common.h" #include "nnet_dense.h" +#include namespace nnet { diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv2d_resource.h index 73ad45592f..c1c14d53b0 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv2d_resource.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv2d_resource.h @@ -4,6 +4,7 @@ #include "nnet_common.h" #include "nnet_dense.h" #include "nnet_helpers.h" +#include namespace nnet { diff --git a/test/pytest/test_pytorch_api.py b/test/pytest/test_pytorch_api.py index 3ea97a755e..c9d9216c9a 100644 --- a/test/pytest/test_pytorch_api.py +++ b/test/pytest/test_pytorch_api.py @@ -550,7 +550,7 @@ def test_pooling(pooling, padds, backend): # Verify correct parsing of layer hls_pool = list(hls_model.get_layers())[-2] if '2d' in pooling.__name__: - assert hls_pool.attributes['name'] == poolNode.name + assert hls_pool.attributes['name'] == "_" + poolNode.name.split("_")[-1] assert hls_pool.attributes['class_name'][-2] == str(2) assert hls_pool.attributes['stride_height'] == class_object_pool.stride assert hls_pool.attributes['stride_width'] == class_object_pool.stride @@ -560,14 +560,14 @@ def test_pooling(pooling, padds, backend): elif '1d' in pooling.__name__: if "Max" in pooling.__name__: - assert hls_pool.attributes['name'] == poolNode.name + assert hls_pool.attributes['name'] == "_" + poolNode.name.split("_")[-1] assert hls_pool.attributes['class_name'][-2] == str(1) assert hls_pool.attributes['pool_width'] == class_object_pool.kernel_size assert hls_pool.attributes['stride_width'] == class_object_pool.stride assert hls_pool.attributes['padding'] == 'valid' if class_object_pool.padding == 0 else 'same' else: - assert hls_pool.attributes['name'] == poolNode.name + assert hls_pool.attributes['name'] == "_" + poolNode.name.split("_")[-1] assert hls_pool.attributes['class_name'][-2] == str(1) assert hls_pool.attributes['pool_width'] == class_object_pool.kernel_size[0] assert hls_pool.attributes['stride_width'] == class_object_pool.stride[0] @@ -641,7 +641,7 @@ def test_squeeze(backend, io_type): pytorch_prediction = model(torch.Tensor(X_input)).detach().numpy().flatten() config = config_from_pytorch_model(model) - del config['Model']['InputsChannelLast'] # We don't want anything touched for this test + del config['Model']['ChannelsLastConversion'] # We don't want anything touched for this test output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_squeeze_{backend}_{io_type}') hls_model = convert_from_pytorch_model( @@ -719,7 +719,7 @@ def test_skipped_layers(backend, io_type): input_shape = (3, 8) batch_input_shape = (None,) + input_shape config = config_from_pytorch_model( - model, default_precision='ap_fixed<32,16>', inputs_channel_last=True, transpose_outputs=False + model, 
default_precision='ap_fixed<32,16>', channels_last_conversion="full", transpose_outputs=False ) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_skipped_{backend}_{io_type}') hls_model = convert_from_pytorch_model( @@ -734,10 +734,9 @@ def test_skipped_layers(backend, io_type): hls_model.compile() input = torch.randn(10, 3, 8) - hls_input = np.ascontiguousarray(torch.permute(input, (0, 2, 1)).detach().numpy()) # Transpose to channels_last pytorch_prediction = model(input).detach().numpy().flatten() - hls_prediction = hls_model.predict(hls_input).flatten() + hls_prediction = hls_model.predict(input.detach().numpy()).flatten() np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=0, atol=5e-2) @@ -785,8 +784,7 @@ def forward(self, x): config = config_from_pytorch_model( model, default_precision='ap_fixed<32,16>', - inputs_channel_last=False, # Crucial for testing if the first Transpose was removed - transpose_outputs=False, + channels_last_conversion="full", # Crucial for testing if the first Transpose was removed ) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_transpose_nop_{tensor_rank}d_{backend}_{io_type}') hls_model = convert_from_pytorch_model( From 1fa59dcd947c99851a5c5bce4301e3ef52407bdc Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 16 May 2024 11:32:10 -0500 Subject: [PATCH 067/272] update pytest image to 0.5.6 --- test/pytest/ci-template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pytest/ci-template.yml b/test/pytest/ci-template.yml index afaf90da4d..f6aa700415 100644 --- a/test/pytest/ci-template.yml +++ b/test/pytest/ci-template.yml @@ -1,6 +1,6 @@ .pytest: stage: test - image: gitlab-registry.cern.ch/fastmachinelearning/hls4ml-testing:0.5.5.base + image: gitlab-registry.cern.ch/fastmachinelearning/hls4ml-testing:0.5.6.base tags: - k8s-default before_script: From a181d971b38a09aa4bd0d62e303d43f08474ca0f Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 9 Jun 2024 21:35:40 -0700 Subject: [PATCH 068/272] add vitis --- .../templates/vitis/nnet_utils/nnet_conv1d.h | 14 +++- .../vitis/nnet_utils/nnet_conv1d_latency.h | 80 +++++++++++++++++++ 2 files changed, 92 insertions(+), 2 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h index 52a404672c..1b66c646af 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h @@ -55,9 +55,19 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully. 
//#pragma HLS INLINE recursive - // Nothing special to be done for io_parallel implementation  if (CONFIG_T::strategy == nnet::latency) { - conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases); + if (CONFIG_T::implementation == conv_implementation::pointwise) { + // Use pointwise unrolled implementation + if (CONFIG_T::reuse_factor > 1) { + CONFIG_T::template pointwise_conv<data_T, res_T, CONFIG_T>::pointwise_conv(data, res, weights, biases); + } else { + assert(CONFIG_T::reuse_factor == 1); + pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases); + } + } else { + // Use standard unrolled implementation + conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases); + } } else { conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases); } diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h index 1bf25cc89c..3fd6160f4f 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h @@ -85,5 +85,85 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } +template <class data_T, class res_T, typename CONFIG_T> +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 + #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + int multiplier_limit = + ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / + float(CONFIG_T::reuse_factor)); +#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = data[index_data] * weights[index_weight]; + } + } // end channel loop + } // end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result 
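+// The products are accumulated in a second stage, separate from the multiplies
+// buffered in 'mult' above, so that the ALLOCATION pragma constrains only the
+// multiplications while the additions below are scheduled as their own reduction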
+AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + } // end channel loop + } // end filter loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); + } + } +} + } // namespace nnet #endif From 1c8c9ed44286078061b724ebcdc18ccda25d73d8 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Mon, 10 Jun 2024 19:13:15 -0500 Subject: [PATCH 069/272] starting towards being able to split seperable --- hls4ml/backends/fpga/fpga_backend.py | 10 ++++ hls4ml/backends/vivado/vivado_backend.py | 6 --- hls4ml/model/graph.py | 62 ++++++++++++++---------- hls4ml/model/layers.py | 16 ++++++ 4 files changed, 62 insertions(+), 32 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 87309ff4e5..672627e35f 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -79,6 +79,16 @@ def __init__(self, name): attrs.append(ConfigurableAttribute('reuse_factor', default=1)) self.attribute_map[layer] = attrs + # seperable is kind of special because it is effectively two layers that will be split + for layer in (SeparableConv1D, SeparableConv2D): + attrs = self.attribute_map.get(layer, []) + attrs.append(TypeAttribute('depthwise_accum')) + attrs.append(TypeAttribute('pointwise_accum')) + attrs.append(TypeAttribute('depthwise_result')) + attrs.append(ConfigurableAttribute('depthwise_reuse_factor', default=1)) + attrs.append(ConfigurableAttribute('pointwise_reuse_factor', default=1)) + self.attribute_map[layer] = attrs + act_attrs = self.attribute_map.get(Activation, []) act_attrs.append(ConfigurableAttribute('table_size', default=1024)) act_attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 978d9fd54f..4a9568305e 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -76,12 +76,6 @@ def _register_layer_attributes(self): attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer')) self.attribute_map[layer] = attrs - sep_conv_layers = [SeparableConv1D, SeparableConv2D] - for layer in sep_conv_layers: - attrs = self.attribute_map.get(layer, []) - attrs.append(TypeAttribute('dw_output', default=FixedPrecisionType(18, 8))) - self.attribute_map[layer] = attrs - def _register_flows(self): initializers = self._get_layer_initializers() init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 04ec33294d..d1722eaae1 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -100,6 +100,12 @@ def get_layer_config(self, layer): return layer_config + def set_name_config(self, name, config): + """sets hls_config["LayerName"][name] = config""" + hls_config = self.config['HLSConfig'] + layer_config = hls_config.setdefault('LayerName', {}) + layer_config[name] = config + def 
get_precision(self, layer, var='default'): precision = self.layer_name_precision.get(layer.name.lower() + '_' + var) type_name = layer.name.lower() + '_' + var + '_t' @@ -183,6 +189,35 @@ def get_compression(self, layer): return compression + def parse_name_config(self, layer_name, layer_cfg): + """This is used by _parse_hls_config below, but also in optimizers when a new layer config is created""" + precision_cfg = layer_cfg.get('Precision') + if isinstance(precision_cfg, dict): + for var, precision in precision_cfg.items(): + self.layer_name_precision[layer_name.lower() + '_' + var] = precision + else: + self.layer_name_precision[layer_name.lower() + '_default'] = precision_cfg + + rf = layer_cfg.get('ReuseFactor') + if rf is not None: + self.layer_name_rf[layer_name.lower()] = rf + + targ_cycles = layer_cfg.get('TargetCycles') + if targ_cycles is not None: + self.layer_name_targ_cycles[layer_name.lower()] = targ_cycles + + strategy = layer_cfg.get('Strategy') + if strategy is not None: + self.layer_name_strategy[layer_name.lower()] = strategy + + conv_implementation = layer_cfg.get('ConvImplementation') + if conv_implementation is not None: + self.layer_name_conv_implementation[layer_name.lower()] = conv_implementation + + compression = layer_cfg.get('Compression') + if compression is not None: + self.layer_name_compression[layer_name.lower()] = bool(compression) + def _parse_hls_config(self): hls_config = self.config['HLSConfig'] @@ -255,32 +290,7 @@ def _parse_hls_config(self): layer_name_cfg = hls_config.get('LayerName') if layer_name_cfg is not None: for layer_name, layer_cfg in layer_name_cfg.items(): - precision_cfg = layer_cfg.get('Precision') - if isinstance(precision_cfg, dict): - for var, precision in precision_cfg.items(): - self.layer_name_precision[layer_name.lower() + '_' + var] = precision - else: - self.layer_name_precision[layer_name.lower() + '_default'] = precision_cfg - - rf = layer_cfg.get('ReuseFactor') - if rf is not None: - self.layer_name_rf[layer_name.lower()] = rf - - targ_cycles = layer_cfg.get('TargetCycles') - if targ_cycles is not None: - self.layer_name_targ_cycles[layer_name.lower()] = targ_cycles - - strategy = layer_cfg.get('Strategy') - if strategy is not None: - self.layer_name_strategy[layer_name.lower()] = strategy - - conv_implementation = layer_cfg.get('ConvImplementation') - if conv_implementation is not None: - self.layer_name_conv_implementation[layer_name.lower()] = conv_implementation - - compression = layer_cfg.get('Compression') - if compression is not None: - self.layer_name_compression[layer_name.lower()] = bool(compression) + self.parse_name_config(layer_name, layer_cfg) def _validate_hls_config(self): use_dataflow = False diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 0d9cc0622c..f076a1e5f0 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -100,6 +100,7 @@ def __init__(self, model, name, attributes, inputs, outputs=None): layer_config = self.model.config.get_layer_config(self) for config_key, config_value in layer_config.items(): + print(f'{config_key=}, {config_value=}') config_key = convert_to_snake_case(config_key) if config_key in self.attributes: print( @@ -179,6 +180,12 @@ def _set_accum_t(self): accum_t = NamedType(*reversed(self.model.config.get_precision(self, 'accum'))) self.set_attr('accum_t', accum_t) + def _set_type_t(self, name): + has_type_t = any(a for a in self.expected_attributes if a.name == name + '_t' and isinstance(a, TypeAttribute)) + if has_type_t: + type_t = 
NamedType(*reversed(self.model.config.get_precision(self, name))) + self.set_attr(name + '_t', type_t) + def get_input_node(self, input_name=None): if input_name is None: if len(self.inputs) > 0: @@ -470,6 +477,11 @@ def initialize(self): self.add_bias(quantizer=self.get_attr('bias_quantizer')) + # set the needed types if needed + self._set_type_t('pointwise_accum') + self._set_type_t('depthwise_accum') + self._set_type_t('depthwise_result') + class DepthwiseConv1D(Conv1D): def initialize(self): @@ -616,6 +628,10 @@ def initialize(self): self.add_bias(quantizer=self.get_attr('bias_quantizer')) + self._set_type_t('pointwise_accum') + self._set_type_t('depthwise_accum') + self._set_type_t('depthwise_result') + class DepthwiseConv2D(Conv2D): def initialize(self): From 0925a3dee501486302a0e415f42e1f9d06992f1e Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 11 Jun 2024 19:27:07 -0500 Subject: [PATCH 070/272] complete implementation of seperable -> dw + pw, untested --- .../vivado/passes/convolution_templates.py | 2 +- hls4ml/converters/keras/convolution.py | 3 + hls4ml/model/graph.py | 38 ++++++ hls4ml/model/layers.py | 56 +++++++- hls4ml/model/optimizer/__init__.py | 1 + .../optimizer/passes/seperable_to_dw_conv.py | 124 ++++++++++++++++++ 6 files changed, 219 insertions(+), 5 deletions(-) create mode 100644 hls4ml/model/optimizer/passes/seperable_to_dw_conv.py diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index 037f2d5eb2..7f3832ba28 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -280,7 +280,7 @@ def format(self, node): # Override bias and bias_t since these are zeros in depthwise step of SepConv1D params['bias'] = params['zero_bias'] params['bias_t'] = params['zero_bias_t'] - params['n_filt'] = params['n_chan'] # In depthwise step n_chan == n_filt + params['n_filt'] = params['n_chan'] * node.get_attr('depth_multiplier') # In depthwise step n_chan == n_filt params['dilation'] = node.get_attr('dilation', 1) params['nzeros'] = node.get_weights('depthwise').nzeros params['index'] = str(node.index) + '_depthwise' diff --git a/hls4ml/converters/keras/convolution.py b/hls4ml/converters/keras/convolution.py index 39780f6dc6..0eaa967844 100644 --- a/hls4ml/converters/keras/convolution.py +++ b/hls4ml/converters/keras/convolution.py @@ -60,6 +60,9 @@ def parse_conv2d_layer(keras_layer, input_names, input_shapes, data_reader): layer['bias_data'] = get_weights_data(data_reader, layer['name'], 'bias') + if 'depth_multiplier' in keras_layer['config']: + layer['depth_multiplier'] = keras_layer['config']['depth_multiplier'] + if 'filters' in keras_layer['config']: layer['n_filt'] = keras_layer['config']['filters'] else: diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index d1722eaae1..10b3a0f854 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -615,6 +615,44 @@ def replace_node(self, old_node, new_node): self.graph = OrderedDict((new_node.name, new_node) if k == old_node.name else (k, v) for k, v in self.graph.items()) self._update_model_outputs() + def split_node(self, old_node, new_node1, new_node2): + """Replace an existing node in the graph with two nodes in sequence. 
+ + Args: + old_node (Layer): The node to replace + new_node1 (Layer): The first new node in sequence + new_node2 (Layer): The second new node in sequence + + """ + + # fmt: off + assert len(new_node1.inputs) == len(old_node.inputs), \ + f'{new_node1.name} and {old_node.name} have different number of inputs' + assert len(new_node2.outputs) == len(old_node.outputs), \ + f'{new_node2.name} and {old_node.name} have different number of outputs' + # fmt: on + + repl = {old_name: new_name for old_name, new_name in zip(old_node.outputs, new_node2.outputs)} + repl.update({old_name: new_name for old_name, new_name in zip(old_node.inputs, new_node1.inputs)}) + + for node in self.graph.values(): + for i, n in enumerate(node.inputs): + if n in repl: + node.inputs[i] = repl[n] + for i, n in enumerate(node.outputs): + if n in repl: + node.outputs[i] = repl[n] + + new_graph = OrderedDict() + for key, value in self.graph.items(): + if key == old_node.name: + new_graph[new_node1.name] = new_node1 + new_graph[new_node2.name] = new_node2 + else: + new_graph[key] = value + self.graph = new_graph + self._update_model_outputs() + def _update_model_outputs(self): '''Update the model outputs diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index f076a1e5f0..9e80da291f 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -447,6 +447,7 @@ class SeparableConv1D(Layer): Attribute('out_width'), Attribute('n_chan'), Attribute('n_filt'), + Attribute('depth_multiplier', default=1), Attribute('filt_width'), Attribute('stride_width'), Attribute('pad_left'), @@ -484,12 +485,27 @@ def initialize(self): class DepthwiseConv1D(Conv1D): + _expected_attributes = [ + Attribute('in_width'), + Attribute('out_width'), + Attribute('n_chan'), + Attribute('depth_multiplier', default=1), + Attribute('filt_width'), + Attribute('stride_width'), + Attribute('pad_left'), + Attribute('pad_right'), + WeightAttribute('depthwise'), + WeightAttribute('bias'), + TypeAttribute('depthwise'), + TypeAttribute('bias'), + ] + def initialize(self): if self.get_attr('data_format') == 'channels_last': - shape = [self.attributes['out_width'], self.attributes['n_chan']] + shape = [self.attributes['out_width'], self.attributes['n_chan'] * self.attributes['depth_multiplier']] dims = [f'OUT_HEIGHT_{self.index}', f'N_CHAN_{self.index}'] else: - shape = [self.attributes['n_chan'], self.attributes['out_width']] + shape = [self.attributes['n_chan'] * self.attributes['depth_multiplier'], self.attributes['out_width']] dims = [f'N_CHAN_{self.index}', f'OUT_WIDTH_{self.index}'] self.add_output_variable(shape, dims) @@ -498,6 +514,7 @@ def initialize(self): ) self.add_bias(quantizer=self.get_attr('bias_quantizer')) + self.set_attr('n_filt', self.get_attr('n_chan') * self.get_attr('depth_multiplier')) class Conv2D(Layer): @@ -594,6 +611,7 @@ class SeparableConv2D(Layer): Attribute('out_width'), Attribute('n_chan'), Attribute('n_filt'), + Attribute('depth_multiplier', default=1), Attribute('filt_height'), Attribute('filt_width'), Attribute('stride_height'), @@ -634,12 +652,41 @@ def initialize(self): class DepthwiseConv2D(Conv2D): + _expected_attributes = [ + Attribute('in_height'), + Attribute('in_width'), + Attribute('out_height'), + Attribute('out_width'), + Attribute('n_chan'), + Attribute('depth_multiplier', default=1), + Attribute('filt_height'), + Attribute('filt_width'), + Attribute('stride_height'), + Attribute('stride_width'), + Attribute('pad_top'), + Attribute('pad_bottom'), + Attribute('pad_left'), + Attribute('pad_right'), + 
WeightAttribute('weight'), + WeightAttribute('bias'), + TypeAttribute('weight'), + TypeAttribute('bias'), + ] + def initialize(self): if self.get_attr('data_format') == 'channels_last': - shape = [self.attributes['out_height'], self.attributes['out_width'], self.attributes['n_chan']] + shape = [ + self.attributes['out_height'], + self.attributes['out_width'], + self.attributes['n_chan'] * self.attributes['depth_multiplier'], + ] dims = [f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] else: - shape = [self.attributes['n_chan'], self.attributes['out_height'], self.attributes['out_width']] + shape = [ + self.attributes['n_chan'] * self.attributes['depth_multiplier'], + self.attributes['out_height'], + self.attributes['out_width'], + ] dims = [f'N_CHAN_{self.index}', f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}'] self.add_output_variable(shape, dims) @@ -648,6 +695,7 @@ def initialize(self): ) self.add_bias(quantizer=self.get_attr('bias_quantizer')) + self.set_attr('n_filt', self.get_attr('n_chan') * self.get_attr('depth_multiplier')) class Pooling1D(Layer): diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 3aa247d03f..de1b7597df 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -33,6 +33,7 @@ register_flow( 'convert', [ + 'seperable_to_depthwise_and_conv', # has to be before precision inference 'infer_precision_types', 'channels_last_converter', 'remove_transpose_before_flatten', diff --git a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py new file mode 100644 index 0000000000..4fdee0010c --- /dev/null +++ b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py @@ -0,0 +1,124 @@ +""" +This optimizer converts a separable convolution to a depthwise followed by a regular convolution. +For backends with custom pointwise implementations, the regular convolution will subsequently +be converted to a pointwise convolution by a different optimizer. 
+""" + +import copy + +from hls4ml.model.layers import SeparableConv1D, SeparableConv2D +from hls4ml.model.optimizer import OptimizerPass + + +class SeperableToDepthwiseAndConv(OptimizerPass): + """Convert Seperable to DepthwiseConv + Conv (potentially later Pointwise)""" + + _dw_attributes = ( + 'in_width', + 'out_width', + 'n_chan', + 'depth_multiplier', + 'pad_left', + 'pad_right', + 'filt_width', + 'stride_width', + 'dilation_width', + 'in_height', + 'out_height', + 'pad_top', + 'pad_bottom', + 'filt_height', + 'stride_height', + 'dilation_height', + 'data_format', + 'depthwise_data', + 'depthwise_quantizer', + ) + + _pw_attributes = ('out_width', 'n_filt', 'dilation_width', 'out_height', 'dilation_height', 'data_format', 'use_bias') + + def match(self, node): + return isinstance(node, (SeparableConv1D, SeparableConv2D)) + + def transform(self, model, node): + dim = node.__class__.__name__[-2:] # '1D' or '2D' + + # get the layer configuration name + layer_config = model.config.get_layer_config(node) + + # First do depthwise + dw_name = f'{node.name}_depthwise' + + # now the layer config (so that set configuration get copied) + dw_layer_config = copy.deepcopy(layer_config) + + if dw_layer_config: + dw_precision_cfg = dw_layer_config.setdefault('Precision', {}) + if 'depthwise' in dw_precision_cfg: + dw_precision_cfg['weight'] = dw_precision_cfg['depthwise'] + del dw_precision_cfg['depthwise'] + if 'depthwise_accum' in dw_precision_cfg: + dw_precision_cfg['accum'] = dw_precision_cfg['depthwise_accum'] + del dw_precision_cfg['depthwise_accum'] + if 'depthwise_result' in dw_precision_cfg: + dw_precision_cfg['result'] = dw_precision_cfg['depthwise_result'] + del dw_precision_cfg['depthwise_result'] + dw_precision_cfg.pop('pointwise', None) + dw_precision_cfg.pop('pointwise_accum', None) + model.config.set_name_config(dw_name, dw_layer_config) + model.config.parse_name_config(dw_name, dw_layer_config) + + # creating the attributes + dw_attributes = {k: node.attributes.get(k, None) for k in SeperableToDepthwiseAndConv._dw_attributes} + + dw_attributes['use_bias'] = False + + new_dw = model.make_node('DepthwiseConv' + dim, dw_name, dw_attributes, [node.inputs[0]]) + + # Then do convolution + pw_name = f'{node.name}_pointwise' + + # now the layer config (so that set configuration get copied) + pw_layer_config = copy.deepcopy(layer_config) + + if pw_layer_config: + pw_precision_cfg = pw_layer_config.setdefault('Precision', {}) + if 'pointwise' in pw_precision_cfg: + pw_precision_cfg['weight'] = pw_precision_cfg['pointwise'] + del pw_precision_cfg['pointwise'] + if 'pointwise_accum' in pw_precision_cfg: + pw_precision_cfg['accum'] = pw_precision_cfg['pointwise_accum'] + del pw_precision_cfg['pointwise_accum'] + if 'pointwise_result' in pw_precision_cfg: + pw_precision_cfg['result'] = pw_precision_cfg['pointwise_result'] + del pw_precision_cfg['pointwise_result'] + pw_precision_cfg.pop('depthwise', None) + pw_precision_cfg.pop('depthwise_accum', None) + model.config.set_name_config(pw_name, pw_layer_config) + model.config.parse_name_config(pw_name, pw_layer_config) + + # creating the attributes + pw_attributes = {k: node.attributes.get(k, None) for k in SeperableToDepthwiseAndConv._pw_attributes} + pw_attributes['filt_width'] = 1 + pw_attributes['filt_height'] = 1 + pw_attributes['stride_width'] = 1 + pw_attributes['stride_height'] = 1 + pw_attributes['pad_left'] = 0 + pw_attributes['pad_right'] = 0 + pw_attributes['pad_top'] = 0 + pw_attributes['pad_bottom'] = 0 + pw_attributes['in_width'] = 
pw_attributes['out_width'] + pw_attributes['in_height'] = pw_attributes['out_height'] + pw_attributes['n_chan'] = node.get_attr('n_chan') * node.get_attr('depth_multiplier') + pw_attributes['weight_data'] = node.get_attr('pointwise_data') + pw_attributes['weight_quantizer'] = node.get_attr('pointwise_quantizer') + pw_attributes['bias_data'] = node.get_attr('bias_data') + pw_attributes['bias_quantizer'] = node.get_attr('bias_quantizer') + + # note this is just regular convolution. It is replaced by a special pointwise implementation + # if available by another optimizer + new_pw = model.make_node('Conv' + dim, pw_name, pw_attributes, [dw_name]) + + model.split_node(node, new_dw, new_pw) + + return True From 86b0c4075a7db97500a128e93f7d10db6e2cf97c Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 12 Jun 2024 18:28:08 -0500 Subject: [PATCH 071/272] make conv_same_pad also trigger on depthwise, varius bug fixes --- hls4ml/backends/vivado/passes/conv_same_pad.py | 6 +++--- hls4ml/model/layers.py | 5 ++--- hls4ml/model/optimizer/passes/seperable_to_dw_conv.py | 1 + test/pytest/test_sepconv2d.py | 5 +++-- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/hls4ml/backends/vivado/passes/conv_same_pad.py b/hls4ml/backends/vivado/passes/conv_same_pad.py index bb8354a3d0..dd282f34e3 100644 --- a/hls4ml/backends/vivado/passes/conv_same_pad.py +++ b/hls4ml/backends/vivado/passes/conv_same_pad.py @@ -1,4 +1,4 @@ -from hls4ml.model.layers import Conv1D, Conv2D, SeparableConv1D, SeparableConv2D +from hls4ml.model.layers import Conv1D, Conv2D, DepthwiseConv1D, DepthwiseConv2D, SeparableConv1D, SeparableConv2D from hls4ml.model.optimizer import OptimizerPass @@ -7,7 +7,7 @@ class InsertZeroPaddingBeforeConv1D(OptimizerPass): def match(self, node): is_match = ( - isinstance(node, (Conv1D, SeparableConv1D)) + isinstance(node, (Conv1D, DepthwiseConv1D, SeparableConv1D)) and ((node.get_attr('padding') == 'same') or (node.get_attr('padding') == 'causal')) and node.get_attr('filt_width') != 1 ) @@ -55,7 +55,7 @@ class InsertZeroPaddingBeforeConv2D(OptimizerPass): def match(self, node): is_match = ( - isinstance(node, (Conv2D, SeparableConv2D)) + isinstance(node, (Conv2D, DepthwiseConv2D, SeparableConv2D)) and node.get_attr('padding') == 'same' and node.get_attr('filt_height') != 1 and node.get_attr('filt_width') != 1 diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 9e80da291f..cb826bb8a1 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -100,7 +100,6 @@ def __init__(self, model, name, attributes, inputs, outputs=None): layer_config = self.model.config.get_layer_config(self) for config_key, config_value in layer_config.items(): - print(f'{config_key=}, {config_value=}') config_key = convert_to_snake_case(config_key) if config_key in self.attributes: print( @@ -494,9 +493,9 @@ class DepthwiseConv1D(Conv1D): Attribute('stride_width'), Attribute('pad_left'), Attribute('pad_right'), - WeightAttribute('depthwise'), + WeightAttribute('weight'), WeightAttribute('bias'), - TypeAttribute('depthwise'), + TypeAttribute('weight'), TypeAttribute('bias'), ] diff --git a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py index 4fdee0010c..0e85131435 100644 --- a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py +++ b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py @@ -33,6 +33,7 @@ class SeperableToDepthwiseAndConv(OptimizerPass): 'data_format', 'depthwise_data', 'depthwise_quantizer', + 'padding', ) 
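The equivalence this decomposition relies on can be checked outside of hls4ml; a minimal numpy sketch (toy shapes and helper chosen here for illustration: channels-last, stride 1, 'valid' padding, depth_multiplier of 1):

    import numpy as np

    def depthwise(x, k):
        # filter each input channel independently (no cross-channel mixing)
        kh, kw, c = k.shape
        out = np.zeros((x.shape[0] - kh + 1, x.shape[1] - kw + 1, c))
        for i in range(out.shape[0]):
            for j in range(out.shape[1]):
                out[i, j] = np.sum(x[i:i + kh, j:j + kw] * k, axis=(0, 1))
        return out

    x = np.random.rand(8, 8, 3)   # in_height, in_width, n_chan
    dw = np.random.rand(3, 3, 3)  # filt_height, filt_width, n_chan
    pw = np.random.rand(3, 4)     # n_chan, n_filt

    # a separable conv is the depthwise stage followed by a 1x1 (pointwise) mix,
    # which is why splitting the layer in two preserves the model's output
    y = depthwise(x, dw) @ pw     # shape (6, 6, 4)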
_pw_attributes = ('out_width', 'n_filt', 'dilation_width', 'out_height', 'dilation_height', 'data_format', 'use_bias') diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index 58e63fec8a..4732c7c7f1 100644 --- a/test/pytest/test_sepconv2d.py +++ b/test/pytest/test_sepconv2d.py @@ -10,7 +10,6 @@ padds_options = ['same', 'valid'] chans_options = ['channels_last'] -io_type_options = ['io_parallel', 'io_stream'] strides_options = [(1, 1), (2, 2)] kernel_options = [(2, 2), (3, 3)] bias_options = [False] @@ -50,7 +49,9 @@ def test_sepconv2d(chans, padds, strides, kernels, bias, io_type, backend): model.compile(optimizer='adam', loss='mse') X_input = np.random.rand(100, *input_shape) keras_prediction = model.predict(X_input) - config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32,16>') + config = hls4ml.utils.config_from_keras_model( + model, default_precision='ap_fixed<32,16>', granularity="name", backend=backend + ) stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') kernel_cfg = str(kernels).replace(', ', '_').replace('(', '').replace(')', '') output_dir = str( From 9dbcbdeeb478fbcfa2801240bee7b1bce21b33a8 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 12 Jun 2024 20:58:51 -0500 Subject: [PATCH 072/272] add parsing of depth multiplier for 1D depthwise conv --- hls4ml/converters/keras/convolution.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hls4ml/converters/keras/convolution.py b/hls4ml/converters/keras/convolution.py index 0eaa967844..2b24613094 100644 --- a/hls4ml/converters/keras/convolution.py +++ b/hls4ml/converters/keras/convolution.py @@ -21,6 +21,9 @@ def parse_conv1d_layer(keras_layer, input_names, input_shapes, data_reader): layer['bias_data'] = get_weights_data(data_reader, layer['name'], 'bias') + if 'depth_multiplier' in keras_layer['config']: + layer['depth_multiplier'] = keras_layer['config']['depth_multiplier'] + if 'filters' in keras_layer['config']: layer['n_filt'] = keras_layer['config']['filters'] else: From 3a559838e366e1e9ede6c846307434f2cf90d46d Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 13 Jun 2024 14:41:33 -0500 Subject: [PATCH 073/272] handle case where layer precision is a string --- .../optimizer/passes/seperable_to_dw_conv.py | 46 ++++++++++--------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py index 0e85131435..7eb5fd57ce 100644 --- a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py +++ b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py @@ -55,17 +55,18 @@ def transform(self, model, node): if dw_layer_config: dw_precision_cfg = dw_layer_config.setdefault('Precision', {}) - if 'depthwise' in dw_precision_cfg: - dw_precision_cfg['weight'] = dw_precision_cfg['depthwise'] - del dw_precision_cfg['depthwise'] - if 'depthwise_accum' in dw_precision_cfg: - dw_precision_cfg['accum'] = dw_precision_cfg['depthwise_accum'] - del dw_precision_cfg['depthwise_accum'] - if 'depthwise_result' in dw_precision_cfg: - dw_precision_cfg['result'] = dw_precision_cfg['depthwise_result'] - del dw_precision_cfg['depthwise_result'] - dw_precision_cfg.pop('pointwise', None) - dw_precision_cfg.pop('pointwise_accum', None) + if isinstance(dw_precision_cfg, dict): + if 'depthwise' in dw_precision_cfg: + dw_precision_cfg['weight'] = dw_precision_cfg['depthwise'] + del dw_precision_cfg['depthwise'] + if 'depthwise_accum' in dw_precision_cfg: + 
dw_precision_cfg['accum'] = dw_precision_cfg['depthwise_accum'] + del dw_precision_cfg['depthwise_accum'] + if 'depthwise_result' in dw_precision_cfg: + dw_precision_cfg['result'] = dw_precision_cfg['depthwise_result'] + del dw_precision_cfg['depthwise_result'] + dw_precision_cfg.pop('pointwise', None) + dw_precision_cfg.pop('pointwise_accum', None) model.config.set_name_config(dw_name, dw_layer_config) model.config.parse_name_config(dw_name, dw_layer_config) @@ -84,17 +85,18 @@ def transform(self, model, node): if pw_layer_config: pw_precision_cfg = pw_layer_config.setdefault('Precision', {}) - if 'pointwise' in pw_precision_cfg: - pw_precision_cfg['weight'] = pw_precision_cfg['pointwise'] - del pw_precision_cfg['pointwise'] - if 'pointwise_accum' in pw_precision_cfg: - pw_precision_cfg['accum'] = pw_precision_cfg['pointwise_accum'] - del pw_precision_cfg['pointwise_accum'] - if 'pointwise_result' in pw_precision_cfg: - pw_precision_cfg['result'] = pw_precision_cfg['pointwise_result'] - del pw_precision_cfg['pointwise_result'] - pw_precision_cfg.pop('depthwise', None) - pw_precision_cfg.pop('depthwise_accum', None) + if isinstance(pw_precision_cfg, dict): + if 'pointwise' in pw_precision_cfg: + pw_precision_cfg['weight'] = pw_precision_cfg['pointwise'] + del pw_precision_cfg['pointwise'] + if 'pointwise_accum' in pw_precision_cfg: + pw_precision_cfg['accum'] = pw_precision_cfg['pointwise_accum'] + del pw_precision_cfg['pointwise_accum'] + if 'pointwise_result' in pw_precision_cfg: + pw_precision_cfg['result'] = pw_precision_cfg['pointwise_result'] + del pw_precision_cfg['pointwise_result'] + pw_precision_cfg.pop('depthwise', None) + pw_precision_cfg.pop('depthwise_accum', None) model.config.set_name_config(pw_name, pw_layer_config) model.config.parse_name_config(pw_name, pw_layer_config) From c7cb71fdad11cf5d9f990d3ecb3aec4b7c01e04f Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 13 Jun 2024 15:21:12 -0500 Subject: [PATCH 074/272] fix up automatic precision inferrence --- hls4ml/model/optimizer/passes/infer_precision.py | 9 ++++++++- hls4ml/model/optimizer/passes/seperable_to_dw_conv.py | 6 +++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index 51422c534e..256e8a8152 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -49,7 +49,10 @@ def _infer_precision(self, node, types_to_infer): if node_class in ['Conv1D', 'Conv2D', 'PointwiseConv1D', 'PointwiseConv2D', 'Conv2DBatchnorm']: return self._infer_conv_precision(node, types_to_infer) - if node_class in ['SeparableConv1D', 'SeparableConv2D', 'DepthwiseConv2D']: + if node_class in ['DepthwiseConv1D', 'DepthwiseConv2D']: + return self._infer_depthconv_precision(node, types_to_infer) + + if node_class in ['SeparableConv1D', 'SeparableConv2D']: return self._infer_sepconv_precision(node, types_to_infer) if node_class in ['Pooling1D', 'Pooling2D']: @@ -166,6 +169,10 @@ def _infer_conv_precision(self, node, types_to_infer): n_ops = node.get_attr('n_chan') * node.get_attr('filt_height', 1) * node.get_attr('filt_width') return self._infer_common_precision(node, types_to_infer, n_ops) + def _infer_depthconv_precision(self, node, types_to_infer): + n_ops = node.get_attr('filt_height', 1) * node.get_attr('filt_width') + return self._infer_common_precision(node, types_to_infer, n_ops) + def _infer_sepconv_precision(self, node, types_to_infer): inferred_types 
= [] diff --git a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py index 7eb5fd57ce..0142f686d0 100644 --- a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py +++ b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py @@ -71,7 +71,7 @@ def transform(self, model, node): model.config.parse_name_config(dw_name, dw_layer_config) # creating the attributes - dw_attributes = {k: node.attributes.get(k, None) for k in SeperableToDepthwiseAndConv._dw_attributes} + dw_attributes = {k: node.attributes[k] for k in SeperableToDepthwiseAndConv._dw_attributes if k in node.attributes} dw_attributes['use_bias'] = False @@ -101,7 +101,7 @@ def transform(self, model, node): model.config.parse_name_config(pw_name, pw_layer_config) # creating the attributes - pw_attributes = {k: node.attributes.get(k, None) for k in SeperableToDepthwiseAndConv._pw_attributes} + pw_attributes = {k: node.attributes[k] for k in SeperableToDepthwiseAndConv._pw_attributes if k in node.attributes} pw_attributes['filt_width'] = 1 pw_attributes['filt_height'] = 1 pw_attributes['stride_width'] = 1 @@ -111,7 +111,7 @@ def transform(self, model, node): pw_attributes['pad_top'] = 0 pw_attributes['pad_bottom'] = 0 pw_attributes['in_width'] = pw_attributes['out_width'] - pw_attributes['in_height'] = pw_attributes['out_height'] + pw_attributes['in_height'] = pw_attributes.get('out_height', 1) pw_attributes['n_chan'] = node.get_attr('n_chan') * node.get_attr('depth_multiplier') pw_attributes['weight_data'] = node.get_attr('pointwise_data') pw_attributes['weight_quantizer'] = node.get_attr('pointwise_quantizer') From c5841a2d1754bc1b179b9a70f8bdd7463fd14f1b Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 25 Jun 2024 18:03:12 -0500 Subject: [PATCH 075/272] seperate out parse_qonnx flow --- hls4ml/model/optimizer/__init__.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 712dc3822c..eb53ed7925 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -31,7 +31,7 @@ del optimizers register_flow( - 'convert', + 'parse_qonnx', [ 'reshape_constant', 'quant_constant_parameters', @@ -51,9 +51,16 @@ 'merge_to_apply_alpha_div', 'matmul_const_to_dense', 'conv_to_conv_x_d', - 'fuse_consecutive_batch_normalization', # needs to be before infer_precision_types - 'merge_linear_activation', # needs to be before infer_precision_types - 'fuse_batch_normalization', # needs to be before infer_precision_types + ], +) + +register_flow( + 'convert', + [ + 'fuse_consecutive_batch_normalization', + 'merge_linear_activation', + 'fuse_batch_normalization', + # The ones above here need to be before infer_precision_types 'infer_precision_types', 'channels_last_converter', 'remove_transpose_before_flatten', @@ -65,6 +72,7 @@ 'qkeras_factorize_alpha', 'extract_ternary_threshold', ], + requires=['parse_qonnx'], ) # TODO Maybe not all QKeras optmizers belong here? 
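The effect of the new requires link can be sketched in isolation (a toy model of the dependency resolution, not hls4ml's actual flow engine, with abbreviated pass lists):

    flows = {
        'parse_qonnx': {'passes': ['reshape_constant', 'matmul_const_to_dense'], 'requires': []},
        'convert': {'passes': ['fuse_batch_normalization', 'infer_precision_types'], 'requires': ['parse_qonnx']},
    }

    def apply_flow(name, applied=None):
        # required flows run first, so 'convert' still sees parsed QONNX graphs
        applied = [] if applied is None else applied
        for req in flows[name]['requires']:
            apply_flow(req, applied)
        applied.extend(flows[name]['passes'])
        return applied

    print(apply_flow('convert'))
    # ['reshape_constant', 'matmul_const_to_dense', 'fuse_batch_normalization', 'infer_precision_types']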
register_flow( From de790ca3f889d777a19da3f802a2708e8cc53788 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 26 Jun 2024 14:01:46 -0500 Subject: [PATCH 076/272] Again allow for None in target shape--for pytorch --- hls4ml/model/layers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 0db82ff411..d40d0f04b1 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -402,9 +402,9 @@ def initialize(self): else: raise RuntimeError("Reshape for ONNX requires the target shape to be a second input.") - # nones should not exist here + # remove Nones -- seems to be used by the pytorch parser if target_shape[0] is None: - raise RuntimeError(f"Unexpectedly have a None in {target_shape=}") + target_shape = target_shape[1:] # take care of -1 shapes shape = self._infer_output_shape(input_shape, target_shape) From dad40aac080f22ac26da9ea3bf86b2228bdda4cb Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 2 Jul 2024 18:13:57 -0500 Subject: [PATCH 077/272] update interface for depth multiplier, though HLS doesn't yet implement it --- hls4ml/backends/fpga/passes/codegen.py | 1 + hls4ml/backends/vivado/vivado_backend.py | 26 +++++++++++++++++++ hls4ml/converters/keras/convolution.py | 4 +-- hls4ml/model/layers.py | 12 ++++----- .../optimizer/passes/seperable_to_dw_conv.py | 2 +- .../nnet_utils/nnet_sepconv1d_latency.h | 2 +- .../nnet_utils/nnet_sepconv2d_latency.h | 2 +- 7 files changed, 38 insertions(+), 11 deletions(-) diff --git a/hls4ml/backends/fpga/passes/codegen.py b/hls4ml/backends/fpga/passes/codegen.py index c951a02b80..ccbac885c4 100644 --- a/hls4ml/backends/fpga/passes/codegen.py +++ b/hls4ml/backends/fpga/passes/codegen.py @@ -6,6 +6,7 @@ class GenerateConvIm2col(OptimizerPass): '''Generates code for the im2col step of 1D/2D convolution''' + # Note: DepthwiseConv1D/2D also match because they inherit from Conv1D/2D def match(self, node): return ( isinstance(node, (Conv1D, Conv2D, SeparableConv1D, SeparableConv2D)) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 4a9568305e..96da6cea75 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -14,6 +14,7 @@ Conv1D, Conv2D, Dense, + DepthwiseConv1D, DepthwiseConv2D, Embedding, GarNet, @@ -314,6 +315,31 @@ def init_sepconv1d(self, layer): dw_output_t = NamedType(dw_out_name, dw_out_precision) layer.set_attr('dw_output_t', dw_output_t) + @layer_optimizer(DepthwiseConv1D) + def init_depconv1d(self, layer): + if layer.model.config.is_resource_strategy(layer): + layer.set_attr('strategy', 'resource') + n_in, n_out = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + else: + layer.set_attr('strategy', 'latency') + + out_width = layer.get_output_variable().shape[0] + chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1) + valid_pf = self.get_valid_conv_partition_splits(1, out_width) + if chosen_pf not in valid_pf: + closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf) + valid_pf_str = ','.join(map(str, valid_pf)) + print( + f'WARNING: Invalid ParallelizationFactor={chosen_pf} in layer "{layer.name}". ' + f'Using ParallelizationFactor={closest_pf} instead. Valid ParallelizationFactor(s): {valid_pf_str}.' 
+ ) + else: + closest_pf = chosen_pf + layer.set_attr('n_partitions', out_width // closest_pf) + + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + @layer_optimizer(Conv2D) def init_conv2d(self, layer): if len(layer.weights['weight'].data.shape) == 2: # This can happen if we assign weights of a Dense layer to a 1x1 Conv2D diff --git a/hls4ml/converters/keras/convolution.py b/hls4ml/converters/keras/convolution.py index 2b24613094..d223d55dfb 100644 --- a/hls4ml/converters/keras/convolution.py +++ b/hls4ml/converters/keras/convolution.py @@ -27,7 +27,7 @@ def parse_conv1d_layer(keras_layer, input_names, input_shapes, data_reader): if 'filters' in keras_layer['config']: layer['n_filt'] = keras_layer['config']['filters'] else: - layer['n_filt'] = layer['n_chan'] + layer['n_filt'] = layer['n_chan'] * layer.get('depth_multiplier', 1) layer['filt_width'] = keras_layer['config']['kernel_size'][0] layer['stride_width'] = keras_layer['config']['strides'][0] layer['padding'] = keras_layer['config']['padding'] @@ -69,7 +69,7 @@ def parse_conv2d_layer(keras_layer, input_names, input_shapes, data_reader): if 'filters' in keras_layer['config']: layer['n_filt'] = keras_layer['config']['filters'] else: - layer['n_filt'] = layer['n_chan'] + layer['n_filt'] = layer['n_chan'] * layer.get('depth_multiplier', 1) layer['filt_height'] = keras_layer['config']['kernel_size'][0] layer['filt_width'] = keras_layer['config']['kernel_size'][1] layer['stride_height'] = keras_layer['config']['strides'][0] diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index cb826bb8a1..86a11459b2 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -489,6 +489,7 @@ class DepthwiseConv1D(Conv1D): Attribute('out_width'), Attribute('n_chan'), Attribute('depth_multiplier', default=1), + Attribute('n_filt'), # = n_chan * depth_multiplier Attribute('filt_width'), Attribute('stride_width'), Attribute('pad_left'), @@ -501,10 +502,10 @@ class DepthwiseConv1D(Conv1D): def initialize(self): if self.get_attr('data_format') == 'channels_last': - shape = [self.attributes['out_width'], self.attributes['n_chan'] * self.attributes['depth_multiplier']] + shape = [self.attributes['out_width'], self.attributes['n_filt']] dims = [f'OUT_HEIGHT_{self.index}', f'N_CHAN_{self.index}'] else: - shape = [self.attributes['n_chan'] * self.attributes['depth_multiplier'], self.attributes['out_width']] + shape = [self.attributes['n_filt'], self.attributes['out_width']] dims = [f'N_CHAN_{self.index}', f'OUT_WIDTH_{self.index}'] self.add_output_variable(shape, dims) @@ -513,7 +514,6 @@ def initialize(self): ) self.add_bias(quantizer=self.get_attr('bias_quantizer')) - self.set_attr('n_filt', self.get_attr('n_chan') * self.get_attr('depth_multiplier')) class Conv2D(Layer): @@ -658,6 +658,7 @@ class DepthwiseConv2D(Conv2D): Attribute('out_width'), Attribute('n_chan'), Attribute('depth_multiplier', default=1), + Attribute('n_filt'), # = n_chan * depth_multiplier Attribute('filt_height'), Attribute('filt_width'), Attribute('stride_height'), @@ -677,12 +678,12 @@ def initialize(self): shape = [ self.attributes['out_height'], self.attributes['out_width'], - self.attributes['n_chan'] * self.attributes['depth_multiplier'], + self.attributes['n_filt'], ] dims = [f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] else: shape = [ - self.attributes['n_chan'] * self.attributes['depth_multiplier'], + self.attributes['n_filt'], self.attributes['out_height'], self.attributes['out_width'], ] @@
-694,7 +695,6 @@ def initialize(self): ) self.add_bias(quantizer=self.get_attr('bias_quantizer')) - self.set_attr('n_filt', self.get_attr('n_chan') * self.get_attr('depth_multiplier')) class Pooling1D(Layer): diff --git a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py index 0142f686d0..7d3b71dc96 100644 --- a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py +++ b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py @@ -72,7 +72,7 @@ def transform(self, model, node): # creating the attributes dw_attributes = {k: node.attributes[k] for k in SeperableToDepthwiseAndConv._dw_attributes if k in node.attributes} - + dw_attributes['n_filt'] = dw_attributes['n_chan'] * dw_attributes['depth_multiplier'] dw_attributes['use_bias'] = False new_dw = model.make_node('DepthwiseConv' + dim, dw_name, dw_attributes, [node.inputs[0]]) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h index c9fe86ea93..2f7e57a502 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h @@ -10,7 +10,7 @@ namespace nnet { template void depthwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_filt], typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h index 161cc2c834..00729ac4c2 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h @@ -11,7 +11,7 @@ template void depthwise_conv_2d_latency_cl( data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_filt], typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; From 0ea246ce2b51de1c82ecf93cb8d5bc3a37f51a6c Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 15 Jul 2024 18:20:13 +0200 Subject: [PATCH 078/272] Refactor matrix-multiplication kernel as a function pointer --- hls4ml/backends/fpga/passes/codegen.py | 167 ++++++++++-------- .../vivado/passes/convolution_templates.py | 56 +++--- .../backends/vivado/passes/core_templates.py | 23 +-- .../vivado/passes/recurrent_templates.py | 52 ++++-- .../vivado/passes/resource_strategy.py | 2 +- hls4ml/backends/vivado/vivado_backend.py | 34 +++- .../vivado/nnet_utils/nnet_code_gen.h | 29 +-- .../templates/vivado/nnet_utils/nnet_common.h | 2 +- .../vivado/nnet_utils/nnet_conv1d_stream.h | 4 + .../vivado/nnet_utils/nnet_conv2d_stream.h | 3 +- .../vivado/nnet_utils/nnet_conv_stream.h | 37 +--- .../templates/vivado/nnet_utils/nnet_dense.h | 51 ++++-- .../vivado/nnet_utils/nnet_dense_stream.h | 7 +- .../vivado/nnet_utils/nnet_function_stubs.h | 42 +++++ test/pytest/test_dense_unrolled.py | 92 ++++++++-- 15 files 
changed, 362 insertions(+), 239 deletions(-) create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h diff --git a/hls4ml/backends/fpga/passes/codegen.py b/hls4ml/backends/fpga/passes/codegen.py index 09e600d421..3667680ed5 100644 --- a/hls4ml/backends/fpga/passes/codegen.py +++ b/hls4ml/backends/fpga/passes/codegen.py @@ -2,7 +2,7 @@ import numpy as np -from hls4ml.model.layers import Conv1D, Conv2D, Dense +from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense from hls4ml.model.optimizer import OptimizerPass from hls4ml.model.types import Source @@ -60,8 +60,8 @@ class GenerateUnrolledDenseResource(OptimizerPass): def match(self, node): # Only apply to layers that use Dense matrix multiplication - # TODO - Extend (& test) for Conv1D / Separable Conv / Depthwise Conv / Recurrent layers - layers_with_dense = (Dense, Conv2D) + # TODO - Extend (& test) for Separable Conv / Depthwise Conv / Recurrent layers + layers_with_dense = (Dense, Conv1D, Conv2D, LSTM, GRU) # Unrolled Dense mimics the hardware implementation of Resource strategy -> apply after Resource optimizer weights_transposed = node.get_attr('_weights_transposed', False) @@ -70,23 +70,43 @@ def match(self, node): rf_gt_one = node.get_attr('reuse_factor', 1) > 1 # User requested unrolled implementation of Dense - is_unrolled = node.get_attr('dense_resource_implementation', 'standard') == 'unrolled' + is_unrolled = node.get_attr('strategy', 'latency') == 'unrolled' return isinstance(node, layers_with_dense) and weights_transposed and rf_gt_one and is_unrolled def transform(self, model, node): - code_str = self.__generate_unrolled_dense_resource(model, node) - node.set_attr('unrolled_dense_resource_codegen', Source(code_str)) + if isinstance(node, (LSTM, GRU)): + n_in, n_out, n_in_recr, n_out_recr = node.model.config.backend.get_layer_mult_size(node) - def __generate_unrolled_dense_resource(self, model, node): + reuse_factor = node.get_attr('reuse_factor') + weights = node.weights['weight'] + code_str = self._generate_unrolled_function(n_in, n_out, reuse_factor, weights, str(node.index) + '_1') + node.set_attr('unrolled_dense_resource_codegen_1', Source(code_str)) + + recr_reuse_factor = node.get_attr('recurrent_reuse_factor') + recr_weights = node.weights['recurrent_weight'] + code_str = self._generate_unrolled_function( + n_in_recr, n_out_recr, recr_reuse_factor, recr_weights, str(node.index) + '_2' + ) + node.set_attr('unrolled_dense_resource_codegen_2', Source(code_str)) + + else: + n_in, n_out = node.model.config.backend.get_layer_mult_size(node) + reuse_factor = node.get_attr('reuse_factor') + weights = node.weights['weight'] + + code_str = self._generate_unrolled_function(n_in, n_out, reuse_factor, weights, node.index) + node.set_attr('unrolled_dense_resource_codegen', Source(code_str)) + + def _generate_unrolled_function(self, n_in, n_out, reuse_factor, weights, function_suffix): """ Generate a C++ function that mimics the Dense Resource implementation. The HLS compiler produces suboptimal designs for Dense Resource when the weights processed by the same DSP are zero. - Latency strategy can optimize zero mutiplications + Latency strategy can optimize zero multiplications Resource strategy, on the other hand, cannot.
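For illustration (assumed numbers): with n_in * n_out = 8 and reuse_factor = 4, the weights are viewed as a (block_factor = 2, reuse_factor = 4) array; each row feeds one multiplier, so a row that is entirely zero is a multiplier, and the BRAM block behind it, that the generated code can drop.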
When all the weights in the same BRAM block are zero, Vivado is unable to optimize it - With this (and additional TCL scripts) zero BRAM are optimised + With this (and additional TCL scripts) zero BRAM are optimized Args: node: Layer to generate code for @@ -96,61 +116,58 @@ def __generate_unrolled_dense_resource(self, model, node): # Variable instantiation and function pragmas generated_code = ( - "template\n" - "class dense_unrolled_{index} : public DenseResourceUnrolled {{\n" - " public:\n" - " static void dense_unrolled(\n" - " data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],\n" - " typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],\n" - " typename CONFIG_T::bias_t biases[CONFIG_T::n_out]\n" - " ) {{\n" - " #pragma HLS pipeline II=CONFIG_T::reuse_factor\n" - "\n" - " constexpr int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);\n" - " #pragma HLS function_instantiate variable=weights,biases\n" - " #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor\n" - " #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM\n" - " #pragma HLS ARRAY_PARTITION variable=biases complete\n" - "\n" - " typename CONFIG_T::accum_t acc[CONFIG_T::n_out];\n" - " #pragma HLS ARRAY_PARTITION variable=acc complete\n" - "\n" - " InitAccum:\n" - " for (int i = 0; i < CONFIG_T::n_out; i++) {{\n" - " #pragma HLS UNROLL\n" - " acc[i] = (typename CONFIG_T::accum_t) biases[i];\n" - " }}\n" - "\n" - ).format(index=node.index) + 'template\n' + 'class dense_unrolled_{suffix} : public DenseKernel {{\n' + ' public:\n' + ' static void dense(\n' + ' data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],\n' + ' typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],\n' + ' typename CONFIG_T::bias_t biases[CONFIG_T::n_out]\n' + ' ) {{\n' + ' #pragma HLS pipeline II=CONFIG_T::reuse_factor\n' + '\n' + ' constexpr int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);\n' + ' #pragma HLS function_instantiate variable=weights,biases\n' + ' #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor\n' + ' #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM\n' + ' #pragma HLS ARRAY_PARTITION variable=biases complete\n' + '\n' + ' typename CONFIG_T::accum_t acc[CONFIG_T::n_out];\n' + ' #pragma HLS ARRAY_PARTITION variable=acc complete\n' + '\n' + ' InitAccum:\n' + ' for (int i = 0; i < CONFIG_T::n_out; i++) {{\n' + ' #pragma HLS UNROLL\n' + ' acc[i] = (typename CONFIG_T::accum_t) biases[i];\n' + ' }}\n' + '\n' + ).format(suffix=function_suffix) # Unrolled multiplication, according to the three cases - n_in, n_out = node.model.config.backend.get_layer_mult_size(node) - reuse_factor = node.get_attr('reuse_factor') - weights = node.weights['weight'] if reuse_factor <= n_in: - mult_code = self.__generate_unrolled_mult_code_rf_leq_nin(n_in, n_out, reuse_factor, weights) + mult_code = self._generate_unrolled_mult_code_rf_leq_nin(n_in, n_out, reuse_factor, weights) elif reuse_factor > n_in and reuse_factor % n_in == 0: - mult_code = self.__generate_unrolled_mult_code_rf_gt_nin_rem0(n_in, n_out, reuse_factor, weights) + mult_code = self._generate_unrolled_mult_code_rf_gt_nin_rem0(n_in, n_out, reuse_factor, weights) else: # This case shouldn't happen if my understanding of RF is correct # The function fpga_backend._validate_reuse_factor() has assertion rf % n_in == 0 or rf < n_in raise Exception('Not implemented...') # Write output - generated_code += mult_code + "\n" + generated_code += mult_code + 
'\n' generated_code += ( - " Result:\n" - " for (int i = 0; i < CONFIG_T::n_out; i++) {\n" - " #pragma HLS UNROLL\n" - " res[i] = cast(acc[i]);\n" - " }\n" - " }\n" - "};\n" + ' Result:\n' + ' for (int i = 0; i < CONFIG_T::n_out; i++) {\n' + ' #pragma HLS UNROLL\n' + ' res[i] = cast(acc[i]);\n' + ' }\n' + ' }\n' + '};\n' ) return generated_code - def __generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, weights): + def _generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, weights): # Function constants mult_factor = min(n_in, reuse_factor) block_factor = int(math.ceil(n_in * n_out / reuse_factor)) @@ -162,10 +179,13 @@ def __generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, we # The new shape is (parallel_mult, reuse_factor) zeros = np.sum(~weights.data.reshape(block_factor, reuse_factor).any(1)) + # Used to pad the code to make it human-readable + indent = ' ' + # Generate unrolled multiplications - mult_code = f"\t\t#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n" - mult_code += "\t\tMULT: {\n" - mult_code += "\t\t\t#pragma HLS protocol\n" + mult_code = f'{indent*2}#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n' + mult_code += f'{indent*2}MULT: {{\n' + mult_code += f'{indent*3}#pragma HLS protocol\n' for ir in range(reuse_factor): acc_step = 0 @@ -173,13 +193,15 @@ def __generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, we w_index = ir in_index = ir - mult_code += f"\t\t\tM{ir}: {{\n" + mult_code += f'{indent*3}M{ir}: {{\n' for _ in range(block_factor): if weights.data.flatten()[w_index] != 0: - mult_code += f"\t\t\t\tacc[{out_index}] += \ - static_cast\ - (CONFIG_T::template product::\ - product(data[{in_index}], weights[{w_index}]));\n" + mult_code += ( + f'{indent*4}acc[{out_index}] += ' + 'static_cast' + '(CONFIG_T::template product::' + f'product(data[{in_index}], weights[{w_index}]));\n' + ) w_index += reuse_factor in_index += reuse_factor @@ -191,13 +213,13 @@ def __generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, we else: acc_step += 1 - mult_code += "\t\t\t}\n" + mult_code += f'{indent*3}}}\n' - mult_code += "\t\t}\n" + mult_code += f'{indent*2}}}\n' return mult_code - def __generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor, weights): + def _generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor, weights): # Function constants mult_factor = min(n_in, reuse_factor) block_factor = int(math.ceil(n_in * n_out / reuse_factor)) @@ -208,6 +230,9 @@ def __generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor # The new shape is (parallel_mult, reuse_factor) zeros = np.sum(~weights.data.reshape(block_factor, reuse_factor).any(1)) + # Used to pad the code to make it human-readable + indent = ' ' + # Generate out indices outidx = [0] * reuse_factor outstep = 0 @@ -221,32 +246,34 @@ def __generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor in_index = 0 # Generate unrolled multiplications - mult_code = f"\t\t#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n" - mult_code += "\t\tMULT: {\n" - mult_code += "\t\t\t#pragma HLS protocol\n" + mult_code = f'{indent*2}#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n' + mult_code += f'{indent*2}MULT: {{\n' + mult_code += f'{indent*3}#pragma HLS protocol\n' for ir in range(reuse_factor): w_index = ir out_index = outidx[ir] - mult_code += f"\t\t\tM{ir}: 
{{\n" + mult_code += f'{indent*3}M{ir}: {{\n' for _ in range(block_factor): if weights.data.flatten()[w_index] != 0: - mult_code += f"\t\t\t\tacc[{int(out_index)}] += \ - static_cast\ - (CONFIG_T::template product::\ - product(data[{in_index}], weights[{w_index}]));\n" + mult_code += ( + f'{indent*4}acc[{int(out_index)}] += ' + 'static_cast' + '(CONFIG_T::template product::' + f'product(data[{in_index}], weights[{w_index}]));\n' + ) w_index += reuse_factor if w_index > n_in * n_out: break out_index += outscale - mult_code += "\t\t\t}\n" + mult_code += f'{indent*3}}}\n' in_index += 1 if in_index >= n_in: in_index = 0 - mult_code += "\t\t}\n" + mult_code += f'{indent*2}}}\n' return mult_code diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index 95a9c10cb0..9b584237a6 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -17,14 +17,13 @@ static const unsigned n_out = {n_out}; static const unsigned reuse_factor = {reuse}; static const unsigned strategy = nnet::{strategy}; - static const unsigned resource_implementation = nnet::{dense_resource_implementation}; - template - using dense_unrolled = nnet::{unrolled_function}; static const unsigned n_zeros = {nzeros}; static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; typedef {accum_t.name} accum_t; typedef {bias_t.name} bias_t; typedef {weight_t.name} weight_t; + template + using kernel = nnet::{dense_function}; template using product = nnet::product::{product_type}; }};\n""" @@ -49,9 +48,6 @@ static const bool store_weights_in_bram = false; static const unsigned strategy = nnet::{strategy}; static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; - static const unsigned resource_implementation = nnet::{dense_resource_implementation}; - template - using dense_unrolled = nnet::{unrolled_function}; static const unsigned min_width = {min_width}; static const ap_uint pixels[min_width]; static const unsigned n_partitions = {n_partitions}; @@ -96,8 +92,6 @@ def format(self, node): params['fill_fn'] = f'fill_buffer_{node.index}' else: params['fill_fn'] = 'FillConv1DBuffer' - # TODO - Extend unrolled Dense Resource to Conv1D - params['unrolled_function'] = 'DenseResourceUnrolled' conv_config = self.template.format(**params) @@ -108,8 +102,18 @@ def format(self, node): mult_params['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('weight').type.precision ) - # TODO - Extend unrolled Dense Resource to Conv1D - mult_params['unrolled_function'] = 'DenseResourceUnrolled' + + if node.get_attr('strategy').lower() == 'latency': + mult_params['dense_function'] = 'DenseLatency' + elif node.get_attr('strategy').lower() == 'resource': + if int(mult_params['reuse_factor']) <= int(mult_params['n_in']): + mult_params['dense_function'] = 'DenseResource_rf_leq_nin' + else: + mult_params['dense_function'] = 'DenseResource_rf_gt_nin_rem0' + # The 3rd case is never used + elif node.get_attr('strategy').lower() == 'unrolled': + mult_params['dense_function'] = f'dense_unrolled_{node.index}' + mult_config = self.mult_template.format(**mult_params) return mult_config + '\n' + conv_config @@ -160,9 +164,6 @@ def __init__(self): static const bool store_weights_in_bram = false; static const unsigned strategy = nnet::{strategy}; static const nnet::conv_implementation 
implementation = nnet::conv_implementation::{implementation}; - static const unsigned resource_implementation = nnet::{dense_resource_implementation}; - template - using dense_unrolled = nnet::{unrolled_function}; static const unsigned min_height = {min_height}; static const unsigned min_width = {min_width}; static const ap_uint pixels[min_height * min_width]; @@ -217,15 +218,6 @@ def format(self, node): else: params['fill_fn'] = 'FillConv2DBuffer' - if ( - node.get_attr('dense_resource_implementation', 'standard') == 'unrolled' - and node.get_attr('strategy').lower() == 'resource' - and node.get_attr('reuse_factor') > 1 - ): - params['unrolled_function'] = f'dense_unrolled_{node.index}' - else: - params['unrolled_function'] = 'DenseResourceUnrolled' - conv_config = self.template.format(**params) mult_params = self._default_config_params(node) @@ -235,14 +227,18 @@ def format(self, node): mult_params['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('weight').type.precision ) - if ( - node.get_attr('dense_resource_implementation', 'standard') == 'unrolled' - and node.get_attr('strategy').lower() == 'resource' - and node.get_attr('reuse_factor') > 1 - ): - mult_params['unrolled_function'] = f'dense_unrolled_{node.index}' - else: - mult_params['unrolled_function'] = 'DenseResourceUnrolled' + + if node.get_attr('strategy').lower() == 'latency': + mult_params['dense_function'] = 'DenseLatency' + elif node.get_attr('strategy').lower() == 'resource': + if int(mult_params['reuse_factor']) <= int(mult_params['n_in']): + mult_params['dense_function'] = 'DenseResource_rf_leq_nin' + else: + mult_params['dense_function'] = 'DenseResource_rf_gt_nin_rem0' + # The 3rd case is never used + elif node.get_attr('strategy').lower() == 'unrolled': + mult_params['dense_function'] = f'dense_unrolled_{node.index}' + mult_config = self.mult_template.format(**mult_params) return mult_config + '\n' + conv_config diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index 5f1a25e37f..16973b7fe2 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -9,9 +9,6 @@ static const unsigned n_out = {n_out}; static const unsigned io_type = nnet::{iotype}; static const unsigned strategy = nnet::{strategy}; - static const unsigned resource_implementation = nnet::{dense_resource_implementation}; - template - using dense_unrolled = nnet::{unrolled_function}; static const unsigned reuse_factor = {reuse}; static const unsigned n_zeros = {nzeros}; static const unsigned n_nonzeros = {nonzeros}; @@ -21,6 +18,8 @@ typedef {bias_t.name} bias_t; typedef {weight_t.name} weight_t; typedef {index_t.name} index_t; + template + using kernel = nnet::{dense_function}; template using product = nnet::product::{product_type}; }};\n""" @@ -43,14 +42,16 @@ def format(self, node): node.get_input_variable().type.precision, node.get_weights('weight').type.precision ) - if ( - node.get_attr('dense_resource_implementation', 'standard') == 'unrolled' - and node.get_attr('strategy').lower() == 'resource' - and node.get_attr('reuse_factor') > 1 - ): - params['unrolled_function'] = f'dense_unrolled_{node.index}' - else: - params['unrolled_function'] = 'DenseResourceUnrolled' + if node.get_attr('strategy').lower() == 'latency': + params['dense_function'] = 'DenseLatency' + elif node.get_attr('strategy').lower() == 'resource': + if int(params['reuse_factor']) <= 
int(params['n_in']): + params['dense_function'] = 'DenseResource_rf_leq_nin' + else: + params['dense_function'] = 'DenseResource_rf_gt_nin_rem0' + # The 3rd case is never used + elif node.get_attr('strategy').lower() == 'unrolled': + params['dense_function'] = f'dense_unrolled_{node.index}' return self.template.format(**params) diff --git a/hls4ml/backends/vivado/passes/recurrent_templates.py b/hls4ml/backends/vivado/passes/recurrent_templates.py index e5c3937fd3..34e3e2f9f0 100644 --- a/hls4ml/backends/vivado/passes/recurrent_templates.py +++ b/hls4ml/backends/vivado/passes/recurrent_templates.py @@ -11,15 +11,13 @@ static const unsigned reuse_factor = {reuse}; static const unsigned n_zeros = {nzeros}; static const unsigned n_nonzeros = {nonzeros}; - static const unsigned resource_implementation = nnet::{dense_resource_implementation}; - template - using dense_unrolled = nnet::{unrolled_function}; static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; static const bool store_weights_in_bram = false; typedef {accum_t.name} accum_t; typedef {bias_t.name} bias_t; typedef {weight_t.name} weight_t; - typedef {index_t.name} index_t; + template + using kernel = nnet::{dense_function}; template using product = nnet::product::{product_type}; }};\n""" @@ -116,11 +114,11 @@ def format(self, node): act_params['type'] = node.get_attr('activation') recr_act_params['type'] = node.get_attr('recurrent_activation') if node.get_attr('return_sequences'): - act_params['n_in'] = node.get_output_variable().dim_names[1] - recr_act_params['n_in'] = node.get_output_variable().dim_names[1] + ' * %i' % (n_recr_mult - 1) + act_params['n_in'] = node.get_output_variable().shape[1] + recr_act_params['n_in'] = node.get_output_variable().shape[1] * (n_recr_mult - 1) else: - act_params['n_in'] = node.get_output_variable().dim_names[0] - recr_act_params['n_in'] = node.get_output_variable().dim_names[0] + ' * %i' % (n_recr_mult - 1) + act_params['n_in'] = node.get_output_variable().shape[0] + recr_act_params['n_in'] = node.get_output_variable().shape[0] * (n_recr_mult - 1) act_config = self.act_template.format(**act_params) recr_act_config = self.recr_act_template.format(**recr_act_params) @@ -128,11 +126,11 @@ def format(self, node): mult_params1 = self._default_config_params(node) mult_params2 = self._default_config_params(node) - mult_params1['n_in'] = node.get_input_variable().dim_names[1] + mult_params1['n_in'] = node.get_input_variable().shape[1] if node.get_attr('return_sequences'): - mult_params1['n_out'] = node.get_output_variable().dim_names[1] + ' * %i' % n_recr_mult + mult_params1['n_out'] = node.get_output_variable().shape[1] * n_recr_mult else: - mult_params1['n_out'] = node.get_output_variable().dim_names[0] + ' * %i' % n_recr_mult + mult_params1['n_out'] = node.get_output_variable().shape[0] * n_recr_mult mult_params1['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('weight').type.precision ) @@ -141,15 +139,23 @@ def format(self, node): mult_params1['nzeros'] = node.get_weights('weight').nzeros mult_params1['nonzeros'] = node.get_weights('weight').nonzeros - # TODO - Extend unrolled Dense Resource to recurrent kernels - mult_params1['unrolled_function'] = 'DenseResourceUnrolled' + if node.get_attr('strategy').lower() == 'latency': + mult_params1['dense_function'] = 'DenseLatency' + elif node.get_attr('strategy').lower() == 'resource': + if int(mult_params1['reuse_factor']) <= 
int(mult_params1['n_in']): + mult_params1['dense_function'] = 'DenseResource_rf_leq_nin' + else: + mult_params1['dense_function'] = 'DenseResource_rf_gt_nin_rem0' + # The 3rd case is never used + elif node.get_attr('strategy').lower() == 'unrolled': + mult_params1['dense_function'] = f'dense_unrolled_{node.index}_1' if node.get_attr('return_sequences'): - mult_params2['n_in'] = node.get_output_variable().dim_names[1] - mult_params2['n_out'] = node.get_output_variable().dim_names[1] + ' * %i' % n_recr_mult + mult_params2['n_in'] = node.get_output_variable().shape[1] + mult_params2['n_out'] = node.get_output_variable().shape[1] * n_recr_mult else: - mult_params2['n_in'] = node.get_output_variable().dim_names[0] - mult_params2['n_out'] = node.get_output_variable().dim_names[0] + ' * %i' % n_recr_mult + mult_params2['n_in'] = node.get_output_variable().shape[0] + mult_params2['n_out'] = node.get_output_variable().shape[0] * n_recr_mult mult_params2['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('recurrent_weight').type.precision ) @@ -158,8 +164,16 @@ def format(self, node): mult_params2['nzeros'] = node.get_weights('recurrent_weight').nzeros mult_params2['nonzeros'] = node.get_weights('recurrent_weight').nonzeros - # TODO - Extend unrolled Dense Resource to recurrent kernels - mult_params2['unrolled_function'] = 'DenseResourceUnrolled' + if node.get_attr('strategy').lower() == 'latency': + mult_params2['dense_function'] = 'DenseLatency' + elif node.get_attr('strategy').lower() == 'resource': + if int(mult_params2['reuse_factor']) <= int(mult_params2['n_in']): + mult_params2['dense_function'] = 'DenseResource_rf_leq_nin' + else: + mult_params2['dense_function'] = 'DenseResource_rf_gt_nin_rem0' + # The 3rd case is never used + elif node.get_attr('strategy').lower() == 'unrolled': + mult_params2['dense_function'] = f'dense_unrolled_{node.index}_2' mult_config1 = self.mult1_template.format(**mult_params1) mult_config2 = self.mult2_template.format(**mult_params2) diff --git a/hls4ml/backends/vivado/passes/resource_strategy.py b/hls4ml/backends/vivado/passes/resource_strategy.py index 63e6e0b4db..d65b0dc48e 100644 --- a/hls4ml/backends/vivado/passes/resource_strategy.py +++ b/hls4ml/backends/vivado/passes/resource_strategy.py @@ -9,7 +9,7 @@ class ApplyResourceStrategy(OptimizerPass): def match(self, node): node_matches = isinstance(node, (Dense, Conv1D, SeparableConv1D, Conv2D, SeparableConv2D, LSTM, GRU)) - is_resource_strategy = node.get_attr('strategy', '').lower() == 'resource' + is_resource_strategy = node.get_attr('strategy', '').lower() in ['resource', 'unrolled'] already_transformed = node.get_attr('_weights_transposed', False) is True return node_matches and is_resource_strategy and not already_transformed diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 35cc908ed4..6c5deccc68 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -75,15 +75,6 @@ def _register_layer_attributes(self): attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer')) self.attribute_map[layer] = attrs - # Add implementation of Dense Resource for all layers that use Dense for matrix mult - # Handle different implementations of Resource strategy; only makes a difference if strategy == Resource - # Standard -> nnet_dense_resource.h - # Unrolled -> Code generation, ignoring zero DSPs and optimizing 
zero-filled BRAM blocks - for layer in [Dense] + cnn_layers + rnn_layers: - attrs = self.attribute_map.get(layer, []) - attrs.append( - ChoiceAttribute('dense_resource_implementation', choices=['standard', 'unrolled'], default='standard') - ) sep_conv_layers = [SeparableConv1D, SeparableConv2D] for layer in sep_conv_layers: attrs = self.attribute_map.get(layer, []) @@ -259,6 +250,11 @@ def init_dense(self, layer): index_t = layer.get_weights('weight').type.index_precision else: layer.set_attr('strategy', 'resource') + elif layer.model.config.get_strategy(layer).lower() == 'unrolled' and layer.get_attr('reuse_factor', 1) > 1: + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('strategy', 'unrolled') else: layer.set_attr('strategy', 'latency') layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', index_t)) @@ -275,6 +271,11 @@ def init_conv1d(self, layer): n_in, n_out = self.get_layer_mult_size(layer) self.set_target_reuse_factor(layer) self.set_closest_reuse_factor(layer, n_in, n_out) + elif layer.model.config.get_strategy(layer).lower() == 'unrolled' and layer.get_attr('reuse_factor', 1) > 1: + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('strategy', 'unrolled') else: layer.set_attr('strategy', 'latency') @@ -334,6 +335,11 @@ def init_conv2d(self, layer): self.set_target_reuse_factor(layer) n_in, n_out = self.get_layer_mult_size(layer) self.set_closest_reuse_factor(layer, n_in, n_out) + elif layer.model.config.get_strategy(layer).lower() == 'unrolled' and layer.get_attr('reuse_factor', 1) > 1: + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('strategy', 'unrolled') else: layer.set_attr('strategy', 'latency') @@ -453,6 +459,11 @@ def init_lstm(self, layer): self.set_closest_reuse_factor(layer, n_in, n_out) self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') layer.set_attr('strategy', 'resource') + elif layer.model.config.get_strategy(layer).lower() == 'unrolled' and layer.get_attr('reuse_factor', 1) > 1: + n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') + layer.set_attr('strategy', 'unrolled') else: layer.set_attr('strategy', 'latency') @@ -471,6 +482,11 @@ def init_gru(self, layer): self.set_closest_reuse_factor(layer, n_in, n_out) self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') layer.set_attr('strategy', 'resource') + elif layer.model.config.get_strategy(layer).lower() == 'unrolled' and layer.get_attr('reuse_factor', 1) > 1: + n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') + layer.set_attr('strategy', 'unrolled') else: layer.set_attr('strategy', 'latency') diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h index caab69663e..4a8a40cd10 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h @@ -5,38 +5,11 @@ 
#include "hls_stream.h" #include "nnet_common.h" +#include "nnet_function_stubs.h" #include "nnet_mult.h" namespace nnet { -template class FillConv1DBuffer { - public: - static void fill_buffer(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan], - const unsigned partition) { - // To be implemented in subclasses - } -}; - -template class FillConv2DBuffer { - public: - static void - fill_buffer(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], - const unsigned partition) { - // To be implemented in subclasses - } -}; - -template class DenseResourceUnrolled { - public: - static void dense_unrolled(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - // To be implemented in subclasses - } -}; - // hls4ml insert code } // namespace nnet diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index fed0395a1a..fee8b7b935 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -23,7 +23,7 @@ namespace nnet { // Common type definitions enum io_type { io_parallel = 0, io_stream }; -enum strategy { latency, resource }; +enum strategy { latency, resource, unrolled }; /* --- * Balanced tree reduce implementation. diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h index b23c330c78..4a55700d8d 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h @@ -60,6 +60,10 @@ void conv_1d_buffer_cl(hls::stream &data, hls::stream &res, typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + if (CONFIG_T::strategy == nnet::unrolled && CONFIG_T::reuse_factor > 1) { + #pragma HLS allocation instances=compute_output_buffer_1d limit=1 function + } + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { #pragma HLS LOOP_FLATTEN diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h index 08d06501c3..d5583f2669 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h @@ -75,8 +75,7 @@ void conv_2d_buffer_cl( [CONFIG_T::n_chan]; #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 - if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled && - CONFIG_T::reuse_factor > 1) { + if (CONFIG_T::strategy == nnet::unrolled && CONFIG_T::reuse_factor > 1) { #pragma HLS allocation instances=compute_output_buffer_1d limit=1 function #pragma HLS allocation instances=compute_output_buffer_2d limit=1 function } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h index d95d528e46..dcd914dffe 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h @@ -95,13 +95,8 @@ void mult_buffer(hls::stream data_window[CONFIG_T:: } #pragma HLS INLINE recursive - if (CONFIG_T::strategy == nnet::latency) { - dense_latency( - data, res, 
weights, biases); - } else { - dense_resource( - data, res, weights, biases); - } + CONFIG_T::mult_config::template kernel::dense(data, res, weights, biases); CastLoop: for (unsigned jj = 0; jj < CONFIG_T::n_filt; jj++) { @@ -290,18 +285,8 @@ void compute_output_buffer_2d( // Dense multiply // #pragma HLS INLINE recursive - if (CONFIG_T::strategy == nnet::latency) { - dense_latency( - kernel_data, res_out, weights, biases); - } else if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled && - CONFIG_T::reuse_factor > 1) { - CONFIG_T::template dense_unrolled::dense_unrolled(kernel_data, res_out, weights, - biases); - } else { - dense_resource( - kernel_data, res_out, weights, biases); - } + CONFIG_T::mult_config::template kernel::dense(kernel_data, res_out, weights, biases); // Pack output CastLoop: @@ -366,18 +351,8 @@ void compute_output_buffer_1d( // Dense multiply // #pragma HLS INLINE recursive - if (CONFIG_T::strategy == nnet::latency) { - dense_latency( - kernel_data, res_out, weights, biases); - } else if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled && - CONFIG_T::reuse_factor > 1) { - CONFIG_T::template dense_unrolled::dense_unrolled(kernel_data, res_out, weights, - biases); - } else { - dense_resource( - kernel_data, res_out, weights, biases); - } + CONFIG_T::mult_config::template kernel::dense(kernel_data, res_out, weights, biases); // Pack output CastLoop: diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h index 2037daf0b9..d6c7beb70e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h @@ -5,17 +5,13 @@ #include "nnet_common.h" #include "nnet_dense_latency.h" #include "nnet_dense_resource.h" +#include "nnet_function_stubs.h" #include "nnet_helpers.h" #include "nnet_mult.h" #include namespace nnet { -// Different implementations of Resource strategy; this attribute only makes a difference if strategy == Resource -// Default -> nnet_dense_resource.h -// Unrolled -> Code generation, ignoring zero DSPs and optimizing BRAM -enum resource_implementation { standard, unrolled }; - struct dense_config { // Internal data type definitions typedef float bias_t; @@ -33,9 +29,7 @@ struct dense_config { static const bool store_weights_in_bram = false; static const unsigned n_zeros = 0; - static const unsigned resource_implementation = standard; - template - using dense_unrolled = nnet::DenseResourceUnrolled; + template using kernel = nnet::DenseKernel; // Partitioning arrays cyclically to go with roll factors? 
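// Sketch (assumed names, not part of the patch): after this refactor a generated layer
// config binds its matrix-multiplication kernel at compile time instead of branching on
// `strategy` at run time, e.g.
//
//     struct example_mult_config : nnet::dense_config {
//         template <class data_T, class res_T, class CONFIG_T>
//         using kernel = nnet::DenseLatency<data_T, res_T, CONFIG_T>;
//     };
//
// nnet::dense<data_T, res_T, example_mult_config>(data, res, weights, biases) then
// forwards straight to DenseLatency<...>::dense() with no conditional dispatch.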
@@ -47,16 +41,41 @@ template <class data_T, class res_T, typename CONFIG_T> void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - #pragma HLS inline - if (CONFIG_T::strategy == nnet::latency) { + #pragma HLS INLINE + CONFIG_T::template kernel<data_T, res_T, CONFIG_T>::dense(data, res, weights, biases); +} + +template <class data_T, class res_T, typename CONFIG_T> class DenseLatency : public DenseKernel<data_T, res_T, CONFIG_T> { + public: + static void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS INLINE dense_latency<data_T, res_T, CONFIG_T>(data, res, weights, biases); - } else if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled && - CONFIG_T::reuse_factor > 1) { - CONFIG_T::template dense_unrolled<data_T, res_T, CONFIG_T>::dense_unrolled(data, res, weights, biases); - } else { - dense_resource<data_T, res_T, CONFIG_T>(data, res, weights, biases); } -} +}; + +template <class data_T, class res_T, typename CONFIG_T> +class DenseResource_rf_leq_nin : public DenseKernel<data_T, res_T, CONFIG_T> { + public: + static void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS INLINE + dense_resource_rf_leq_nin<data_T, res_T, CONFIG_T>(data, res, weights, biases); + } +}; + +template <class data_T, class res_T, typename CONFIG_T> +class DenseResource_rf_gt_nin_rem0 : public DenseKernel<data_T, res_T, CONFIG_T> { + public: + static void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS INLINE + dense_resource_rf_gt_nin_rem0<data_T, res_T, CONFIG_T>(data, res, weights, biases); + } +}; } // namespace nnet diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h index db3039fc33..3e3183480e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h @@ -16,13 +16,8 @@ void dense_wrapper(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - dense_latency<data_T, res_T, CONFIG_T>(data, res, weights, biases); - } else if (CONFIG_T::strategy == nnet::resource && CONFIG_T::resource_implementation == nnet::unrolled && - CONFIG_T::reuse_factor > 1) { - CONFIG_T::template dense_unrolled<data_T, res_T, CONFIG_T>::dense_unrolled(data, res, weights, biases); - } else { - dense_resource<data_T, res_T, CONFIG_T>(data, res, weights, biases); } + CONFIG_T::template kernel<data_T, res_T, CONFIG_T>::dense(data, res, weights, biases); } template diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h b/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h new file mode 100644 index 0000000000..1316bbe776 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h @@ -0,0 +1,42 @@ +#ifndef NNET_FUNCTION_STUBS_H_ +#define NNET_FUNCTION_STUBS_H_ + +#include "nnet_helpers.h" + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_mult.h" + +namespace nnet { + +template <class data_T, typename CONFIG_T> class FillConv1DBuffer { + public: + static void fill_buffer(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { + // To be implemented in subclasses + } +}; + +template <class data_T, typename CONFIG_T> class FillConv2DBuffer { + public: + static void + fill_buffer(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T
buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { + // To be implemented in subclasses + } +}; + +template <class data_T, class res_T, typename CONFIG_T> class DenseKernel { + public: + static void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + // To be implemented in subclasses + } +}; + +} // namespace nnet + +#endif diff --git a/test/pytest/test_dense_unrolled.py b/test/pytest/test_dense_unrolled.py index a3318049be..6b7503c543 100644 --- a/test/pytest/test_dense_unrolled.py +++ b/test/pytest/test_dense_unrolled.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from tensorflow.keras.layers import Conv2D, Dense, Flatten +from tensorflow.keras.layers import GRU, LSTM, Conv1D, Conv2D, Dense, Flatten from tensorflow.keras.models import Sequential from hls4ml.converters import convert_from_keras_model @@ -14,37 +14,51 @@ # Tests a wide range of RF to ensure the unrolled Dense is correct @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) @pytest.mark.parametrize('reuse_factor', [1, 2, 4, 8, 16, 32, 48, 64, 96, 192]) -def test_dense_unrolled(io_type, reuse_factor): +@pytest.mark.parametrize('backend', ['Vitis', 'Vivado']) +def test_dense_unrolled(io_type, reuse_factor, backend): input_shape = (16,) X = np.random.rand(100, *input_shape) model = Sequential() - model.add(Dense(12, input_shape=input_shape, kernel_initializer='lecun_uniform', bias_initializer='lecun_uniform')) + model.add( + Dense( + 12, input_shape=input_shape, kernel_initializer='lecun_uniform', bias_initializer='lecun_uniform', name='dense' + ) + ) model.compile('adam', 'mse') keras_prediction = model.predict(X) - config = config_from_keras_model(model, default_precision='ac_fixed<32, 16>', default_reuse_factor=reuse_factor) - config['Model']['Strategy'] = 'Resource' - config['Model']['DenseResourceImplementation'] = 'Unrolled' + config = config_from_keras_model( + model, default_precision='ac_fixed<32, 16>', backend=backend, default_reuse_factor=reuse_factor + ) + config['Model']['Strategy'] = 'Unrolled' + + output_dir = str(test_root_path / f'hls4mlprj_dense_unrolled_{io_type}_{reuse_factor}_{backend}') + hls_model = convert_from_keras_model(model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type) + + # Check that the strategy was not overridden + assert list(hls_model.get_layers())[1].get_attr('strategy') == ('unrolled' if reuse_factor > 1 else 'latency') - output_dir = str(test_root_path / f'hls4mlprj_dense_unrolled_{io_type}_{reuse_factor}') - hls_model = convert_from_keras_model(model, hls_config=config, output_dir=output_dir, backend='Vivado', io_type=io_type) hls_model.compile() hls_prediction = hls_model.predict(X) np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=1e-2) # Tests a wide range of RF on streaming Conv1D/2D to ensure the unrolled Dense is correct @pytest.mark.parametrize('dim', [1, 2]) @pytest.mark.parametrize('io_type', ['io_stream']) @pytest.mark.parametrize('reuse_factor', [1, 3, 9, 27, 54, 108]) -def test_dense_unrolled_streaming_conv(io_type, reuse_factor): - input_shape = (8, 8, 3) +def test_dense_unrolled_streaming_conv(dim, io_type, reuse_factor): + input_shape = (8,) * dim + (3,) X = np.random.rand(100, *input_shape) + conv_class = Conv1D if dim == 1 else Conv2D model = Sequential()
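    # The reuse_factor sweep above covers both generated unrolled code paths described
    # earlier in this patch: rf <= n_in, and rf > n_in with rf % n_in == 0 (the conv
    # kernels here give n_in = 3 * 3 = 9 for Conv1D and 3 * 3 * 3 = 27 for Conv2D).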
model.add( - Conv2D(4, (3, 3), input_shape=input_shape, kernel_initializer='lecun_uniform', bias_initializer='lecun_uniform') + conv_class( + 4, (3,) * dim, input_shape=input_shape, kernel_initializer='lecun_uniform', bias_initializer='lecun_uniform' + ) ) model.add(Flatten()) model.add(Dense(1, kernel_initializer='lecun_uniform', bias_initializer='lecun_uniform')) @@ -52,12 +66,60 @@ def test_dense_unrolled_streaming_conv(io_type, reuse_factor): keras_prediction = model.predict(X) config = config_from_keras_model(model, default_precision='ac_fixed<32, 16>', default_reuse_factor=reuse_factor) - config['Model']['Strategy'] = 'Resource' - config['Model']['DenseResourceImplementation'] = 'Unrolled' + config['Model']['Strategy'] = 'Unrolled' - output_dir = str(test_root_path / f'hls4mlprj_dense_unrolled_conv2d_{io_type}_{reuse_factor}') + output_dir = str(test_root_path / f'hls4mlprj_dense_unrolled_conv{dim}d_{io_type}_{reuse_factor}') hls_model = convert_from_keras_model(model, hls_config=config, output_dir=output_dir, backend='Vivado', io_type=io_type) + + # Check that the strategy was not overridden + assert list(hls_model.get_layers())[1].get_attr('strategy') == ('unrolled' if reuse_factor > 1 else 'latency') + hls_model.compile() hls_prediction = hls_model.predict(X) np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=1e-2) + + +@pytest.mark.parametrize('rnn_layer', [LSTM, GRU]) +@pytest.mark.parametrize('backend', ['Vitis', 'Vivado']) +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +@pytest.mark.parametrize('static', [True, False]) +@pytest.mark.parametrize('reuse_factor', [1, 4, 32, 128]) # These should be enough +def test_rnn_unrolled(rnn_layer, backend, io_type, static, reuse_factor): + # Subtract 0.5 to include negative values + input_shape = (12, 8) + X = np.random.rand(50, *input_shape) - 0.5 + + layer_name = rnn_layer.__name__.lower() + keras_model = Sequential() + keras_model.add( + rnn_layer( + units=8, + input_shape=input_shape, + kernel_initializer='lecun_uniform', + recurrent_initializer='lecun_uniform', + bias_initializer='lecun_uniform', + return_sequences=False, + name=layer_name, + ) + ) + keras_model.compile() + + default_precision = 'ap_fixed<32, 16>' if backend in ['Vivado', 'Vitis'] else 'ac_fixed<32, 16, true>' + hls_config = config_from_keras_model( + keras_model, granularity='name', default_precision=default_precision, backend=backend + ) + hls_config['LayerName'][layer_name]['static'] = static + hls_config['LayerName'][layer_name]['Strategy'] = 'Unrolled' + hls_config['LayerName'][layer_name]['ReuseFactor'] = reuse_factor + prj_name = f'hls4mlprj_rnn_unrolled_{layer_name}_static_{int(static)}_{io_type}_{reuse_factor}_{backend}' + output_dir = str(test_root_path / prj_name) + + hls_model = convert_from_keras_model( + keras_model, hls_config=hls_config, output_dir=output_dir, backend=backend, io_type=io_type + ) + hls_model.compile() + + keras_prediction = keras_model.predict(X) + hls_prediction = hls_model.predict(X) + np.testing.assert_allclose(hls_prediction.flatten(), keras_prediction.flatten(), rtol=0.0, atol=5e-2) From 4497631a12190e9f3067c5e44aed4433de91af6a Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Mon, 15 Jul 2024 16:12:39 -0500 Subject: [PATCH 079/272] add an assert checking that multiplier limit is 1 --- hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h | 2 ++ hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h | 3 +++ hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h | 2 ++
hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h | 3 +++ 4 files changed, 10 insertions(+) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h index 2f7e57a502..beacbbe4ec 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h @@ -32,6 +32,8 @@ void depthwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c // Limit multipliers to control parallelization #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + assert((CONFIG_T::n_filt == CONFIG_T::n_chan) && "only a depth multiplier of 1 is currently supported"); + PartitionLoop: for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h index 254fc5067b..ca3143d01e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h @@ -61,6 +61,9 @@ template void depthwise_conv_1d_cl(hls::stream &data, hls::stream &res, typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + + assert((CONFIG_T::n_filt == CONFIG_T::n_chan) && "only a depth multiplier of 1 is currently supported"); + #pragma HLS inline recursive switch (CONFIG_T::implementation) { case conv_implementation::linebuffer: diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h index 00729ac4c2..d8adedc7ec 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h @@ -33,6 +33,8 @@ void depthwise_conv_2d_latency_cl( // Limit multipliers to control parallelization #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + assert((CONFIG_T::n_filt == CONFIG_T::n_chan) && "only a depth multiplier of 1 is currently supported"); + PartitionLoop: for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h index d56ed6d9a4..7f4dd866c9 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h @@ -81,6 +81,9 @@ void depthwise_conv_2d_cl( hls::stream &data, hls::stream &res, typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + + assert((CONFIG_T::n_filt == CONFIG_T::n_chan) && "only a depth multiplier of 1 is currently supported"); + #pragma HLS inline recursive switch (CONFIG_T::implementation) { case conv_implementation::linebuffer: From ad39b8a50bff4f6de83055dbb3fc39be3dd61d2f Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Mon, 15 Jul 2024 16:23:04 -0500 Subject: [PATCH 080/272] remove unused reuse factor and accum attributes for separable --- hls4ml/backends/fpga/fpga_backend.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 672627e35f..479af8ebf3 100644 --- 
a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -55,8 +55,6 @@ def __init__(self, name): Dense, Conv1D, Conv2D, - SeparableConv1D, - SeparableConv2D, Pooling1D, Pooling2D, GlobalPooling1D, From 13b6dbb2eb1dbbf4eb3f9f6a2cf790579665bfc8 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Mon, 15 Jul 2024 16:34:57 -0500 Subject: [PATCH 081/272] revert unneeded conv_same_pad change --- hls4ml/backends/vivado/passes/conv_same_pad.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hls4ml/backends/vivado/passes/conv_same_pad.py b/hls4ml/backends/vivado/passes/conv_same_pad.py index dd282f34e3..bb8354a3d0 100644 --- a/hls4ml/backends/vivado/passes/conv_same_pad.py +++ b/hls4ml/backends/vivado/passes/conv_same_pad.py @@ -1,4 +1,4 @@ -from hls4ml.model.layers import Conv1D, Conv2D, DepthwiseConv1D, DepthwiseConv2D, SeparableConv1D, SeparableConv2D +from hls4ml.model.layers import Conv1D, Conv2D, SeparableConv1D, SeparableConv2D from hls4ml.model.optimizer import OptimizerPass @@ -7,7 +7,7 @@ class InsertZeroPaddingBeforeConv1D(OptimizerPass): def match(self, node): is_match = ( - isinstance(node, (Conv1D, DepthwiseConv1D, SeparableConv1D)) + isinstance(node, (Conv1D, SeparableConv1D)) and ((node.get_attr('padding') == 'same') or (node.get_attr('padding') == 'causal')) and node.get_attr('filt_width') != 1 ) @@ -55,7 +55,7 @@ class InsertZeroPaddingBeforeConv2D(OptimizerPass): def match(self, node): is_match = ( - isinstance(node, (Conv2D, DepthwiseConv2D, SeparableConv2D)) + isinstance(node, (Conv2D, SeparableConv2D)) and node.get_attr('padding') == 'same' and node.get_attr('filt_height') != 1 and node.get_attr('filt_width') != 1 From 5c7721608b824a514687fd26e1431829b2c5189b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 22:40:52 +0000 Subject: [PATCH 082/272] [pre-commit.ci] auto fixes from pre-commit hooks --- hls4ml/backends/vivado/vivado_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 98700a4893..3ca497e49f 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -178,7 +178,7 @@ def create_initial_config( self, part='xcvu13p-flga2577-2-e', clock_period=5, - clock_uncertainty='12.5%', + clock_uncertainty='12.5%', io_type='io_parallel', namespace=None, write_weights_txt=False, From 74bf27f0442bc315673c0134e755a8dd96a1fc11 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 15 Jul 2024 17:06:32 -0700 Subject: [PATCH 083/272] pre-commit --- test/pytest/test_weight_writer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/test/pytest/test_weight_writer.py b/test/pytest/test_weight_writer.py index faa05981da..62281c071a 100644 --- a/test/pytest/test_weight_writer.py +++ b/test/pytest/test_weight_writer.py @@ -19,14 +19,18 @@ def test_weight_writer(k, i, f): u = '' if k else 'u' dtype = f'{u}fixed<{b}, {i}>' hls_config = {'LayerName': {'dense': {'Precision': {'weight': dtype}}}} + model = keras.Sequential([keras.layers.Dense(1, input_shape=(1,), name='dense')]) model.layers[0].kernel.assign(keras.backend.constant(w)) output_dir = str(test_root_path / f'hls4ml_prj_test_weight_writer_{dtype}') - model_hls = hls4ml.converters.convert_from_keras_model(model, hls_config=hls_config, output_dir=output_dir) + + model_hls = hls4ml.converters.convert_from_keras_model( + 
model, hls_config=hls_config, output_dir=output_dir, write_weights_txt=True + ) model_hls.write() + w_paths = glob(str(Path(output_dir) / 'firmware/weights/w*.txt')) - print(w_paths[0]) assert len(w_paths) == 1 + w_loaded = np.loadtxt(w_paths[0], delimiter=',').reshape(1, 1) - print(f'{w[0, 0]:.14}', f'{w_loaded[0, 0]:.14}') assert np.all(w == w_loaded) From 74909cab65e321cd2c7dfdac7c8f9b90bcb6fc44 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 15 Jul 2024 18:12:51 -0700 Subject: [PATCH 084/272] fix --- hls4ml/backends/vitis/vitis_backend.py | 31 +++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 2a0616a198..151d0b4114 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -34,8 +34,32 @@ def _register_flows(self): self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) def create_initial_config( - self, part='xcvu13p-flga2577-2-e', clock_period=5, clock_uncertainty='27%', io_type='io_parallel', **_ + self, + part='xcvu13p-flga2577-2-e', + clock_period=5, + clock_uncertainty='27%', + io_type='io_parallel', + namespace=None, + write_weights_txt=False, + write_tar=True, + **_, ): + """Create initial configuration of the Vitis backend. + + Args: + part (str, optional): The FPGA part to be used. Defaults to 'xcvu13p-flga2577-2-e'. + clock_period (int, optional): The clock period. Defaults to 5. + clock_uncertainty (str, optional): The clock uncertainty. Defaults to 27%. + io_type (str, optional): Type of implementation used. One of + 'io_parallel' or 'io_stream'. Defaults to 'io_parallel'. + namespace (str, optional): If defined, place all generated code within a namespace. Defaults to None. + write_weights_txt (bool, optional): If True, writes weights to .txt files which speeds up compilation. + Defaults to False. + write_tar (bool, optional): If True, compresses the output directory into a .tar.gz file. Defaults to True. + + Returns: + dict: initial configuration. 
+ """ config = {} config['Part'] = part if part is not None else 'xcvu13p-flga2577-2-e' @@ -43,6 +67,11 @@ def create_initial_config( config['ClockUncertainty'] = clock_uncertainty if clock_uncertainty is not None else '27%' config['IOType'] = io_type if io_type is not None else 'io_parallel' config['HLSConfig'] = {} + config['WriterConfig'] = { + 'Namespace': namespace, + 'WriteWeightsTxt': write_weights_txt, + 'WriteTar': write_tar, + } return config From f2727400d91b3b1613ba937da66781bab2a51689 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 15 Jul 2024 19:27:46 -0700 Subject: [PATCH 085/272] fix default --- hls4ml/model/graph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 4d11e37a64..ff68ae109b 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -56,8 +56,8 @@ def __init__(self, config): else: self.writer_config = { 'Namespace': None, - 'WriteWeightsTxt': True, - 'WriteTar': False, + 'WriteWeightsTxt': False, + 'WriteTar': True, } self._parse_hls_config() From 6a4818054e64dad782df07db334a3201944e6ba2 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Tue, 16 Jul 2024 06:20:37 -0700 Subject: [PATCH 086/272] reverse defaults --- hls4ml/backends/vitis/vitis_backend.py | 4 ++-- hls4ml/backends/vivado/vivado_backend.py | 4 ++-- hls4ml/model/graph.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 151d0b4114..745e1c290b 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -40,8 +40,8 @@ def create_initial_config( clock_uncertainty='27%', io_type='io_parallel', namespace=None, - write_weights_txt=False, - write_tar=True, + write_weights_txt=True, + write_tar=False, **_, ): """Create initial configuration of the Vitis backend. diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 3ca497e49f..a8f8167a6f 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -181,8 +181,8 @@ def create_initial_config( clock_uncertainty='12.5%', io_type='io_parallel', namespace=None, - write_weights_txt=False, - write_tar=True, + write_weights_txt=True, + write_tar=False, **_, ): """Create initial configuration of the Vivado backend. diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index ff68ae109b..4d11e37a64 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -56,8 +56,8 @@ def __init__(self, config): else: self.writer_config = { 'Namespace': None, - 'WriteWeightsTxt': False, - 'WriteTar': True, + 'WriteWeightsTxt': True, + 'WriteTar': False, } self._parse_hls_config() From 1be29ad3eddb81654f7dd17417a366a9bceeeeaa Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Tue, 16 Jul 2024 06:25:53 -0700 Subject: [PATCH 087/272] edit doc --- hls4ml/backends/vitis/vitis_backend.py | 4 ++-- hls4ml/backends/vivado/vivado_backend.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 745e1c290b..89484237f3 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -54,8 +54,8 @@ def create_initial_config( 'io_parallel' or 'io_stream'. Defaults to 'io_parallel'. namespace (str, optional): If defined, place all generated code within a namespace. Defaults to None. 
write_weights_txt (bool, optional): If True, writes weights to .txt files which speeds up compilation. - Defaults to False. - write_tar (bool, optional): If True, compresses the output directory into a .tar.gz file. Defaults to True. + Defaults to True. + write_tar (bool, optional): If True, compresses the output directory into a .tar.gz file. Defaults to False. Returns: dict: initial configuration. diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index a8f8167a6f..1c27dd3175 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -195,8 +195,8 @@ def create_initial_config( 'io_parallel' or 'io_stream'. Defaults to 'io_parallel'. namespace (str, optional): If defined, place all generated code within a namespace. Defaults to None. write_weights_txt (bool, optional): If True, writes weights to .txt files which speeds up compilation. - Defaults to False. - write_tar (bool, optional): If True, compresses the output directory into a .tar.gz file. Defaults to True. + Defaults to True. + write_tar (bool, optional): If True, compresses the output directory into a .tar.gz file. Defaults to False. Returns: dict: initial configuration. From 88e1f9b37ed905b541392e36f95dbec65d22e64f Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Tue, 16 Jul 2024 10:18:16 -0400 Subject: [PATCH 088/272] change upsampling test to changed interface for channels_last conversion --- test/pytest/test_upsampling_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pytest/test_upsampling_pytorch.py b/test/pytest/test_upsampling_pytorch.py index 4a6c69ede4..62cedd0263 100644 --- a/test/pytest/test_upsampling_pytorch.py +++ b/test/pytest/test_upsampling_pytorch.py @@ -56,7 +56,7 @@ def test_pytorch_upsampling1d(data_1d, io_type, backend): config = hls4ml.utils.config_from_pytorch_model( model, default_precision='ap_fixed<16,6>', - inputs_channel_last=True, # We don't test channels_last conversion here + channels_last_conversion="off", # We don't test channels_last conversion here transpose_outputs=False, ) odir = str(test_root_path / f'hls4mlprj_pytorch_upsampling_1d_{backend}_{io_type}') From 229dc7b4082ee55487ffd8a648abf75ec912a376 Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Tue, 16 Jul 2024 12:43:42 -0400 Subject: [PATCH 089/272] another fix to pytests for upsampling --- test/pytest/test_upsampling_pytorch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/pytest/test_upsampling_pytorch.py b/test/pytest/test_upsampling_pytorch.py index 62cedd0263..e881c39bbf 100644 --- a/test/pytest/test_upsampling_pytorch.py +++ b/test/pytest/test_upsampling_pytorch.py @@ -56,7 +56,7 @@ def test_pytorch_upsampling1d(data_1d, io_type, backend): config = hls4ml.utils.config_from_pytorch_model( model, default_precision='ap_fixed<16,6>', - channels_last_conversion="off", # We don't test channels_last conversion here + channels_last_conversion="internal", transpose_outputs=False, ) odir = str(test_root_path / f'hls4mlprj_pytorch_upsampling_1d_{backend}_{io_type}') @@ -85,7 +85,7 @@ def test_pytorch_upsampling2d(data_2d, io_type, backend): config = hls4ml.utils.config_from_pytorch_model( model, default_precision='ap_fixed<16,6>', - inputs_channel_last=False, # With conversion to channels_last + channels_last_conversion="full", # With conversion to channels_last transpose_outputs=True, ) odir = str(test_root_path / f'hls4mlprj_pytorch_upsampling_2d_{backend}_{io_type}') From 
91f8f83bea62e5a048945dc82cd333be18254de7 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 16 Jul 2024 20:07:43 +0200 Subject: [PATCH 090/272] Always write tar when using hls4ml script (for jenkins tests) --- scripts/hls4ml | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/hls4ml b/scripts/hls4ml index 8180393bb4..fd6fcb2427 100755 --- a/scripts/hls4ml +++ b/scripts/hls4ml @@ -91,6 +91,7 @@ def _config(args, extra_args): part=args.fpga, board=args.board, clock_period=args.clock, + write_tar=True, ) if args.model.endswith('.h5'): From a4b9b6b25e111bf8fc7a7f954ba88645cfadbdba Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 16 Jul 2024 20:09:46 +0200 Subject: [PATCH 091/272] Check if tar.gz exists before deleting in VivadoAcc backend --- hls4ml/writer/vivado_accelerator_writer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hls4ml/writer/vivado_accelerator_writer.py b/hls4ml/writer/vivado_accelerator_writer.py index 3b4e5fdf13..cefa158e11 100644 --- a/hls4ml/writer/vivado_accelerator_writer.py +++ b/hls4ml/writer/vivado_accelerator_writer.py @@ -408,7 +408,9 @@ def write_driver(self, model): ) def write_new_tar(self, model): - os.remove(model.config.get_output_dir() + '.tar.gz') + tarfile = model.config.get_output_dir() + '.tar.gz' + if os.path.exists(tarfile): + os.remove(tarfile) super().write_tar(model) def write_hls(self, model): From fcd6f7a85f6f1e24960ecd8683bcbe1f1624d4ff Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 17 Jul 2024 08:42:27 -0700 Subject: [PATCH 092/272] write tarball --- test/keras-to-hls.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/keras-to-hls.sh b/test/keras-to-hls.sh index f8c25e756d..f8f63443dc 100755 --- a/test/keras-to-hls.sh +++ b/test/keras-to-hls.sh @@ -144,6 +144,11 @@ do echo " Input: ${precision}" >> ${file} echo " Output: ${precision}" >> ${file} fi + # Write tarball + echo "WriterConfig:" >> ${file} + echo " Namespace: None" >> ${file} + echo " WriteWeightsTxt: True" >> ${file} + echo " WriteTar: True" >> ${file} ${pycmd} ../scripts/hls4ml convert -c ${file} || exit 1 rm ${file} From 2909d154dd49f81f0ada629f2e9bd45786a24ebf Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 18 Jul 2024 14:14:57 -0500 Subject: [PATCH 093/272] Following what seems to be done in the main branch --- hls4ml/model/optimizer/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index eb53ed7925..282561e11e 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -60,6 +60,7 @@ 'fuse_consecutive_batch_normalization', 'merge_linear_activation', 'fuse_batch_normalization', + 'eliminate_linear_activation', # The ones above here need to be before infer_precision_types 'infer_precision_types', 'channels_last_converter', From c9693da8106ee9bf34ce52c9003684d825a7d0e7 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 19 Jul 2024 10:18:41 -0500 Subject: [PATCH 094/272] update infer_precision based on changes in keras-config-auto --- .../model/optimizer/passes/infer_precision.py | 298 +++++++++++++----- 1 file changed, 211 insertions(+), 87 deletions(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index 51422c534e..5c1801156f 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -1,9 +1,10 @@ import math +from typing import Iterable import numpy as np 
from hls4ml.model.optimizer import ConfigurableOptimizerPass -from hls4ml.model.types import FixedPrecisionType, UnspecifiedPrecisionType +from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, PrecisionType, UnspecifiedPrecisionType # TODO: The code assumes everything is Fixed or Integer precision. Need to add checks @@ -67,6 +68,12 @@ def _infer_precision(self, node, types_to_infer): if node_class in ['Dot']: return self._infer_dot_precision(node, types_to_infer) + if node_class in ['Embedding']: + return self._infer_embedding_precision(node, types_to_infer) + + if node_class in ['SimpleRNN', 'LSTM', 'GRU']: + return self._infer_rnn_precision(node, types_to_infer) + # What about quantized activation layer? Setting it to 'auto' manually will break it here. We should prevent # this in config_from_* functions @@ -76,6 +83,20 @@ def _get_default_precision(self, node): model_config = node.model.config return model_config.backend.convert_precision_string(model_config.model_precision['default']) + def _get_maximum_precision(self, node): + model_config = node.model.config + if 'maximum' in model_config.model_precision: + return model_config.backend.convert_precision_string(model_config.model_precision['maximum']) + else: + return None + + def _all_supported_types(self, types: Iterable[PrecisionType]): + """Are all the types supported for inference--currently Integer or Fixed""" + for tp in types: + if not isinstance(tp, (IntegerPrecisionType, FixedPrecisionType)): + return False + return True + def _infer_default_type(self, node, type_name): model_config = node.model.config default_precision = model_config.backend.convert_precision_string(model_config.model_precision['default']) @@ -96,9 +117,6 @@ def _infer_common_precision(self, node, types_to_infer, n_ops): inferred_types = [] input_precision = node.get_input_variable().type.precision - input_width = input_precision.width - input_integers = input_precision.integer - input_signed = input_precision.signed if 'weight_t' in types_to_infer: weight_quantizer = node.get_attr('weight_quantizer', None) @@ -110,10 +128,6 @@ def _infer_common_precision(self, node, types_to_infer, n_ops): node.weights['weight'].update_precision(node.types['weight_t'].precision) inferred_types.append('weight_t') - weight_width = node.types['weight_t'].precision.width - weight_integers = node.types['weight_t'].precision.integer - weight_signed = node.types['weight_t'].precision.signed - if 'bias_t' in types_to_infer: bias_quantizer = node.get_attr('bias_quantizer', None) if bias_quantizer is not None: @@ -124,25 +138,42 @@ def _infer_common_precision(self, node, types_to_infer, n_ops): node.weights['bias'].update_precision(node.types['bias_t'].precision) inferred_types.append('bias_t') - bias_width = node.types['bias_t'].precision.width - bias_integers = node.types['bias_t'].precision.integer - bias_signed = node.types['bias_t'].precision.signed - no_bias = node.weights['bias'].nonzeros == 0 and self.infer_no_bias # no bias + if self._all_supported_types((input_precision, node.types['weight_t'].precision, node.types['bias_t'].precision)): + input_width = input_precision.width + input_integers = input_precision.integer + input_signed = input_precision.signed - # using math.ceil instead of np.ceil because it returns an int - bitwidth = weight_width + input_width + math.ceil(np.log2(n_ops)) - integers = weight_integers + input_integers + math.ceil(np.log2(n_ops)) - signed = weight_signed or input_signed + weight_width = node.types['weight_t'].precision.width 
+ weight_integers = node.types['weight_t'].precision.integer + weight_signed = node.types['weight_t'].precision.signed - frac = bitwidth - integers + bias_width = node.types['bias_t'].precision.width + bias_integers = node.types['bias_t'].precision.integer + bias_signed = node.types['bias_t'].precision.signed + no_bias = node.weights['bias'].nonzeros == 0 and self.infer_no_bias # no bias + + # using math.ceil instead of np.ceil because it returns an int + bitwidth = weight_width + input_width + math.ceil(np.log2(n_ops)) + integers = weight_integers + input_integers + math.ceil(np.log2(n_ops)) + signed = weight_signed or input_signed + + frac = bitwidth - integers - if not no_bias: - integers = max(integers + (bias_signed and not signed), bias_integers + (signed and not bias_signed)) + 1 - bitwidth = integers + max(frac, bias_width - bias_integers) - signed = signed or bias_signed + if not no_bias: + integers = max(integers + (bias_signed and not signed), bias_integers + (signed and not bias_signed)) + 1 + bitwidth = integers + max(frac, bias_width - bias_integers) + signed = signed or bias_signed - # Note: this is guaranteed to not overflow or need rounding, so it's sufficient to use the simpler form. - new_type = FixedPrecisionType(bitwidth, integers, signed) + # if max_precision is specified, limit the size to be less than max precision + max_precision = self._get_maximum_precision(node) + if max_precision is not None: + bitwidth = min(bitwidth, max_precision.width) + integers = min(integers, max_precision.integer) + + # Note: this is guaranteed to not overflow or need rounding, so it's sufficient to use the simpler form. + new_type = FixedPrecisionType(bitwidth, integers, signed) + else: + new_type = self._get_default_precision(node) if 'accum_t' in types_to_infer: node.types['accum_t'].name = node.name + '_accum_t' @@ -166,6 +197,7 @@ def _infer_conv_precision(self, node, types_to_infer): n_ops = node.get_attr('n_chan') * node.get_attr('filt_height', 1) * node.get_attr('filt_width') return self._infer_common_precision(node, types_to_infer, n_ops) + # This function is ignored because we will split sepconv in the future def _infer_sepconv_precision(self, node, types_to_infer): inferred_types = [] @@ -265,24 +297,35 @@ def _infer_bn_precision(self, node, types_to_infer): scale_precision = node.types['scale_t'].precision bias_precision = node.types['bias_t'].precision - after_scale_signed = scale_precision.signed or input_precision.signed - after_scale_width = input_precision.width + scale_precision.width - after_scale_integer = input_precision.integer + scale_precision.integer + if self._all_supported_types((input_precision, scale_precision, bias_precision)): + + after_scale_signed = scale_precision.signed or input_precision.signed + after_scale_width = input_precision.width + scale_precision.width + after_scale_integer = input_precision.integer + scale_precision.integer - out_precision_signed = after_scale_signed or bias_precision.signed - out_precision_integer = ( - max( - after_scale_integer + (bias_precision.signed and not after_scale_signed), - bias_precision.integer + (after_scale_signed and not bias_precision.signed), + out_precision_signed = after_scale_signed or bias_precision.signed + out_precision_integer = ( + max( + after_scale_integer + (bias_precision.signed and not after_scale_signed), + bias_precision.integer + (after_scale_signed and not bias_precision.signed), ) + + 1 + ) + out_precision_width = out_precision_integer + max( + after_scale_width - after_scale_integer,
bias_precision.fractional ) - + 1 - ) - out_precision_width = out_precision_integer + max( - after_scale_width - after_scale_integer, bias_precision.fractional - ) - # Note: this is guaranteed to not overflow or need rounding, so it's sufficient to use the simpler form. - out_precision = FixedPrecisionType(out_precision_width, out_precision_integer, out_precision_signed) + # if max_precision is specified, limit the size to be less than max precision + max_precision = self._get_maximum_precision(node) + if max_precision is not None: + out_precision_width = min(out_precision_width, max_precision.width) + out_precision_integer = min(out_precision_integer, max_precision.integer) + + # Note: this is guaranteed to not overflow or need rounding, so it's sufficient to use the simpler form. + out_precision = FixedPrecisionType(out_precision_width, out_precision_integer, out_precision_signed) + + else: + out_precision = self._get_default_precision(node) node.types['result_t'].name = node.name + '_result_t' node.types['result_t'].precision = out_precision @@ -298,20 +341,29 @@ def _infer_pooling_precision(self, node, types_to_infer): input_precision = node.get_input_variable().type.precision pool_op = node.attributes['pool_op'].lower() - width = input_precision.width - integer = input_precision.integer - signed = input_precision.signed + if pool_op == 'max': + # This has the benefit of working for xnor types. I don't think "copy" is needed + accum_type = input_precision + + elif pool_op == 'average': + if self._all_supported_types((input_precision,)): + width = input_precision.width + integer = input_precision.integer + signed = input_precision.signed + + pool_size = node.get_attr('pool_height', 1) * node.get_attr('pool_width') + extra_bits = int(np.ceil(np.log2(pool_size))) + + # for now ignore max precision in this case + accum_type = FixedPrecisionType( + width=width + extra_bits * 2, integer=integer + extra_bits, signed=signed + ) + else: + accum_type = self._get_default_precision(node) - pool_size = node.get_attr('pool_height', 1) * node.get_attr('pool_width') - if pool_op == 'average': - extra_bits = int(np.ceil(np.log2(pool_size))) - elif pool_op == 'max': - extra_bits = 0 else: raise ValueError(f'Unknown pooling operation: {pool_op}') - accum_type = FixedPrecisionType(width=width + extra_bits * 2, integer=integer + extra_bits, signed=signed) - node.types['accum_t'].name = node.name + '_accum_t' node.types['accum_t'].precision = accum_type @@ -331,22 +383,76 @@ def _infer_merge_precision(self, node, types_to_infer): op = node.get_attr('op').lower() if op in ('add', 'subtract', 'average'): - new_signed = input_1.signed or input_2.signed or op == 'subtract' - new_int = ( - max( - input_1.integer + (input_2.signed and not input_1.signed), - input_2.integer + (input_1.signed and not input_2.signed), + if self._all_supported_types((input_1, input_2)): + new_signed = input_1.signed or input_2.signed or op == 'subtract' + new_int = ( + max( + input_1.integer + (input_2.signed and not input_1.signed), + input_2.integer + (input_1.signed and not input_2.signed), ) - + 1 ) + + 1 + ) - new_width = new_int + max(input_1.fractional, input_2.fractional) - out_precision = FixedPrecisionType(new_width, new_int, new_signed) + new_width = new_int + max(input_1.fractional, input_2.fractional) + max_precision = self._get_maximum_precision(node) + if max_precision is not None: + new_width = min(new_width, max_precision.width) + new_int = min(new_int, max_precision.integer) + out_precision =
FixedPrecisionType(new_width, new_int, new_signed) + else: + out_precision = self._get_default_precision(node) elif op == 'multiply': - new_signed = input_1.signed or input_2.signed - new_int = input_1.integer + input_2.integer - new_width = input_1.width + input_2.width - out_precision = FixedPrecisionType(new_width, new_int, new_signed) + if self._all_supported_types((input_1, input_2)): + new_signed = input_1.signed or input_2.signed + new_int = input_1.integer + input_2.integer + new_width = input_1.width + input_2.width + # if max_precision is specified, limit the size to be less than max precision + max_precision = self._get_maximum_precision(node) + if max_precision is not None: + new_width = min(new_width, max_precision.width) + new_int = min(new_int, max_precision.integer) + out_precision = FixedPrecisionType(new_width, new_int, new_signed) + else: + out_precision = self._get_default_precision(node) elif op in ('maximum', 'minimum'): + if input_1 == input_2: + # can handle binary and potentially others + out_precision = input_1 # I assume copy is not necessary + elif self._all_supported_types((input_1, input_2)): + new_signed = input_1.signed or input_2.signed + + input_1_integer = input_1.integer + input_2_integer = input_2.integer + + # add one to integer if unsigned while new is signed + if new_signed and not input_1.signed: + input_1_integer += 1 + if new_signed and not input_2.signed: + input_2_integer += 1 + + new_width = max(input_1.fractional, input_2.fractional) + max(input_1_integer, input_2_integer) + new_int = max(input_1_integer, input_2_integer) + out_precision = FixedPrecisionType(new_width, new_int, new_signed) + else: + out_precision = self._get_default_precision(node) + else: + print(f'Warning: not propagating weights for type {op}') + out_precision = self._get_default_precision(node) + + node.types['result_t'].name = node.name + '_result_t' + node.types['result_t'].precision = out_precision + + return ['result_t'] + + def _infer_cat_precision(self, node, types_to_infer): + assert 'result_t' in types_to_infer and len(types_to_infer) == 1 + + input_1 = node.get_input_variable(node.inputs[0]).type.precision + input_2 = node.get_input_variable(node.inputs[1]).type.precision + + if input_1 == input_2: + # can handle binary and potentially others + out_precision = input_1 # I assume copy is not necessary + elif self._all_supported_types((input_1, input_2)): new_signed = input_1.signed or input_2.signed input_1_integer = input_1.integer @@ -360,9 +466,15 @@ def _infer_merge_precision(self, node, types_to_infer): new_width = max(input_1.fractional, input_2.fractional) + max(input_1_integer, input_2_integer) new_int = max(input_1_integer, input_2_integer) + + # if max_precision is specified, limit the size to be less than max precision + max_precision = self._get_maximum_precision(node) + if max_precision is not None: + new_width = min(new_width, max_precision.width) + new_int = min(new_int, max_precision.integer) + out_precision = FixedPrecisionType(new_width, new_int, new_signed) else: - print(f'Warning: not propagating weights for type {op}') out_precision = self._get_default_precision(node) node.types['result_t'].name = node.name + '_result_t' @@ -370,46 +482,58 @@ def _infer_merge_precision(self, node, types_to_infer): return ['result_t'] - def _infer_cat_precision(self, node, types_to_infer): + def _infer_dot_precision(self, node, types_to_infer): assert 'result_t' in types_to_infer and len(types_to_infer) == 1 input_1 =
node.get_input_variable(node.inputs[0]).type.precision input_2 = node.get_input_variable(node.inputs[1]).type.precision - new_signed = input_1.signed or input_2.signed + if self._all_supported_types((input_1, input_2)): + n_in = node.get_input_variable(node.inputs[0]).shape[0] - input_1_integer = input_1.integer - input_2_integer = input_2.integer - - # add one to integer if unsigned while new is signed - if new_signed and not input_1.signed: - input_1_integer += 1 - if new_signed and not input_2.signed: - input_2_integer += 1 + new_signed = input_1.signed or input_2.signed + new_width = input_1.width + input_2.width + math.ceil(np.log2(n_in)) + new_int = input_1.integer + input_2.integer + math.ceil(np.log2(n_in)) - new_width = max(input_1.fractional, input_2.fractional) + max(input_1_integer, input_2_integer) - new_int = max(input_1_integer, input_2_integer) + # if max_precision is specified, limit the size to be less than max precision + max_precision = self._get_maximum_precision(node) + if max_precision is not None: + new_width = min(new_width, max_precision.width) + new_int = min(new_int, max_precision.integer) - out_precision = FixedPrecisionType(new_width, new_int, new_signed) + out_precision = FixedPrecisionType(new_width, new_int, new_signed) + else: + out_precision = self._get_default_precision(node) node.types['result_t'].name = node.name + '_result_t' node.types['result_t'].precision = out_precision return ['result_t'] - def _infer_dot_precision(self, node, types_to_infer): - assert 'result_t' in types_to_infer and len(types_to_infer) == 1 + def _infer_embedding_precision(self, node, types_to_infer): + inferred_types = [] - input_1 = node.get_input_variable(node.inputs[0]).type.precision - input_2 = node.get_input_variable(node.inputs[1]).type.precision + if 'embeddings_t' in types_to_infer: + self._infer_default_type(node, 'embeddings_t') + node.weights['embeddings'].update_precision(node.types['embeddings_t'].precision) + inferred_types.append('embeddings_t') + + if 'result_t' in types_to_infer: + out_precision = self._get_default_precision(node) + node.types['result_t'].name = node.name + '_result_t' + node.types['result_t'].precision = out_precision + inferred_types.append('result_t') - n_in = node.get_input_variable(node.inputs[0]).shape[0] + return inferred_types - new_signed = input_1.signed or input_2.signed - new_width = input_1.width + input_2.width + math.ceil(np.log2(n_in)) - new_int = input_1.integer + input_2.integer + math.ceil(np.log2(n_in)) + # TODO: This is just a placeholder + def _infer_rnn_precision(self, node, types_to_infer): + inferred_types = [] - out_precision = FixedPrecisionType(new_width, new_int, new_signed) - node.types['result_t'].name = node.name + '_result_t' - node.types['result_t'].precision = out_precision + # for now just do the weights and leave the rest for the default catch + for weightvar in ('weight', 'bias', 'recurrent_weight', 'recurrent_bias'): + if f'{weightvar}_t' in types_to_infer: + self._infer_default_type(node, f'{weightvar}_t') + node.weights[weightvar].update_precision(node.types[f'{weightvar}_t'].precision) + inferred_types.append(f'{weightvar}_t') - return ['result_t'] + return inferred_types From aaaa2fcfe01a8aed2efb09707f2eb423366dac1e Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 19 Jul 2024 10:20:26 -0500 Subject: [PATCH 095/272] loosen batchnorm merging restrictions, fix ternary handling --- hls4ml/model/optimizer/__init__.py | 4 +-- .../model/optimizer/passes/batchnorm_opt.py | 32 +++++++++----------
hls4ml/model/optimizer/passes/qkeras.py | 10 +++++- test/pytest/test_qkeras.py | 4 ++- 4 files changed, 29 insertions(+), 21 deletions(-) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 282561e11e..c6270d8f28 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -61,6 +61,8 @@ 'merge_linear_activation', 'fuse_batch_normalization', 'eliminate_linear_activation', + 'qkeras_factorize_alpha', + 'extract_ternary_threshold', # The ones above here need to be before infer_precision_types 'infer_precision_types', 'channels_last_converter', @@ -70,8 +72,6 @@ 'fuse_bias_add', 'expand_layer_group', 'output_rounding_saturation_mode', - 'qkeras_factorize_alpha', - 'extract_ternary_threshold', ], requires=['parse_qonnx'], ) # TODO Maybe not all QKeras optmizers belong here? diff --git a/hls4ml/model/optimizer/passes/batchnorm_opt.py b/hls4ml/model/optimizer/passes/batchnorm_opt.py index 26292d7e2a..94a9a32d70 100644 --- a/hls4ml/model/optimizer/passes/batchnorm_opt.py +++ b/hls4ml/model/optimizer/passes/batchnorm_opt.py @@ -170,12 +170,12 @@ def match(self, node): s1 = node.weights['scale'].data_unquantized b1 = node.weights['bias'].data_unquantized scale_compatible = ( - (prev_node.get_attr('scale_quantizer') is None and node.get_attr('scale_quantizer') is None) + (prev_node.get_attr('scale_quantizer') is None or node.get_attr('scale_quantizer') is None) or (s0 == np.ones_like(s0)).all() or (s1 == np.ones_like(s1)).all() ) bias_compatible = ( - (prev_node.get_attr('bias_quantizer') is None and node.get_attr('bias_quantizer') is None) + (prev_node.get_attr('bias_quantizer') is None or node.get_attr('bias_quantizer') is None) or (b0 == np.zeros_like(b0)).all() or (b1 == np.zeros_like(b1)).all() ) @@ -195,26 +195,24 @@ def transform(self, model, node): # if len(node_map[node.outputs[0]]) > 1: # return False - # only merge if the types are integer or fixed - if ( - not isinstance(prev_node.weights['scale'].type.precision, (IntegerPrecisionType, FixedPrecisionType)) - or not isinstance(prev_node.weights['bias'].type.precision, (IntegerPrecisionType, FixedPrecisionType)) - or not isinstance(node.weights['scale'].type.precision, (IntegerPrecisionType, FixedPrecisionType)) - or not isinstance(node.weights['bias'].type.precision, (IntegerPrecisionType, FixedPrecisionType)) - ): - return False - s0 = prev_node.weights['scale'].data_unquantized b0 = prev_node.weights['bias'].data_unquantized s1 = node.weights['scale'].data_unquantized b1 = node.weights['bias'].data_unquantized - s_quantizer = ( - node.get_attr('scale_quantizer') if (s0 == np.ones_like(s0)).all() else prev_node.get_attr('scale_quantizer') - ) - b_quantizer = ( - node.get_attr('bias_quantizer') if (b0 == np.zeros_like(b0)).all() else prev_node.get_attr('bias_quantizer') - ) + if (s0 == np.ones_like(s0)).all(): + s_quantizer = node.get_attr('scale_quantizer') + elif (s1 == np.ones_like(s1)).all(): + s_quantizer = prev_node.get_attr('scale_quantizer') + else: + s_quantizer = None + + if (b0 == np.zeros_like(b0)).all(): + b_quantizer = node.get_attr('bias_quantizer') + elif (b1 == np.zeros_like(b1)).all(): + b_quantizer = prev_node.get_attr('bias_quantizer') + else: + b_quantizer = None node.set_attr('scale_quantizer', s_quantizer) node.set_attr('bias_quantizer', b_quantizer) diff --git a/hls4ml/model/optimizer/passes/qkeras.py b/hls4ml/model/optimizer/passes/qkeras.py index a97438832d..03690bed0d 100644 --- a/hls4ml/model/optimizer/passes/qkeras.py +++
b/hls4ml/model/optimizer/passes/qkeras.py @@ -163,8 +163,16 @@ def transform(self, model, node): else: n_in = node.get_attr('n_out') + # the name of the new ApplyAlpha node + alpha_name = node.get_attr('name') + '_alpha' + + # make the precision auto + alpha_precision = {'Precision': 'auto'} + model.config.set_name_config(alpha_name, alpha_precision) + model.config.parse_name_config(alpha_name, alpha_precision) + attrs = { - 'name': node.get_attr('name') + '_alpha', + 'name': alpha_name, 'class_name': 'Alpha', 'inputs': node.outputs, 'n_in': n_in, diff --git a/test/pytest/test_qkeras.py b/test/pytest/test_qkeras.py index 45d015807b..5f62475d1a 100644 --- a/test/pytest/test_qkeras.py +++ b/test/pytest/test_qkeras.py @@ -356,8 +356,10 @@ def test_relu_negative_slope(randX_1000_1, quantizer, backend, io_type): ], ) def test_qactivation_kwarg(randX_100_10, activation_quantizer, weight_quantizer): - if activation_quantizer in ['binary', 'ternary']: + if activation_quantizer in ['binary']: name = 'bnbt_qdense_alpha' + elif activation_quantizer in ['ternary']: + name = 'bnbt_qdense_ternary_scale' else: name = f'qdense_{eval(activation_quantizer).__class__.__name__}' From a2b88f4a1a9f6c4ddb06bdf50c5e5e8d21dd0eb4 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 19 Jul 2024 17:24:52 -0500 Subject: [PATCH 096/272] remove some backends from slow qonnx test --- test/pytest/test_qonnx.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/pytest/test_qonnx.py b/test/pytest/test_qonnx.py index b955608b88..5b7b9d95c9 100644 --- a/test/pytest/test_qonnx.py +++ b/test/pytest/test_qonnx.py @@ -105,11 +105,11 @@ def test_tfc_2w2a(tfc_2w2a_model, backend): np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vitis']) def test_cnv_2w2a(cnv_2w2a_model, backend): """ - This tests a convolution model. Note: the batch normalizations weights not quantized, so it - is difficult to make this match perfectly. It is also a slow test. + This tests a convolution model. Note: the batch normalization weights are not quantized, so it is + difficult to make this match perfectly. It is also a slow test, which is why only Vitis is tested.
""" model = cnv_2w2a_model From c738b7f6eb6a50841467bf603b781de183fbe7c5 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Tue, 2 Jan 2024 19:08:04 -0800 Subject: [PATCH 097/272] :tada: add support for HGQ-proxy-model :tada: add support for HGQ-proxy-model more consistent type naming revert example model version --- .../backends/fpga/passes/hgq_proxy_model.py | 89 ++++++ hls4ml/backends/quartus/quartus_backend.py | 26 +- hls4ml/backends/vivado/vivado_backend.py | 28 +- hls4ml/converters/keras/hgq_proxy_model.py | 21 ++ hls4ml/model/optimizer/__init__.py | 1 + .../model/optimizer/passes/hgq_proxy_model.py | 128 +++++++++ hls4ml/utils/fixed_point_quantizer.py | 253 ++++++++++++++++++ test/pytest/test_hgq_proxy_model.py | 164 ++++++++++++ 8 files changed, 706 insertions(+), 4 deletions(-) create mode 100644 hls4ml/backends/fpga/passes/hgq_proxy_model.py create mode 100644 hls4ml/converters/keras/hgq_proxy_model.py create mode 100644 hls4ml/model/optimizer/passes/hgq_proxy_model.py create mode 100644 hls4ml/utils/fixed_point_quantizer.py create mode 100644 test/pytest/test_hgq_proxy_model.py diff --git a/hls4ml/backends/fpga/passes/hgq_proxy_model.py b/hls4ml/backends/fpga/passes/hgq_proxy_model.py new file mode 100644 index 0000000000..af172e62f9 --- /dev/null +++ b/hls4ml/backends/fpga/passes/hgq_proxy_model.py @@ -0,0 +1,89 @@ +import numpy as np + +from hls4ml.backends import Backend +from hls4ml.backends.template import FunctionCallTemplate +from hls4ml.model.layers import Layer +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.optimizer.passes.hgq_proxy_model import FixedPointQuantizer +from hls4ml.model.types import Source + + +def to_apfixed(k, b, i, RND, SAT): + u = 'u' if k == 0 else '' + return f'ap_{u}fixed<{b},{i},AP_{RND},AP_{SAT}>' + + +def to_acfixed(k, b, i, RND, SAT): + k = 'false' if k == 0 else 'true' + return f'ac_fixed<{b},{i},{k},AC_{RND},AC_{SAT}>' + + +def generate_mask_fn( + name: str, shape: tuple[int, ...], k: np.ndarray, b: np.ndarray, i: np.ndarray, RND: str, SAT: str, backend: str +) -> str: + """Generate heterogenous quantization mask function, ONLY works for IOType=io_parallel""" + assert k.shape[0] == b.shape[0] == i.shape[0] == 1 + assert backend.lower() in ('quartus', 'vivado', 'vitis'), f'Backend {backend} not tested' + Ks, Bs, Is = k[0], b[0], i[0] + Ks, Bs, Is = np.broadcast_to(Ks, shape), np.broadcast_to(Bs, shape), np.broadcast_to(Is, shape) + Ks, Bs, Is = Ks.ravel(), Bs.ravel(), Is.ravel() + masks = [] + to_fixed = to_acfixed if backend.lower() == 'quartus' else to_apfixed + for idx, (k, b, i) in enumerate(zip(Ks, Bs, Is)): + if b == 0: + fn = f'out[{idx}] = 0;' + else: + fn = f'out[{idx}] = {to_fixed(k,b,i,RND,SAT)}(inp[{idx}]);' + masks.append(f' {fn}') + body = "\n".join(masks) + mask_fn = f''' +template +void {name}(input_t *inp, output_t *out) {{ + #pragma HLS INLINE + #pragma HLS PIPELINE + +{body} +}} +''' + return mask_fn + + +class ProcessFixedPointQuantizerLayer(OptimizerPass): + def match(self, node: Layer): + return isinstance(node, FixedPointQuantizer) + + def transform(self, model, node: FixedPointQuantizer): + if node.fusible: + model.remove_node(node, rewire=True) + return True + + if model.config.config['IOType'] != 'io_parallel': + raise NotImplementedError('Heterogenous quantization for activations is only supported with IOType=io_parallel') + + backend = model.config.config['Backend'] + + name = node.name + + assert node.mask_kbi is not None + k, b, i = node.mask_kbi + RND = node.RND + SAT = node.SAT + mask_fn: str = 
generate_mask_fn(name, node.get_input_variable().shape, k, b, i, RND, SAT, backend) + + node.set_attr('mask_fn_codegen', Source(mask_fn)) + + +class ProcessFixedPointQuantizerCall(FunctionCallTemplate): + def __init__(self): + super().__init__(FixedPointQuantizer, include_header=[]) + self.template = 'nnet::{name}<{input_t}, {output_t}>({input}, {output});' + + def format(self, node): + params = self._default_function_params(node) + + return self.template.format(**params) + + +def register_hgq_proxy_model(backend: Backend): + backend.register_pass('process_fixed_point_quantizer_layer', ProcessFixedPointQuantizerLayer) + backend.register_template(ProcessFixedPointQuantizerCall) diff --git a/hls4ml/backends/quartus/quartus_backend.py b/hls4ml/backends/quartus/quartus_backend.py index f08b244dd8..b6080a8c95 100644 --- a/hls4ml/backends/quartus/quartus_backend.py +++ b/hls4ml/backends/quartus/quartus_backend.py @@ -1,5 +1,6 @@ import os from contextlib import contextmanager +from warnings import warn import numpy as np @@ -73,6 +74,7 @@ def _register_flows(self): 'quartus:inplace_stream_flatten', 'quartus:skip_softmax', 'quartus:fix_softmax_table_size', + 'quartus:process_fixed_point_quantizer_layer', 'infer_precision_types', ] optimization_flow = register_flow('optimize', optimization_passes, requires=[init_flow], backend=self.name) @@ -265,7 +267,17 @@ def init_conv1d(self, layer): n_in, n_out = self.get_layer_mult_size(layer) self.set_target_reuse_factor(layer) self.set_closest_reuse_factor(layer, n_in, n_out) - layer.set_attr('parallelization', layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)) + + # Not overriding user parallelization factor, if already set and user has not specified a value + user_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', None) + layer_pf = layer.get_attr('parallelization_factor', None) + chosen_pf = user_pf or layer_pf or 1 + if user_pf is not None and layer_pf is not None: + if user_pf != layer_pf: + warn( + f'For layer {layer.name}, parallelization factor of {layer_pf} is defined in the proxy-model, but is overridden by the user to {user_pf}.' # noqa: E501 + ) + layer.set_attr('parallelization', chosen_pf) # impl_filt_width determines the filter size post-Winograd transformation layer.set_attr('impl_filt_width', layer.get_attr('filt_width')) @@ -295,7 +307,17 @@ def init_conv2d(self, layer): n_in, n_out = self.get_layer_mult_size(layer) self.set_target_reuse_factor(layer) self.set_closest_reuse_factor(layer, n_in, n_out) - layer.set_attr('parallelization', layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)) + + # Not overriding user parallelization factor, if already set and user has not specified a value + user_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', None) + layer_pf = layer.get_attr('parallelization_factor', None) + chosen_pf = user_pf or layer_pf or 1 + if user_pf is not None and layer_pf is not None: + if user_pf != layer_pf: + warn( + f'For layer {layer.name}, parallelization factor of {layer_pf} is defined in the proxy-model, but is overridden by the user to {user_pf}.' 
# noqa: E501 + ) + layer.set_attr('parallelization', chosen_pf) # impl_filt_width & impl_filt_height determine the filter size post-Winograd transformation layer.set_attr('impl_filt_height', layer.get_attr('filt_height')) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 44b284b33a..8f0d97d76b 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -1,5 +1,6 @@ import os import sys +from warnings import warn import numpy as np @@ -107,6 +108,7 @@ def _register_flows(self): 'vivado:inplace_stream_flatten', 'vivado:skip_softmax', 'vivado:fix_softmax_table_size', + 'vivado:process_fixed_point_quantizer_layer', 'infer_precision_types', ] optimization_flow = register_flow('optimize', optimization_passes, requires=[init_flow], backend=self.name) @@ -266,7 +268,17 @@ def init_conv1d(self, layer): layer.set_attr('strategy', 'latency') out_width = layer.get_output_variable().shape[0] - chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1) + + # Not overriding user parallelization factor, if already set and user has not specified a value + user_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', None) + layer_pf = layer.get_attr('parallelization_factor', None) + chosen_pf = user_pf or layer_pf or 1 + if user_pf is not None and layer_pf is not None: + if user_pf != layer_pf: + warn( + f'For layer {layer.name}, parallelization factor of {layer_pf} is defined in the proxy-model, but is overridden by the user to {user_pf}.' # noqa: E501 + ) + valid_pf = self.get_valid_conv_partition_splits(1, out_width) if chosen_pf not in valid_pf: closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf) @@ -278,6 +290,7 @@ def init_conv1d(self, layer): else: closest_pf = chosen_pf layer.set_attr('n_partitions', out_width // closest_pf) + layer.set_attr('parallelization_factor', closest_pf) layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) @@ -332,7 +345,17 @@ def init_conv2d(self, layer): out_height = layer.get_output_variable().shape[0] out_width = layer.get_output_variable().shape[1] - chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1) + + # Not overriding user parallelization factor, if already set and user has not specified a value + user_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', None) + layer_pf = layer.get_attr('parallelization_factor', None) + chosen_pf = user_pf or layer_pf or 1 + if user_pf is not None and layer_pf is not None: + if user_pf != layer_pf: + warn( + f'For layer {layer.name}, parallelization factor of {layer_pf} is defined in the proxy-model, but is overridden by the user to {user_pf}.' 
# noqa: E501 + ) + valid_pf = self.get_valid_conv_partition_splits(out_height, out_width) if chosen_pf not in valid_pf: closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf) @@ -344,6 +367,7 @@ def init_conv2d(self, layer): else: closest_pf = chosen_pf layer.set_attr('n_partitions', out_height * out_width // closest_pf) + layer.set_attr('parallelization_factor', closest_pf) layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) diff --git a/hls4ml/converters/keras/hgq_proxy_model.py b/hls4ml/converters/keras/hgq_proxy_model.py new file mode 100644 index 0000000000..235dd50269 --- /dev/null +++ b/hls4ml/converters/keras/hgq_proxy_model.py @@ -0,0 +1,21 @@ +from hls4ml.converters.keras_to_hls import keras_handler, parse_default_keras_layer + + +@keras_handler('FixedPointQuantizer') +def fixedpoint_quantizer_handler(keras_layer, input_names, input_shapes, data_reader): + config = parse_default_keras_layer(keras_layer, input_names) + + name = config['name'] + fusible = keras_layer['config']['fusible'] + config['RND'] = keras_layer['config']['RND'] + config['SAT'] = keras_layer['config']['SAT'] + config['fusible'] = fusible + if not fusible: + k = data_reader.get_weights_data(name, 'keep_negative') + b = data_reader.get_weights_data(name, 'bits') + i = data_reader.get_weights_data(name, 'integers') + config['mask_kbi'] = k, b, i + config['overrides'] = keras_layer['config']['overrides'] + + layer = config + return layer, input_shapes[0] diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 3aa247d03f..8519efd77e 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -44,6 +44,7 @@ 'qkeras_factorize_alpha', 'extract_ternary_threshold', 'fuse_consecutive_batch_normalization', + 'enforce_proxy_model_embedded_config', ], ) # TODO Maybe not all QKeras optmizers belong here? 
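As a side illustration of the codegen pass introduced above (a hedged sketch, not part of the patch: the layer name and bitwidths below are invented), generate_mask_fn emits one quantization statement per tensor element, pruning zero-bit elements to a constant:

import numpy as np

from hls4ml.backends.fpga.passes.hgq_proxy_model import generate_mask_fn

# Hypothetical per-element bitwidths for a 2-element tensor; generate_mask_fn
# expects a leading broadcast axis of size 1 on k, b and i.
k = np.array([[1, 0]])  # keep_negative (sign) flags
b = np.array([[8, 0]])  # total bits; b == 0 prunes the element to a constant 0
i = np.array([[3, 0]])  # integer bits

code = generate_mask_fn('quantizer_1', (2,), k, b, i, 'RND', 'SAT', 'vivado')
# For the Vivado backend, 'code' is an HLS function along these lines:
#   template <typename input_t, typename output_t>
#   void quantizer_1(input_t *inp, output_t *out) {
#       #pragma HLS INLINE
#       #pragma HLS PIPELINE
#
#       out[0] = ap_fixed<8,3,AP_RND,AP_SAT>(inp[0]);
#       out[1] = 0;
#   }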
diff --git a/hls4ml/model/optimizer/passes/hgq_proxy_model.py b/hls4ml/model/optimizer/passes/hgq_proxy_model.py new file mode 100644 index 0000000000..da5f37ed64 --- /dev/null +++ b/hls4ml/model/optimizer/passes/hgq_proxy_model.py @@ -0,0 +1,128 @@ +import re +from warnings import warn + +from hls4ml.backends.fpga.fpga_types import NamedType +from hls4ml.model.layers import Layer, register_layer +from hls4ml.model.optimizer import OptimizerPass, register_pass +from hls4ml.model.types import FixedPrecisionType, WeightVariable + +re_purge_prefix = re.compile(r'(?<!\w)(?:ap_|ac_)', re.IGNORECASE) +re_parse_fixed = re.compile(r'\s*(u?)fixed<([^>]+)>\s*', re.IGNORECASE) + + +class FixedPointQuantizer(Layer): + def initialize(self): + inp = self.get_input_variable() + shape = inp.shape + dims = inp.dim_names + self.add_output_variable(shape, dims) + self.set_attr('n_in', self.get_input_variable().size()) + self.overrides = self.attributes['overrides'] + self.fusible = self.attributes['fusible'] + self.SAT, self.RND = self.attributes['SAT'], self.attributes['RND'] + self.mask_kbi = self.attributes.get('mask_kbi', None) + + +def to_hls4ml_fixed(fixed: str): + matched = re_parse_fixed.match(re_purge_prefix.sub('', fixed)) + assert matched is not None, f'Cannot parse {fixed}' + signed = matched.group(1) != 'u' + b, i, *args = matched.group(2).split(',') + b, i = int(b), int(i) + args = [arg.upper() for arg in args] + new_type = FixedPrecisionType(b, i, signed, *args) + # For some reason, __class__ is overwritten in hls4ml + return new_type + + +def userconf_ifdef(key: str, layer_name: str, model): + hls_config: dict = model.config.config['HLSConfig'] + layer_confs: dict = hls_config.get('LayerName', None) + if not layer_confs: + return False + layer_conf = layer_confs.get(layer_name, None) + if not layer_conf: + return False + # return key in layer_conf # Ideal case. Not for now. + if key.endswith('_t') and key != 'table_t': + # table_t cannot be defined in Precision, for some reason. + # On the other hand, result_t, weight_t, bias_t, accum_t cannot be declared explicitly outside Precision, for now. + # However, still assume that they can be defined explicitly outside Precision. + precision_conf = layer_conf.get('Precision', None) + if not precision_conf: + return key in layer_conf + return key[:-2] in precision_conf or key in layer_conf + + if key == 'parallelization_factor': + # Irregular config key name. + return 'ParallelizationFactor' in layer_conf + + return key in layer_conf + + +class EnforceProxyModelEmbeddedConfig(OptimizerPass): + def match(self, node: Layer): + if not isinstance(node, FixedPointQuantizer): + return False + if not node.overrides: + return False + return True + + def transform(self, model, node: FixedPointQuantizer): + if 'layers' not in node.overrides: + return False + + graph_changed = False + layers = node.overrides['layers'] + for name, conf in layers.items(): + conf: dict[str, str] + name: str + if name not in model.graph: + # Some layer may be removed by other passes. (e.g. Final flatten layer) + continue + target_node: Layer = model.graph[name] + for k, v in conf.items(): + if userconf_ifdef(k, name, model): + warn( + f'Config key {k} is defined in hls_config for layer {name} by user.
Proxy model config is ignored.', + stacklevel=1, + ) + continue + + if k.endswith('_t'): + var_type = target_node.get_attr(k) # type: ignore + if var_type is None: + continue + var_type: NamedType + precision = to_hls4ml_fixed(v) + var_type.precision = precision + if k == 'result_t': + type_name = f'{name}_t' + else: + type_name = f'{name}_{k}' + var_type.name = type_name + # Need to overwrite kernel/bias writing precision also, or written weights will likely be wrong. + if k[:-2] in target_node.attributes.keys(): + weight_var: WeightVariable = target_node.attributes[k[:-2]] + # weight_var should be a StaticWeightVariable, which is again, defined with meta programming + # Type hinting using StaticWeightVariableDefinition which is the base class. + weight_var.update_precision(precision) + # Well, it turned out that there is yet ANOTHER copy saved in config. + model.config.layer_name_precision[f'{name}_{k[:-2]}'] = v + elif k in target_node.attributes.attributes: + target_node.set_attr(k, v) + elif k == 'parallelization_factor': + target_node.set_attr(k, int(v)) + + if linear_node := model.graph.get(f'{name}_linear'): + # Proxy model does not assume any extra linear layer. + # Purge them on sight + model.remove_node(linear_node) + graph_changed = True + + return graph_changed + + +def register_hgq_proxy_model(): + register_layer('FixedPointQuantizer', FixedPointQuantizer) + register_pass('enforce_proxy_model_embedded_config', EnforceProxyModelEmbeddedConfig) diff --git a/hls4ml/utils/fixed_point_quantizer.py b/hls4ml/utils/fixed_point_quantizer.py new file mode 100644 index 0000000000..628e7bb4f9 --- /dev/null +++ b/hls4ml/utils/fixed_point_quantizer.py @@ -0,0 +1,253 @@ +from typing import Callable + +import keras +import numpy as np +import tensorflow as tf +from keras import backend as K + +# Nice figure (Figures 2 and 3) from https://www.researchgate.net/publication/226964494_Formalization_of_Fixed-Point_Arithmetic_in_HOL to illustrate the rounding and saturation modes. # noqa: E501 + + +def TRN(x): + # Truncate towards negative infinity. Fast. Preferred when possible. + return tf.floor(x) + + +def RND(x): + # Round to nearest, ties towards positive infinity. + # Can be reduced to TRN with a bias. + return tf.floor(x + 0.5) # type:ignore + + +def RND_CONV(x): + # Round to nearest integer, ties to even. + return tf.round(x) + + +def TRN_ZERO(x): + # Truncate towards zero. + sign = K.sign(x) + return tf.floor(K.abs(x)) * sign + + +def RND_ZERO(x): + # Round to nearest, ties to zero. + sign = K.sign(x) + return -tf.floor(-K.abs(x) + 0.5) * sign + + +def RND_MIN_INF(x): + # Round to nearest, ties to negative infinity. + return -tf.floor(-x + 0.5) # type: ignore + + +def RND_INF(x): + # Round to nearest, ties away from zero. + sign = K.sign(x) + return tf.floor(K.abs(x) + 0.5) * sign + + +def SAT(x, k, b): + # Saturate between highest and lowest representable values. + high = 2 ** (b - k) - 1 + low = -(high + 1) * k + return tf.clip_by_value(x, low, high) + + +def SAT_ZERO(x, k, b): + # Overflow to zero. + high = 2 ** (b - k) - 1 + low = (-high - 1) * k + mask = tf.cast((x <= high) & (x >= low), 'float32') + return x * mask + + +def SAT_SYM(x, k, b): + # Saturate between highest and lowest representable values when unsigned; between highest and -highest when signed. + high = 2 ** (b - k) - 1 + low = -high * k + return tf.clip_by_value(x, low, high) + + +def WRAP(x, k, b): + # Wrap around.
+ high = 2 ** (b - k) - 1 + low = -(high + 1) * k + return tf.math.floormod(x - low, high - low + 1) + low + + +def WRAP_SYM(x, k, b): + # High and low bounds are reflective. On overflow, this produces less distortion than WRAP, but still more than SAT. # noqa: E501 + dtype = x.dtype + high = 2 ** (b - k) - 1 + low = -(high + 1) * k + interval = (high - low + 1) * 2 + mapped = K.cast(tf.math.floormod(x - high - 1, interval), 'float32') + return K.cast(K.abs(mapped - interval / 2 + 0.5) - 0.5 + low, dtype) + + +RND_MAP = { + 'RND': RND, + 'RND_ZERO': RND_ZERO, + 'RND_MIN_INF': RND_MIN_INF, + 'RND_INF': RND_INF, + 'RND_CONV': RND_CONV, + 'TRN_ZERO': TRN_ZERO, + 'TRN': TRN, +} + +SAT_MAP = { + 'SAT': SAT, + 'SAT_ZERO': SAT_ZERO, + 'SAT_SYM': SAT_SYM, + 'WRAP': WRAP, + 'WRAP_SYM': WRAP_SYM, +} + + +@tf.function(autograph=False, jit_compile=True) +def gfixed_quantizer(x, keep_negative, bits, integer_bits, RND='TRN', SAT='WRAP'): + '''Generalized fixed point quantizer, should have the same behavior as ap_fixed/ap_ufixed. + Support high granularity quantization and broadcasting of bitwidths. RND and SAT mode must be strings.''' + + keep_negative = tf.cast(keep_negative, 'float32') + bits = tf.cast(bits, 'float32') + integer_bits = tf.cast(integer_bits, dtype='float32') + + two = tf.constant(2, dtype='float32') + float_bits = bits - integer_bits # type:ignore + scale = tf.pow(two, float_bits) + + scaled_input = x * scale + rnd, sat = RND_MAP[RND], SAT_MAP[SAT] + quantized = sat(rnd(scaled_input), keep_negative, bits) + return quantized / scale * tf.cast(bits != 0, 'float32') + + +def gfixed(keep_negative, bits, integer_bits, RND='TRN', SAT='WRAP') -> Callable: + '''Functional form of generalized fixed point quantizer, should have the same behavior as ap_fixed/ap_ufixed. + Support high granularity quantization and broadcasting of bitwidths. RND and SAT mode must be strings.''' + + def compute(x): + return gfixed_quantizer(x, keep_negative, bits, integer_bits, RND, SAT) # type:ignore + + return compute + + +def ufixed(bits, integer_bits, RND='TRN', SAT='WRAP') -> Callable: + """Syntactic sugar for gfixed(0, bits, integer_bits, RND, SAT).""" + return gfixed(0, bits, integer_bits, RND, SAT) + + +def fixed(bits, integer_bits, RND='TRN', SAT='WRAP') -> Callable: + """Syntactic sugar for gfixed(1, bits, integer_bits, RND, SAT).""" + return gfixed(1, bits, integer_bits, RND, SAT) + + +class FixedPointQuantizer(keras.layers.Layer): + """Fixed point quantizer layer. This layer is not trainable. It is used as a proxy layer when converting a trained model into hls4ml readable form, and can also be used for bit-accurate hls4ml model emulation (up to fp32 representable precision). + + This class is not intended to be instantiated by users. + + Properties: + - overrides: dict. Stores the precision overrides for layers. Currently only `overrides/layers/{layer_name}` field is used. + - fusible: bool, property method. If True, this quantizer can be deleted and fused into the layer before it. + - heterogeneous: bool, property method. If True, this quantizer has different bitwidths for different positions. + - result_t_kif: tuple of int. The (keep_negative, integer_bits, float_bits) of the quantized result. + - keep_negative: tf.Variable. The keep_negative flag for each position. + - bits: tf.Variable. The total bitwidth for each position. + - integers: tf.Variable. The integer bitwidth for each position. + - RND: str. The rounding mode. Only 'TRN' and 'RND' are fully tested. + - SAT: str. The saturation mode.
Only 'WRAP' and 'SAT' are fully tested. + """ # noqa: E501 + + def __init__( + self, + keep_negative, + bits, + integers, + RND: str = 'TRN', + SAT: str = 'WRAP', + overrides: dict | None = None, + accum_bits_bias=None, + **kwargs, + ): + zeros = bits == 0 + keep_negative = tf.where(zeros, tf.zeros_like(keep_negative), keep_negative) + integers = tf.where(zeros, tf.zeros_like(integers), integers) + self.keep_negative = tf.Variable(keep_negative, dtype='int8', name='keep_negative', trainable=False) + self.bits = tf.Variable(bits, dtype='int8', name='bits', trainable=False) + self.integers = tf.Variable(integers, dtype='int8', name='integers', trainable=False) + + msg = f'Shapes mismatch: keep_negative, bits, and integers must have the same shape. Got {self.keep_negative.shape}, {self.bits.shape}, {self.integers.shape}.' # noqa: E501 + assert self.keep_negative.shape == self.bits.shape == self.integers.shape, msg + + self.accum_bits_bias = accum_bits_bias + self.RND = RND + self.SAT = SAT + + self.overrides = overrides or {'layers': {}} + kwargs.pop('trainable', None) + self._quantizer_created = False + + super().__init__(trainable=False, **kwargs) + + def call(self, x): + if not self.built: + self.build(x.shape) + return gfixed_quantizer(x, self.keep_negative, self.bits, self.integers, self.RND, self.SAT) # type:ignore + + @property + def result_t_kif(self): + k, i, f = self.keep_negative, self.integers - self.keep_negative, self.bits - self.integers # type:ignore + k, i, f = np.max(k), np.max(i), np.max(f) # type:ignore + return k, i, f + + @property + def fusible(self): + """Delete this quantizer if no heterogeneity is detected.""" + assert ( + len(self._inbound_nodes) == 1 + ), 'FixedPointQuantizer must not be reused. Create proxy model only via proviced functions.' + last_layer = self._inbound_nodes[0].inbound_layers + assert not isinstance( + last_layer, list + ), f'FixedPointQuantizer has exactly one inbound layer. Got a list of {len(last_layer)} layers.' + if len(last_layer._outbound_nodes) != 1: + return False + return not self.heterogeneous + + @property + def heterogeneous(self): + k0, b0, i0 = tf.reduce_max(self.keep_negative), tf.reduce_max(self.bits), tf.reduce_max(self.integers) + if not tf.reduce_all(self.keep_negative == k0): + return True + if not tf.reduce_all(self.bits == b0): + return True + if not tf.reduce_all(self.integers == i0): + return True + return False + + def get_config(self): + assert tf.reduce_all( + (self.keep_negative == 0) | (self.keep_negative == 1) + ), 'Illegal bitwidth config: keep_negative must be 0 or 1.' + assert tf.reduce_all(self.bits >= 0), 'Illegal bitwidth config: bits must be non-negative.' 
# type:ignore + conf = super().get_config() + conf['RND'] = self.RND + conf['SAT'] = self.SAT + conf['shape'] = tuple(self.bits.shape) + overrides = self.overrides + + conf['overrides'] = overrides + conf['fusible'] = self.fusible + return conf + + @classmethod + def from_config(cls, config: dict): + dummy_v = np.full(config.pop('shape'), -128, dtype='int8') + keep_negative = K.variable(dummy_v, dtype='int8', name='keep_negative') + bits = K.variable(dummy_v, dtype='int8', name='bits') + integers = K.variable(dummy_v, dtype='int8', name='integers') + config.pop('fusible', None) + return cls(keep_negative, bits, integers, **config) diff --git a/test/pytest/test_hgq_proxy_model.py b/test/pytest/test_hgq_proxy_model.py new file mode 100644 index 0000000000..bd5963a58e --- /dev/null +++ b/test/pytest/test_hgq_proxy_model.py @@ -0,0 +1,164 @@ +from pathlib import Path + +import numpy as np +import pytest +import tensorflow as tf +from sklearn.datasets import fetch_openml +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from tensorflow import keras + +from hls4ml.converters import convert_from_keras_model +from hls4ml.utils.fixed_point_quantizer import FixedPointQuantizer + +################################################################# +# Proxy model is implemented as a submodule of HGQ. # +# As HGQ requires python>=3.10,<3.12, and tensorflow==2.13, # +# As the current testing environment is based on python==3.8, # +# HGQ cannot be marked as a dependency at the moment. # +################################################################# + + +test_root_path = Path(__file__).parent +example_model_path = test_root_path.parent.parent / 'example-models' + + +@pytest.fixture(scope='module') +def jet_classifier_model(): + with open(example_model_path / 'keras/proxy_jet_classifier.json') as f: + model_config = f.read() + co = {'FixedPointQuantizer': FixedPointQuantizer} + model: keras.Model = keras.models.model_from_json(model_config, custom_objects=co) # type: ignore + model.load_weights(example_model_path / 'keras/proxy_jet_classifier.h5') + return model + + +@pytest.fixture(scope='module') +def jet_classifier_data(): + print('Fetching data...') + data = fetch_openml('hls4ml_lhc_jets_hlf') + + X, y = data['data'], data['target'] + codecs = {'g': 0, 'q': 1, 't': 4, 'w': 2, 'z': 3} + y = np.array([codecs[i] for i in y]) + + X_train_val, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + X_train_val, X_test = X_train_val.astype(np.float32), X_test.astype(np.float32) + + scaler = StandardScaler() + X_train_val = scaler.fit_transform(X_train_val) + X_test = scaler.transform(X_test) + + X_test = np.ascontiguousarray(X_test) + return X_test, y_test + + +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) +@pytest.mark.parametrize('io_type', ['io_parallel']) +@pytest.mark.parametrize('overflow', [True, False]) +def test_proxy_jet_classifier(jet_classifier_model, jet_classifier_data, backend: str, io_type: str, overflow: bool): + X, y = jet_classifier_data + if overflow: + X *= 2 # This will cause overflow + + output_dir = str(test_root_path / f'hls4mlprj_proxy_jet_classifier_{backend}_{io_type}_{overflow}') + hls_config = {'Model': {'Precision': 'fixed<1,0>', 'ReuseFactor': 1}} + model_hls = convert_from_keras_model( + jet_classifier_model, backend=backend, output_dir=output_dir, hls_config=hls_config, io_type=io_type + ) + model_hls.compile() + + r_hls = model_hls.predict(X) + r_keras = jet_classifier_model(X).numpy() 
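+    # The proxy model is meant to emulate the quantized arithmetic bit-accurately,
+    # so in the non-overflowing case the HLS and Keras outputs are expected to
+    # match exactly below, not merely to within a floating-point tolerance.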
+    acc = np.mean(np.argmax(r_hls, axis=1) == y)
+
+    if overflow:
+        assert acc < 0.7
+    if not overflow and io_type == 'io_parallel':
+        assert 0.750 < acc < 0.751
+        assert np.all(r_hls == r_keras)
+
+
+def get_mnist_model_stream():
+    with open(example_model_path / 'keras/proxy_mnist_homogeneous_act.json') as f:
+        model_config = f.read()
+    co = {'FixedPointQuantizer': FixedPointQuantizer}
+    model: keras.Model = keras.models.model_from_json(model_config, custom_objects=co)  # type: ignore
+    model.load_weights(example_model_path / 'keras/proxy_mnist_homogeneous_act.h5')
+    return model
+
+
+def get_mnist_model_parallel():
+    with open(example_model_path / 'keras/proxy_mnist_heterogeneous_act.json') as f:
+        model_config = f.read()
+    co = {'FixedPointQuantizer': FixedPointQuantizer}
+    model: keras.Model = keras.models.model_from_json(model_config, custom_objects=co)  # type: ignore
+    model.load_weights(example_model_path / 'keras/proxy_mnist_heterogeneous_act.h5')
+    return model
+
+
+@pytest.fixture(scope='module')
+def mnist_data():
+    mnist = tf.keras.datasets.mnist
+    _, (X_test, y_test) = mnist.load_data()
+    X_test = (X_test / 255.0).astype(np.float32)
+    X_test = np.ascontiguousarray(X_test)
+    return X_test, y_test
+
+
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus'])
+@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
+@pytest.mark.parametrize('overflow', [True, False])
+def test_proxy_mnist(mnist_data, backend: str, io_type: str, overflow: bool):
+    X, y = mnist_data
+    if overflow:
+        X *= 2  # This will cause overflow
+
+    print(X[0].mean())
+    if backend.lower() != 'quartus':
+        model = get_mnist_model_stream() if io_type == 'io_stream' else get_mnist_model_parallel()
+    else:
+        # Codegen is not working for the Quartus backend, so intra-layer heterogeneous activation quantization is not possible.
+        # Only use the stream-compatible model, in which all quantizer layers are fusible (homogeneous, and the preceding layer has no sibling).
+        model = get_mnist_model_stream()
+
+    output_dir = str(test_root_path / f'hls4mlprj_proxy_mnist_{backend}_{io_type}_{overflow}')
+    hls_config = {
+        'Strategy': 'Latency',
+        'Model': {'Precision': 'fixed<1,0>', 'ReuseFactor': 1},
+    }  # Accum for io_stream is not fixed. Set a large number as placeholder.
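+    # The deliberately tiny 1-bit model-wide default likely serves as a canary:
+    # every precision that matters is expected to come from the proxy model's
+    # embedded overrides, so a layer silently falling back to 'fixed<1,0>'
+    # would show up in the accuracy checks below.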
+ + model_hls = convert_from_keras_model( + model, backend=backend, output_dir=output_dir, hls_config=hls_config, io_type=io_type + ) + + if backend.lower() != 'quartus': + if io_type == 'io_parallel': + # Check parallel factor is propagated to the hls model + assert model_hls.graph['h_conv2d'].attributes.attributes['n_partitions'] == 1 + assert model_hls.graph['h_conv2d_1'].attributes.attributes['n_partitions'] == 1 + else: + assert model_hls.graph['h_conv2d_2'].attributes.attributes['n_partitions'] == 26**2 + assert model_hls.graph['h_conv2d_3'].attributes.attributes['n_partitions'] == 11**2 + else: + # n_partitions is not used in Quartus backend + assert model_hls.graph['h_conv2d_2'].attributes.attributes['parallelization'] == 1 + assert model_hls.graph['h_conv2d_3'].attributes.attributes['parallelization'] == 1 + + model_hls.compile() + r_keras = model(X).numpy() # type: ignore + acc = np.mean(np.argmax(r_keras, axis=1) == y) + + if overflow: + assert acc < 0.9 + else: + if io_type == 'io_parallel' and backend.lower() != 'quartus': + assert 0.927 < acc < 0.928 + else: + assert 0.957 < acc < 0.958 + + r_hls = model_hls.predict(X) + mismatch_ph = r_hls != r_keras + assert np.all( + r_hls == r_keras + ), f"Proxy-HLS4ML mismatch for out: {np.sum(np.any(mismatch_ph,axis=1))} out of {len(X)} samples are different. Sample: {r_keras[mismatch_ph].ravel()[:5]} vs {r_hls[mismatch_ph].ravel()[:5]}" # noqa: From 7f19b03a761ce2398858f60463a304f105498d7e Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Mon, 8 Jan 2024 22:13:11 -0800 Subject: [PATCH 098/272] Add UnaryLUT support Support UnaryLUT w/ vivado&io_parallel hotfix format support vivado_stream fix syn issue fix syn issue2 Revert unnecessary "fix syn issue1/2" Revert "fix syn issue2" This reverts commit af3c3470b138e6abd0fe0c38feaef5017ee2b93c. Revert "fix syn issue" This reverts commit 532cb9d83e3b8212ba3158a2339d8ceb9905debf. 
rm redundant pipeline pragma unary lut vitis fix unary lut vitis fix - leftover --- .../backends/fpga/passes/hgq_proxy_model.py | 22 +++++++++++++-- .../backends/quartus/passes/core_templates.py | 3 +- .../backends/vivado/passes/core_templates.py | 3 +- hls4ml/converters/keras/hgq_proxy_model.py | 20 +++++++++++-- hls4ml/converters/keras_to_hls.py | 1 + .../model/optimizer/passes/hgq_proxy_model.py | 15 ++++++++++ .../vivado/nnet_utils/nnet_activation.h | 22 +++++++++++++++ .../nnet_utils/nnet_activation_stream.h | 28 +++++++++++++++++++ 8 files changed, 108 insertions(+), 6 deletions(-) diff --git a/hls4ml/backends/fpga/passes/hgq_proxy_model.py b/hls4ml/backends/fpga/passes/hgq_proxy_model.py index af172e62f9..a58c9f43db 100644 --- a/hls4ml/backends/fpga/passes/hgq_proxy_model.py +++ b/hls4ml/backends/fpga/passes/hgq_proxy_model.py @@ -4,7 +4,7 @@ from hls4ml.backends.template import FunctionCallTemplate from hls4ml.model.layers import Layer from hls4ml.model.optimizer import OptimizerPass -from hls4ml.model.optimizer.passes.hgq_proxy_model import FixedPointQuantizer +from hls4ml.model.optimizer.passes.hgq_proxy_model import FixedPointQuantizer, UnaryLUT from hls4ml.model.types import Source @@ -40,7 +40,6 @@ def generate_mask_fn( template void {name}(input_t *inp, output_t *out) {{ #pragma HLS INLINE - #pragma HLS PIPELINE {body} }} @@ -84,6 +83,25 @@ def format(self, node): return self.template.format(**params) +class ProcessUnaryLUTCall(FunctionCallTemplate): + def __init__(self): + super().__init__(UnaryLUT, include_header=[]) + self.template = 'nnet::unary_lut<{input_t}, {output_t}, {config}>({input}, {output}, {table});' + self.include_header = [ + 'nnet_utils/nnet_activation.h', + 'nnet_utils/nnet_activation_stream.h', + ] + + def format(self, node): + params = self._default_function_params(node) + node.attributes['result_t'].precision = node.attributes['table_t'].precision + params['config'] = f'unary_lut_config{node.index}' + params['table'] = node.get_weights('table').name + + return self.template.format(**params) + + def register_hgq_proxy_model(backend: Backend): backend.register_pass('process_fixed_point_quantizer_layer', ProcessFixedPointQuantizerLayer) backend.register_template(ProcessFixedPointQuantizerCall) + backend.register_template(ProcessUnaryLUTCall) diff --git a/hls4ml/backends/quartus/passes/core_templates.py b/hls4ml/backends/quartus/passes/core_templates.py index aece9fc226..d6998c9ab2 100644 --- a/hls4ml/backends/quartus/passes/core_templates.py +++ b/hls4ml/backends/quartus/passes/core_templates.py @@ -1,6 +1,7 @@ from hls4ml.backends.backend import get_backend from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax +from hls4ml.model.optimizer.passes.hgq_proxy_model import UnaryLUT # Dense templates @@ -152,7 +153,7 @@ def format(self, node): class ActivationConfigTemplate(LayerConfigTemplate): def __init__(self): - super().__init__((Activation, ParametrizedActivation, PReLU)) + super().__init__((Activation, ParametrizedActivation, PReLU, UnaryLUT)) self.template = activ_config_template def format(self, node): diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index c8119c0c2e..268293dd1e 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -1,6 +1,7 @@ from hls4ml.backends.backend import 
get_backend from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax +from hls4ml.model.optimizer.passes.hgq_proxy_model import UnaryLUT # Dense templates @@ -144,7 +145,7 @@ def format(self, node): class ActivationConfigTemplate(LayerConfigTemplate): def __init__(self): - super().__init__((Activation, ParametrizedActivation, PReLU)) + super().__init__((Activation, ParametrizedActivation, PReLU, UnaryLUT)) self.template = activ_config_template def format(self, node): diff --git a/hls4ml/converters/keras/hgq_proxy_model.py b/hls4ml/converters/keras/hgq_proxy_model.py index 235dd50269..5a1e3fc2a1 100644 --- a/hls4ml/converters/keras/hgq_proxy_model.py +++ b/hls4ml/converters/keras/hgq_proxy_model.py @@ -1,8 +1,8 @@ -from hls4ml.converters.keras_to_hls import keras_handler, parse_default_keras_layer +from hls4ml.converters.keras_to_hls import KerasReader, keras_handler, parse_default_keras_layer @keras_handler('FixedPointQuantizer') -def fixedpoint_quantizer_handler(keras_layer, input_names, input_shapes, data_reader): +def fixedpoint_quantizer_handler(keras_layer, input_names, input_shapes, data_reader: KerasReader): config = parse_default_keras_layer(keras_layer, input_names) name = config['name'] @@ -19,3 +19,19 @@ def fixedpoint_quantizer_handler(keras_layer, input_names, input_shapes, data_re layer = config return layer, input_shapes[0] + + +@keras_handler('UnaryLUT') +def unary_lut_keras_handler(keras_layer, input_names, input_shapes, data_reader: KerasReader): + config = parse_default_keras_layer(keras_layer, input_names) + + table = data_reader.get_weights_data(config['name'], 'table') + k, i, f = keras_layer['config']['kif_out'] + k, b, i = k, k + i + f, k + i + config['table_t'] = f'{"" if k else "u"}fixed<{b},{i}>' + config['table'] = table + config['table_size'] = len(table) + config['activation'] = 'unary_lut' + + layer = config + return layer, input_shapes[0] diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index 1d2376f576..0357b6fa8c 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -205,6 +205,7 @@ def parse_keras_model(model_arch, reader): 'Softmax', 'TernaryTanh', 'HardActivation', + 'UnaryLUT', ] # Recurrent layers recurrent_layers = ['SimpleRNN', 'LSTM', 'GRU'] diff --git a/hls4ml/model/optimizer/passes/hgq_proxy_model.py b/hls4ml/model/optimizer/passes/hgq_proxy_model.py index da5f37ed64..cf2d96ab23 100644 --- a/hls4ml/model/optimizer/passes/hgq_proxy_model.py +++ b/hls4ml/model/optimizer/passes/hgq_proxy_model.py @@ -23,6 +23,20 @@ def initialize(self): self.mask_kbi = self.attributes.get('mask_kbi', None) +class UnaryLUT(Layer): + def initialize(self): + inp = self.get_input_variable() + shape = inp.shape + dims = inp.dim_names + self.add_output_variable(shape, dims) + self.set_attr('n_in', inp.size()) + self.table = self.attributes['table'] + self.table_size = self.attributes['table_size'] + + table_t = to_hls4ml_fixed(self.attributes['table_t']) + self.add_weights_variable(name='table', var_name='table{index}', precision=table_t, data=self.table) + + def to_hls4ml_fixed(fixed: str): matched = re_parse_fixed.match(re_purge_prefix.sub('', fixed)) assert matched is not None, f'Cannot parse {fixed}' @@ -125,4 +139,5 @@ def transform(self, model, node: FixedPointQuantizer): def register_hgq_proxy_model(): register_layer('FixedPointQuantizer', 
FixedPointQuantizer) + register_layer('UnaryLUT', UnaryLUT) register_pass('enforce_proxy_model_embedded_config', EnforceProxyModelEmbeddedConfig) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h index 8baadf2897..da13998e38 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h @@ -441,6 +441,28 @@ template void tanh(data_T data[CO } } +// ************************************************* +// UnaryLUT Activation +// ************************************************* +template inline unsigned get_index_unary_lut(data_T x) { + // Slice the top N bits to get an index into the table + static constexpr int N = ceillog2(table_size); + return (unsigned)(x(x.width - 1, 0)); +} + +template +void unary_lut(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], + typename CONFIG_T::table_t table[CONFIG_T::table_size]) { + #pragma HLS function_instantiate variable=table + #pragma HLS ARRAY_PARTITION variable=table + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + #pragma HLS UNROLL + unsigned index = get_index_unary_lut(data[ii]); + res[ii] = (res_T)table[index]; + } +} + // ************************************************* // Hard sigmoid Activation // ************************************************* diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h index b72809eff9..4f12ee5cb4 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h @@ -412,6 +412,34 @@ template void tanh(hls::stream +void unary_lut(hls::stream &data, hls::stream &res, typename CONFIG_T::table_t table[CONFIG_T::table_size]) { + #pragma HLS function_instantiate variable=table + #pragma HLS ARRAY_PARTITION variable=table complete + +UnaryLUTActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + UnaryLUTPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + unsigned index = get_index_unary_lut(in_data[j].V); + out_data[j] = table[index]; + } + + res.write(out_data); + } +} + // ************************************************* // Hard sigmoid Activation // ************************************************* From 4586d4caca9f0350859e987b7d5fd091aaace702 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Wed, 24 Apr 2024 18:15:10 -0700 Subject: [PATCH 099/272] :boom: python 3.10 required --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 1911f3b328..34eeff32a4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,7 +32,7 @@ install_requires = tabulate tensorflow tensorflow-model-optimization<=0.7.5 -python_requires = >=3.8 +python_requires = >=3.10 include_package_data = True scripts = scripts/hls4ml From 51b46db5a002f0bc235e0ece4a9947935fef6efe Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Thu, 25 Apr 2024 14:15:31 -0700 Subject: [PATCH 100/272] add pooling precision setting in override --- hls4ml/model/optimizer/passes/hgq_proxy_model.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hls4ml/model/optimizer/passes/hgq_proxy_model.py b/hls4ml/model/optimizer/passes/hgq_proxy_model.py index cf2d96ab23..ccaf363fd4 100644 --- a/hls4ml/model/optimizer/passes/hgq_proxy_model.py +++ 
b/hls4ml/model/optimizer/passes/hgq_proxy_model.py @@ -4,7 +4,7 @@ from hls4ml.backends.fpga.fpga_types import NamedType from hls4ml.model.layers import Layer, register_layer from hls4ml.model.optimizer import OptimizerPass, register_pass -from hls4ml.model.types import FixedPrecisionType, WeightVariable +from hls4ml.model.types import FixedPrecisionType, UnspecifiedPrecisionType, WeightVariable re_purge_prefix = re.compile(r'(?]+)>\s*', re.IGNORECASE) @@ -95,6 +95,12 @@ def transform(self, model, node: FixedPointQuantizer): # Some layer may be removed by other passes. (e.g. Final flatten layer) continue target_node: Layer = model.graph[name] + + # Invoke automatic precision derivation for pooling layers accum_t, if undefined. + if 'pool' in target_node.__class__.__name__.lower(): + if not userconf_ifdef('accum_t', name, model): + target_node.attributes['accum_t'].precision = UnspecifiedPrecisionType() + for k, v in conf.items(): if userconf_ifdef(k, name, model): warn( From 1e3ef52cedbbc5862c7e2b7a71633abd41a1cd08 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Thu, 25 Apr 2024 14:18:36 -0700 Subject: [PATCH 101/272] add HGQ to optional dependency --- setup.cfg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.cfg b/setup.cfg index 34eeff32a4..9b7ef45f8f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,6 +41,8 @@ pytest_randomly.random_seeder = hls4ml = hls4ml:reseed [options.extras_require] +HGQ = + HGQ~=0.2.0 optimization = keras-tuner==1.1.3 ortools==9.4.1874 @@ -52,6 +54,7 @@ profiling = sr = sympy testing = + HGQ~=0.2.0 pytest pytest-cov pytest-randomly From 31e4e54226a19546bc0db20b75c8e770d97202d8 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Thu, 25 Apr 2024 19:21:35 -0700 Subject: [PATCH 102/272] support serializable hgq objects (tf>=2.13) --- hls4ml/converters/keras/hgq_proxy_model.py | 4 ++-- hls4ml/converters/keras_to_hls.py | 1 + hls4ml/model/optimizer/passes/hgq_proxy_model.py | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/hls4ml/converters/keras/hgq_proxy_model.py b/hls4ml/converters/keras/hgq_proxy_model.py index 5a1e3fc2a1..1598759253 100644 --- a/hls4ml/converters/keras/hgq_proxy_model.py +++ b/hls4ml/converters/keras/hgq_proxy_model.py @@ -1,7 +1,7 @@ from hls4ml.converters.keras_to_hls import KerasReader, keras_handler, parse_default_keras_layer -@keras_handler('FixedPointQuantizer') +@keras_handler('FixedPointQuantizer', 'HGQ>FixedPointQuantizer') def fixedpoint_quantizer_handler(keras_layer, input_names, input_shapes, data_reader: KerasReader): config = parse_default_keras_layer(keras_layer, input_names) @@ -21,7 +21,7 @@ def fixedpoint_quantizer_handler(keras_layer, input_names, input_shapes, data_re return layer, input_shapes[0] -@keras_handler('UnaryLUT') +@keras_handler('UnaryLUT', 'HGQ>UnaryLUT') def unary_lut_keras_handler(keras_layer, input_names, input_shapes, data_reader: KerasReader): config = parse_default_keras_layer(keras_layer, input_names) diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index 0357b6fa8c..f1150be15e 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -206,6 +206,7 @@ def parse_keras_model(model_arch, reader): 'TernaryTanh', 'HardActivation', 'UnaryLUT', + 'HGQ>UnaryLUT', ] # Recurrent layers recurrent_layers = ['SimpleRNN', 'LSTM', 'GRU'] diff --git a/hls4ml/model/optimizer/passes/hgq_proxy_model.py b/hls4ml/model/optimizer/passes/hgq_proxy_model.py index ccaf363fd4..13e48aac43 100644 --- a/hls4ml/model/optimizer/passes/hgq_proxy_model.py 
+++ b/hls4ml/model/optimizer/passes/hgq_proxy_model.py @@ -145,5 +145,7 @@ def transform(self, model, node: FixedPointQuantizer): def register_hgq_proxy_model(): register_layer('FixedPointQuantizer', FixedPointQuantizer) + register_layer('HGQ>FixedPointQuantizer', FixedPointQuantizer) register_layer('UnaryLUT', UnaryLUT) + register_layer('HGQ>UnaryLUT', UnaryLUT) register_pass('enforce_proxy_model_embedded_config', EnforceProxyModelEmbeddedConfig) From ba19daa5dca77777688759cb8f93d5a6677dc960 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Thu, 25 Apr 2024 22:26:44 -0700 Subject: [PATCH 103/272] update hgq tests --- hls4ml/utils/fixed_point_quantizer.py | 253 ------------------------ test/pytest/test_hgq_layers.py | 264 ++++++++++++++++++++++++++ test/pytest/test_hgq_proxy_model.py | 164 ---------------- 3 files changed, 264 insertions(+), 417 deletions(-) delete mode 100644 hls4ml/utils/fixed_point_quantizer.py create mode 100644 test/pytest/test_hgq_layers.py delete mode 100644 test/pytest/test_hgq_proxy_model.py diff --git a/hls4ml/utils/fixed_point_quantizer.py b/hls4ml/utils/fixed_point_quantizer.py deleted file mode 100644 index 628e7bb4f9..0000000000 --- a/hls4ml/utils/fixed_point_quantizer.py +++ /dev/null @@ -1,253 +0,0 @@ -from typing import Callable - -import keras -import numpy as np -import tensorflow as tf -from keras import backend as K - -# Nice figure (Figure. 2 and 3) from https://www.researchgate.net/publication/226964494_Formalization_of_Fixed-Point_Arithmetic_in_HOL to illustrate the rounding and saturation modes. # noqa: E501 - - -def TRN(x): - # Truncate towards negative infinity. Fast. Preferred when possible. - return tf.floor(x) - - -def RND(x): - # Round to nearest, ties to even. - # Can be reduced to TRN with a bias. - return tf.floor(x + 0.5) # type:ignore - - -def RND_CONV(x): - # towards nearest integer, ties to even. - return tf.round(x) - - -def TRN_ZERO(x): - # Truncate towards zero. - sign = K.sign(x) - return tf.floor(K.abs(x)) * sign - - -def RND_ZERO(x): - # Round to nearest, ties to zero. - sign = K.sign(x) - return -tf.floor(-K.abs(x) + 0.5) * sign - - -def RND_MIN_INF(x): - # Round to nearest, ties to negative infinity. - return -tf.floor(-x + 0.5) # type: ignore - - -def RND_INF(x): - # Round to nearest, ties away from zero. - sign = K.sign(x) - return tf.floor(K.abs(x) + 0.5) * sign - - -def SAT(x, k, b): - # Saturate between highest and lowest representable values. - high = 2 ** (b - k) - 1 - low = -(high + 1) * k - return tf.clip_by_value(x, low, high) - - -def SAT_ZERO(x, k, b): - # Overflow to zero. - high = 2 ** (b - k) - 1 - low = (-high - 1) * k - mask = tf.cast((x <= high) & (x >= low), 'float32') - return x * mask - - -def SAT_SYM(x, k, b): - # Saturate between highest and lowest representable values when unsigned; between highest and -highest when signed. - high = 2 ** (b - k) - 1 - low = -high * k - return tf.clip_by_value(x, low, high) - - -def WRAP(x, k, b): - # Wrap around. - high = 2 ** (b - k) - 1 - low = -(high + 1) * k - return tf.math.floormod(x - low, high - low + 1) + low - - -def WRAP_SYM(x, k, b): - # High and low bounds are reflective.When overflows, can be less trash than WARP but still more trash than SAT. 
# noqa: E501 - dtype = x.dtype - high = 2 ** (b - k) - 1 - low = -(high + 1) * k - interval = (high - low + 1) * 2 - mapped = K.cast(tf.math.floormod(x - high - 1, interval), 'float32') - return K.cast(K.abs(mapped - interval / 2 + 0.5) - 0.5 + low, dtype) - - -RND_MAP = { - 'RND': RND, - 'RND_ZERO': RND_ZERO, - 'RND_MIN_INF': RND_MIN_INF, - 'RND_INF': RND_INF, - 'RND_CONV': RND_CONV, - 'TRN_ZERO': TRN_ZERO, - 'TRN': TRN, -} - -SAT_MAP = { - 'SAT': SAT, - 'SAT_ZERO': SAT_ZERO, - 'SAT_SYM': SAT_SYM, - 'WRAP': WRAP, - 'WRAP_SYM': WRAP_SYM, -} - - -@tf.function(autograph=False, jit_compile=True) -def gfixed_quantizer(x, keep_negative, bits, integer_bits, RND='TRN', SAT='WRAP'): - '''Generalized fixed point quantizer, should have the same behavior to ap_fixed/ap_ufixed. - Support high granularity quantization and broadcasting of bitwidths. RND and SAT mode must be strings.''' - - keep_negative = tf.cast(keep_negative, 'float32') - bits = tf.cast(bits, 'float32') - integer_bits = tf.cast(integer_bits, dtype='float32') - - two = tf.constant(2, dtype='float32') - float_bits = bits - integer_bits # type:ignore - scale = tf.pow(two, float_bits) - - scaled_input = x * scale - rnd, sat = RND_MAP[RND], SAT_MAP[SAT] - quantized = sat(rnd(scaled_input), keep_negative, bits) - return quantized / scale * tf.cast(bits != 0, 'float32') - - -def gfixed(keep_negative, bits, integer_bits, RND='TRN', SAT='WRAP') -> Callable: - '''Functional form of generalized fixed point quantizer, should have the same behavior to ap_fixed/ap_ufixed. - Support high granularity quantization and broadcasting of bitwidths. RND and SAT mode must be strings.''' - - def compute(x): - return gfixed_quantizer(x, keep_negative, bits, integer_bits, RND, SAT) # type:ignore - - return compute - - -def ufixed(bits, integer_bits, RND='TRN', SAT='WRAP') -> Callable: - """Grammatical sugar for gfixed(0, bits, integer_bits, RND, SAT).""" - return gfixed(0, bits, integer_bits, RND, SAT) - - -def fixed(bits, integer_bits, RND='TRN', SAT='WRAP') -> Callable: - """Grammatical sugar for gfixed(1, bits, integer_bits, RND, SAT).""" - return gfixed(1, bits, integer_bits, RND, SAT) - - -class FixedPointQuantizer(keras.layers.Layer): - """Fixed point quantizer layer. This layer is not trainable. It is used as a proxy layer when converting a trained model into hls4ml readable form, and can also be used for bit-accurate hls4ml model emulation (up to fp32 representable precision). - - This class is not intended to be instantiated by users. - - Properties: - - overrides: dict. Stores the precision overrides for layers. Currently only `overrides/layers/{layer_name}` field is used. - - fusible: bool, property method. If True, this quantizer can be deleted and fused into the layer before it. - - heterogeneous: bool, property method. If True, this quantizer has different bitwidths for different position. - - result_t_kif: tuple of int. The (keep_negative, integer_bits, float_bits) of the quantized result. - - keep_negative: tf.Variable. The keep_negative flag for each position. - - bits: tf.Variable. The total bitwidth for each position. - - integers: tf.Variable. The integer bitwidth for each position. - - RND: str. The rounding mode. Only 'TRN' and 'RND' are fully tested. - - SAT: str. The saturation mode. Only 'WRAP' and 'SAT' are fully tested. 
- """ # noqa: E501 - - def __init__( - self, - keep_negative, - bits, - integers, - RND: str = 'TRN', - SAT: str = 'WRAP', - overrides: dict | None = None, - accum_bits_bias=None, - **kwargs, - ): - zeros = bits == 0 - keep_negative = tf.where(zeros, tf.zeros_like(keep_negative), keep_negative) - integers = tf.where(zeros, tf.zeros_like(integers), integers) - self.keep_negative = tf.Variable(keep_negative, dtype='int8', name='keep_negative', trainable=False) - self.bits = tf.Variable(bits, dtype='int8', name='bits', trainable=False) - self.integers = tf.Variable(integers, dtype='int8', name='integers', trainable=False) - - msg = f'Shapes mismatch: keep_negative, bits, and integers must have the same shape. Got {self.keep_negative.shape}, {self.bits.shape}, {self.integers.shape}.' # noqa: E501 - assert self.keep_negative.shape == self.bits.shape == self.integers.shape, msg - - self.accum_bits_bias = accum_bits_bias - self.RND = RND - self.SAT = SAT - - self.overrides = overrides or {'layers': {}} - kwargs.pop('trainable', None) - self._quantizer_created = False - - super().__init__(trainable=False, **kwargs) - - def call(self, x): - if not self.built: - self.build(x.shape) - return gfixed_quantizer(x, self.keep_negative, self.bits, self.integers, self.RND, self.SAT) # type:ignore - - @property - def result_t_kif(self): - k, i, f = self.keep_negative, self.integers - self.keep_negative, self.bits - self.integers # type:ignore - k, i, f = np.max(k), np.max(i), np.max(f) # type:ignore - return k, i, f - - @property - def fusible(self): - """Delete this quantizer if no heterogeneity is detected.""" - assert ( - len(self._inbound_nodes) == 1 - ), 'FixedPointQuantizer must not be reused. Create proxy model only via proviced functions.' - last_layer = self._inbound_nodes[0].inbound_layers - assert not isinstance( - last_layer, list - ), f'FixedPointQuantizer has exactly one inbound layer. Got a list of {len(last_layer)} layers.' - if len(last_layer._outbound_nodes) != 1: - return False - return not self.heterogeneous - - @property - def heterogeneous(self): - k0, b0, i0 = tf.reduce_max(self.keep_negative), tf.reduce_max(self.bits), tf.reduce_max(self.integers) - if not tf.reduce_all(self.keep_negative == k0): - return True - if not tf.reduce_all(self.bits == b0): - return True - if not tf.reduce_all(self.integers == i0): - return True - return False - - def get_config(self): - assert tf.reduce_all( - (self.keep_negative == 0) | (self.keep_negative == 1) - ), 'Illegal bitwidth config: keep_negative must be 0 or 1.' - assert tf.reduce_all(self.bits >= 0), 'Illegal bitwidth config: bits must be non-negative.' 
# type:ignore - conf = super().get_config() - conf['RND'] = self.RND - conf['SAT'] = self.SAT - conf['shape'] = tuple(self.bits.shape) - overrides = self.overrides - - conf['overrides'] = overrides - conf['fusible'] = self.fusible - return conf - - @classmethod - def from_config(cls, config: dict): - dummy_v = np.full(config.pop('shape'), -128, dtype='int8') - keep_negative = K.variable(dummy_v, dtype='int8', name='keep_negative') - bits = K.variable(dummy_v, dtype='int8', name='bits') - integers = K.variable(dummy_v, dtype='int8', name='integers') - config.pop('fusible', None) - return cls(keep_negative, bits, integers, **config) diff --git a/test/pytest/test_hgq_layers.py b/test/pytest/test_hgq_layers.py new file mode 100644 index 0000000000..b6d66f9d00 --- /dev/null +++ b/test/pytest/test_hgq_layers.py @@ -0,0 +1,264 @@ +from pathlib import Path + +import HGQ # noqa: F401 +import numpy as np +import pytest +import tensorflow as tf +from HGQ import get_default_paq_conf, set_default_paq_conf, trace_minmax +from HGQ.layers import ( # noqa: F401 + HConv1D, + HDense, + HQuantize, + PAvgPool1D, + PAvgPool2D, + PConcatenate, + PFlatten, + PMaxPool1D, + PMaxPool2D, + PReshape, + Signature, +) +from HGQ.proxy import to_proxy_model +from HGQ.proxy.fixed_point_quantizer import gfixed +from tensorflow import keras + +from hls4ml.converters import convert_from_keras_model + +# tf.config.experimental_run_functions_eagerly(True) # noqa + + +test_path = Path(__file__).parent + + +def _run_synth_match_test(proxy: keras.Model, data, io_type: str, backend: str, dir: str, cond=None): + + output_dir = dir + '/hls4ml_prj' + hls_model = convert_from_keras_model( + proxy, + io_type=io_type, + output_dir=output_dir, + backend=backend, + hls_config={'Model': {'Precision': 'fixed<1,0>', 'ReuseFactor': 1}}, + ) + hls_model.compile() + + data_len = data.shape[0] if isinstance(data, np.ndarray) else data[0].shape[0] + # Multiple output case. Check each output separately + if len(proxy.outputs) > 1: # type: ignore + r_proxy: list[np.ndarray] = [x.numpy() for x in proxy(data)] # type: ignore + r_hls: list[np.ndarray] = hls_model.predict(data) # type: ignore + r_hls = [x.reshape(r_proxy[i].shape) for i, x in enumerate(r_hls)] + else: + r_proxy: list[np.ndarray] = [proxy(data).numpy()] # type: ignore + r_hls: list[np.ndarray] = [hls_model.predict(data).reshape(r_proxy[0].shape)] # type: ignore + + errors = [] + for i, (p, h) in enumerate(zip(r_proxy, r_hls)): + try: + if cond is None: + mismatch_ph = p != h + assert ( + np.sum(mismatch_ph) == 0 + ), f"Proxy-HLS4ML mismatch for out {i}: {np.sum(np.any(mismatch_ph,axis=1))} out of {data_len} samples are different. 
Sample: {p[mismatch_ph].ravel()[:5]} vs {h[mismatch_ph].ravel()[:5]}" # noqa: E501 + else: + cond(p, h) + except AssertionError as e: + errors.append(e) + if len(errors) > 0: + msgs = [str(e) for e in errors] + raise AssertionError('\n'.join(msgs)) + + +def run_model_test( + model: keras.Model, cover_factor: float | None, data, io_type: str, backend: str, dir: str, aggressive: bool, cond=None +): + data_len = data.shape[0] if isinstance(data, np.ndarray) else data[0].shape[0] + if cover_factor is not None: + trace_minmax(model, data, cover_factor=cover_factor, bsz=data_len) + proxy = to_proxy_model(model, aggressive=aggressive, unary_lut_max_table_size=4096) + _run_synth_match_test(proxy, data, io_type, backend, dir, cond=cond) + + +def create_player_model(layer: str, rnd_strategy: str, io_type: str): + pa_config = get_default_paq_conf() + pa_config['rnd_strategy'] = rnd_strategy + pa_config['skip_dims'] = 'all' if io_type == 'io_stream' else 'batch' + set_default_paq_conf(pa_config) + + inp = keras.Input(shape=(15)) + if 'PConcatenate' in layer: + _inp = [HQuantize()(inp)] * 2 + out = eval(layer)(_inp) + out = HDense(15)(out) + return keras.Model(inp, out) + elif 'Signature' in layer: + _inp = eval(layer)(inp) + out = HDense(15)(_inp) + return keras.Model(inp, out) + elif 'Pool2D' in layer: + _inp = PReshape((3, 5, 1))(HQuantize()(inp)) + elif 'Pool1D' in layer: + _inp = PReshape((5, 3))(HQuantize()(inp)) + elif 'Dense' in layer or 'Activation' in layer: + _inp = HQuantize()(inp) + elif 'Flatten' in layer: + out = HQuantize()(inp) + out = PReshape((3, 5))(out) + out = HConv1D(2, 2)(out) + out = eval(layer)(out) + out = HDense(15)(out) + return keras.Model(inp, out) + else: + raise Exception(f'Please add test for {layer}') + + out = eval(layer)(_inp) + model = keras.Model(inp, out) + + for layer in model.layers: + # No weight bitwidths to randomize + # And activation bitwidths + if hasattr(layer, 'paq'): + fbw: tf.Variable = layer.paq.fbw + fbw.assign(tf.constant(np.random.uniform(4, 6, fbw.shape).astype(np.float32))) + + return model + + +def create_hlayer_model(layer: str, rnd_strategy: str, io_type: str): + pa_config = get_default_paq_conf() + pa_config['rnd_strategy'] = rnd_strategy + pa_config['skip_dims'] = 'all' if io_type == 'io_stream' else 'batch' + set_default_paq_conf(pa_config) + + inp = keras.Input(shape=(16)) + if 'Add' in layer: + _inp = [HQuantize()(inp)] * 2 + elif 'Conv2D' in layer: + _inp = PReshape((4, 4, 1))(HQuantize()(inp)) + elif 'Conv1D' in layer: + _inp = PReshape((16, 1))(HQuantize()(inp)) + elif 'Dense' in layer or 'Activation' in layer: + _inp = HQuantize()(inp) + else: + raise Exception(f'Please add test for {layer}') + + _layer = eval('HGQ.layers.' 
+ layer)
+    if hasattr(_layer, 'bias') and _layer.bias is not None:
+        bias: tf.Variable = _layer.bias
+        bias.assign(tf.constant(np.random.uniform(-4, 4, _layer.bias.shape).astype(np.float32)))
+
+    out = _layer(_inp)
+    model = keras.Model(inp, out)
+
+    for layer in model.layers:
+        # Randomize weight bitwidths
+        if hasattr(layer, 'kq'):
+            fbw: tf.Variable = layer.kq.fbw
+            fbw.assign(tf.constant(np.random.uniform(2, 6, fbw.shape).astype(np.float32)))
+        # And activation bitwidths
+        if hasattr(layer, 'paq'):
+            fbw: tf.Variable = layer.paq.fbw
+            fbw.assign(tf.constant(np.random.uniform(2, 6, fbw.shape).astype(np.float32)))
+
+    return model
+
+
+def get_data(shape: tuple[int, ...], v: float, max_scale: float):
+    rng = np.random.default_rng()
+    a1 = rng.uniform(-v, v, shape).astype(np.float32)
+    a2 = rng.uniform(0, max_scale, (1, shape[1])).astype(np.float32)
+    return (a1 * a2).astype(np.float32)
+
+
+def softmax_cond(proxy, hls):
+    match_percent = np.mean(np.argmax(proxy, axis=1) == np.argmax(hls, axis=1))
+    assert (
+        match_percent > 0.90
+    ), f"Proxy-HLS4ML mismatch: {(1-match_percent) * 100}% of samples are different. Sample: {proxy[:5]} vs {hls[:5]}"
+
+
+def custom_activation_fn(x):
+    return tf.sin(x) ** 2.0 - x  # type: ignore
+
+
+@pytest.mark.parametrize(
+    'layer',
+    [
+        "HDense(10)",
+        "HDense(10, use_bias=False)",
+        "HDenseBatchNorm(10)",
+        "HConv1D(2, 3, padding='same')",
+        "HConv1D(2, 3, padding='valid')",
+        "HConv1D(2, 3, padding='valid', use_bias=False)",
+        "HConv1D(2, 3, padding='valid', strides=2)",
+        "HConv1D(2, 3, padding='same', strides=2)",
+        "HConv1DBatchNorm(2, 3, padding='valid')",
+        "HConv2D(2, (3,3), padding='same')",
+        "HConv2D(2, (3,3), padding='valid')",
+        "HConv2D(2, (3,3), padding='valid', use_bias=False)",
+        "HConv2D(2, (3,3), padding='valid', strides=2)",
+        "HConv2D(2, (3,3), padding='same', strides=2)",
+        "HConv2DBatchNorm(2, (3,3), padding='valid')",
+        "HAdd()",
+        "HActivation('relu')",
+        # "HActivation('leaky_relu')",
+        "HActivation('tanh')",
+        "HActivation('sigmoid')",
+        # "HActivation('softmax')",
+        "HActivation(custom_activation_fn)",
+    ],
+)
+@pytest.mark.parametrize("N", [1000])
+@pytest.mark.parametrize("rnd_strategy", ['standard_round', 'floor'])
+@pytest.mark.parametrize("io_type", ['io_parallel', 'io_stream'])
+@pytest.mark.parametrize("cover_factor", [1.0])
+@pytest.mark.parametrize("aggressive", [True, False])
+@pytest.mark.parametrize("backend", ['vivado', 'vitis'])
+def test_syn_hlayers(layer, N: int, rnd_strategy: str, io_type: str, cover_factor: float, aggressive: bool, backend: str):
+    model = create_hlayer_model(layer=layer, rnd_strategy=rnd_strategy, io_type=io_type)
+    data = get_data((N, 16), 7, 1)
+
+    cond = None if 'softmax' not in layer else softmax_cond
+    path = test_path / f'hls4mlprj_hgq_{layer}_{rnd_strategy}_{io_type}_{aggressive}_{backend}'
+
+    run_model_test(model, cover_factor, data, io_type, backend, str(path), aggressive, cond=cond)
+
+
+@pytest.mark.parametrize(
+    'layer',
+    [
+        "PConcatenate()",
+        "PMaxPool1D(2, padding='same')",
+        "PMaxPool1D(4, padding='same')",
+        "PMaxPool2D((5,3), padding='same')",
+        "PMaxPool1D(2, padding='valid')",
+        "PMaxPool2D((2,3), padding='valid')",
+        "Signature(1,6,3)",
+        "PAvgPool1D(2, padding='same')",
+        "PAvgPool2D((1,2), padding='same')",
+        "PAvgPool2D((2,2), padding='same')",
+        "PAvgPool1D(2, padding='valid')",
+        "PAvgPool2D((1,2), padding='valid')",
+        "PAvgPool2D((2,2), padding='valid')",
+        "PFlatten()",
+    ],
+)
+@pytest.mark.parametrize("N", [1000])
+@pytest.mark.parametrize("rnd_strategy",
['floor', 'standard_round']) +@pytest.mark.parametrize("io_type", ['io_parallel', 'io_stream']) +@pytest.mark.parametrize("cover_factor", [1.0]) +@pytest.mark.parametrize("aggressive", [True, False]) +@pytest.mark.parametrize("backend", ['vivado', 'vitis']) +def test_syn_players(layer, N: int, rnd_strategy: str, io_type: str, cover_factor: float, aggressive: bool, backend: str): + model = create_player_model(layer=layer, rnd_strategy=rnd_strategy, io_type=io_type) + data = get_data((N, 15), 7, 1) + + path = test_path / f'hls4mlprj_hgq_{layer}_{rnd_strategy}_{io_type}_{aggressive}_{backend}' + + if 'Signature' in layer: + q = gfixed(1, 6, 3) + data = q(data).numpy() + if "padding='same'" in layer and io_type == 'io_stream': + pytest.skip("io_stream does not support padding='same' for pools at the moment") + + run_model_test(model, cover_factor, data, io_type, backend, str(path), aggressive) diff --git a/test/pytest/test_hgq_proxy_model.py b/test/pytest/test_hgq_proxy_model.py deleted file mode 100644 index bd5963a58e..0000000000 --- a/test/pytest/test_hgq_proxy_model.py +++ /dev/null @@ -1,164 +0,0 @@ -from pathlib import Path - -import numpy as np -import pytest -import tensorflow as tf -from sklearn.datasets import fetch_openml -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import StandardScaler -from tensorflow import keras - -from hls4ml.converters import convert_from_keras_model -from hls4ml.utils.fixed_point_quantizer import FixedPointQuantizer - -################################################################# -# Proxy model is implemented as a submodule of HGQ. # -# As HGQ requires python>=3.10,<3.12, and tensorflow==2.13, # -# As the current testing environment is based on python==3.8, # -# HGQ cannot be marked as a dependency at the moment. 
# -################################################################# - - -test_root_path = Path(__file__).parent -example_model_path = test_root_path.parent.parent / 'example-models' - - -@pytest.fixture(scope='module') -def jet_classifier_model(): - with open(example_model_path / 'keras/proxy_jet_classifier.json') as f: - model_config = f.read() - co = {'FixedPointQuantizer': FixedPointQuantizer} - model: keras.Model = keras.models.model_from_json(model_config, custom_objects=co) # type: ignore - model.load_weights(example_model_path / 'keras/proxy_jet_classifier.h5') - return model - - -@pytest.fixture(scope='module') -def jet_classifier_data(): - print('Fetching data...') - data = fetch_openml('hls4ml_lhc_jets_hlf') - - X, y = data['data'], data['target'] - codecs = {'g': 0, 'q': 1, 't': 4, 'w': 2, 'z': 3} - y = np.array([codecs[i] for i in y]) - - X_train_val, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - X_train_val, X_test = X_train_val.astype(np.float32), X_test.astype(np.float32) - - scaler = StandardScaler() - X_train_val = scaler.fit_transform(X_train_val) - X_test = scaler.transform(X_test) - - X_test = np.ascontiguousarray(X_test) - return X_test, y_test - - -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) -@pytest.mark.parametrize('io_type', ['io_parallel']) -@pytest.mark.parametrize('overflow', [True, False]) -def test_proxy_jet_classifier(jet_classifier_model, jet_classifier_data, backend: str, io_type: str, overflow: bool): - X, y = jet_classifier_data - if overflow: - X *= 2 # This will cause overflow - - output_dir = str(test_root_path / f'hls4mlprj_proxy_jet_classifier_{backend}_{io_type}_{overflow}') - hls_config = {'Model': {'Precision': 'fixed<1,0>', 'ReuseFactor': 1}} - model_hls = convert_from_keras_model( - jet_classifier_model, backend=backend, output_dir=output_dir, hls_config=hls_config, io_type=io_type - ) - model_hls.compile() - - r_hls = model_hls.predict(X) - r_keras = jet_classifier_model(X).numpy() - acc = np.mean(np.argmax(r_hls, axis=1) == y) - - if overflow: - assert acc < 0.7 - if not overflow and io_type == 'io_parallel': - assert 0.750 < acc < 0.751 - assert np.all(r_hls == r_keras) - - -def get_mnist_model_stream(): - with open(example_model_path / 'keras/proxy_mnist_homogeneous_act.json') as f: - model_config = f.read() - co = {'FixedPointQuantizer': FixedPointQuantizer} - model: keras.Model = keras.models.model_from_json(model_config, custom_objects=co) # type: ignore - model.load_weights(example_model_path / 'keras/proxy_mnist_homogeneous_act.h5') - return model - - -def get_mnist_model_parallel(): - with open(example_model_path / 'keras/proxy_mnist_heterogeneous_act.json') as f: - model_config = f.read() - co = {'FixedPointQuantizer': FixedPointQuantizer} - model: keras.Model = keras.models.model_from_json(model_config, custom_objects=co) # type: ignore - model.load_weights(example_model_path / 'keras/proxy_mnist_heterogeneous_act.h5') - return model - - -@pytest.fixture(scope='module') -def mnist_data(): - mnist = tf.keras.datasets.mnist - _, (X_test, y_test) = mnist.load_data() - X_test = (X_test / 255.0).astype(np.float32) - X_test = np.ascontiguousarray(X_test) - return X_test, y_test - - -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) -@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('overflow', [True, False]) -def test_proxy_mnist(mnist_data, backend: str, io_type: str, overflow: bool): - X, y = mnist_data - if overflow: - X *= 2 # This 
will cause overflow - - print(X[0].mean()) - if backend.lower() != 'quartus': - model = get_mnist_model_stream() if io_type == 'io_stream' else get_mnist_model_parallel() - else: - # Codegen is not working for Quartus backend, intra-layer heterogeneous activation quantization not possible. - # Only use stream-compatible model, in which all quantizer layers are fusible (homogeneous + layer has no sibling) - model = get_mnist_model_stream() - - output_dir = str(test_root_path / f'hls4mlprj_proxy_mnist_{backend}_{io_type}_{overflow}') - hls_config = { - 'Strategy': 'Latency', - 'Model': {'Precision': 'fixed<1,0>', 'ReuseFactor': 1}, - } # Accum for io_stream is not fixed. Set a large number as placeholder. - - model_hls = convert_from_keras_model( - model, backend=backend, output_dir=output_dir, hls_config=hls_config, io_type=io_type - ) - - if backend.lower() != 'quartus': - if io_type == 'io_parallel': - # Check parallel factor is propagated to the hls model - assert model_hls.graph['h_conv2d'].attributes.attributes['n_partitions'] == 1 - assert model_hls.graph['h_conv2d_1'].attributes.attributes['n_partitions'] == 1 - else: - assert model_hls.graph['h_conv2d_2'].attributes.attributes['n_partitions'] == 26**2 - assert model_hls.graph['h_conv2d_3'].attributes.attributes['n_partitions'] == 11**2 - else: - # n_partitions is not used in Quartus backend - assert model_hls.graph['h_conv2d_2'].attributes.attributes['parallelization'] == 1 - assert model_hls.graph['h_conv2d_3'].attributes.attributes['parallelization'] == 1 - - model_hls.compile() - r_keras = model(X).numpy() # type: ignore - acc = np.mean(np.argmax(r_keras, axis=1) == y) - - if overflow: - assert acc < 0.9 - else: - if io_type == 'io_parallel' and backend.lower() != 'quartus': - assert 0.927 < acc < 0.928 - else: - assert 0.957 < acc < 0.958 - - r_hls = model_hls.predict(X) - mismatch_ph = r_hls != r_keras - assert np.all( - r_hls == r_keras - ), f"Proxy-HLS4ML mismatch for out: {np.sum(np.any(mismatch_ph,axis=1))} out of {len(X)} samples are different. Sample: {r_keras[mismatch_ph].ravel()[:5]} vs {r_hls[mismatch_ph].ravel()[:5]}" # noqa: From 1474ad1217651909edac0271bc62c33bb720506d Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 28 Jun 2024 13:48:45 -0700 Subject: [PATCH 104/272] naughty flake8 E231 --- hls4ml/backends/fpga/passes/hgq_proxy_model.py | 2 +- test/pytest/test_hgq_layers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hls4ml/backends/fpga/passes/hgq_proxy_model.py b/hls4ml/backends/fpga/passes/hgq_proxy_model.py index a58c9f43db..5ec1200ac7 100644 --- a/hls4ml/backends/fpga/passes/hgq_proxy_model.py +++ b/hls4ml/backends/fpga/passes/hgq_proxy_model.py @@ -33,7 +33,7 @@ def generate_mask_fn( if b == 0: fn = f'out[{idx}] = 0;' else: - fn = f'out[{idx}] = {to_fixed(k,b,i,RND,SAT)}(inp[{idx}]);' + fn = f'out[{idx}] = {to_fixed(k, b, i, RND, SAT)}(inp[{idx}]);' masks.append(f' {fn}') body = "\n".join(masks) mask_fn = f''' diff --git a/test/pytest/test_hgq_layers.py b/test/pytest/test_hgq_layers.py index b6d66f9d00..92a7ea1876 100644 --- a/test/pytest/test_hgq_layers.py +++ b/test/pytest/test_hgq_layers.py @@ -59,7 +59,7 @@ def _run_synth_match_test(proxy: keras.Model, data, io_type: str, backend: str, mismatch_ph = p != h assert ( np.sum(mismatch_ph) == 0 - ), f"Proxy-HLS4ML mismatch for out {i}: {np.sum(np.any(mismatch_ph,axis=1))} out of {data_len} samples are different. 
Sample: {p[mismatch_ph].ravel()[:5]} vs {h[mismatch_ph].ravel()[:5]}" # noqa: E501 + ), f"Proxy-HLS4ML mismatch for out {i}: {np.sum(np.any(mismatch_ph, axis=1))} out of {data_len} samples are different. Sample: {p[mismatch_ph].ravel()[:5]} vs {h[mismatch_ph].ravel()[:5]}" # noqa: E501 else: cond(p, h) except AssertionError as e: From b7664b7b422904d6cc73e4f9ce42f7edfecb2972 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Mon, 15 Jul 2024 19:16:51 -0700 Subject: [PATCH 105/272] less tests per ci job disassemble hgq hlayer tests, less tests per ci job undo useless test assemble --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a4aa6d507a..89535c1937 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -7,7 +7,7 @@ generator: stage: generate image: python:3.8-alpine variables: - N_TESTS_PER_YAML: 5 + N_TESTS_PER_YAML: 4 tags: - k8s-default before_script: From 4a356236dac8416da148bb053764b041a14d880e Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sun, 21 Jul 2024 18:43:29 -0700 Subject: [PATCH 106/272] Allow 'parallelization_factor' propagation from dense layer --- hls4ml/model/optimizer/passes/multi_dense.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hls4ml/model/optimizer/passes/multi_dense.py b/hls4ml/model/optimizer/passes/multi_dense.py index 2b303ea5a4..fadfdc19c4 100644 --- a/hls4ml/model/optimizer/passes/multi_dense.py +++ b/hls4ml/model/optimizer/passes/multi_dense.py @@ -27,6 +27,9 @@ def transform(self, model, node): 'bias_data': node.get_attr('bias_data'), } + if (pf := node.get_attr('parallelization_factor', None)) is not None: + pointwise_attrs['parallelization_factor'] = pf + if dim == 1: pointwise_attrs.update( { From 99eb3eb9d0baadd83ab12614b4f91344d5924078 Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Mon, 22 Jul 2024 15:01:39 -0400 Subject: [PATCH 107/272] add Vitis to pytorch API tests --- test/pytest/test_pytorch_api.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/test/pytest/test_pytorch_api.py b/test/pytest/test_pytorch_api.py index 9d67c2867d..4413ffa487 100644 --- a/test/pytest/test_pytorch_api.py +++ b/test/pytest/test_pytorch_api.py @@ -22,7 +22,7 @@ def forward(self, x): return self.linear(x) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_linear(backend, io_type): model = LinearModel() @@ -73,7 +73,7 @@ def test_linear(backend, io_type): nn.Threshold(threshold=1.0, value=0.0), ], ) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_activations(activation_function, backend, io_type): model = torch.nn.Sequential(nn.Linear(1, 1), activation_function).to() @@ -164,7 +164,7 @@ def forward(self, x): ThresholdModel(), ], ) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_activation_functionals(activation_function, backend, io_type): model = activation_function @@ -201,7 +201,7 @@ def test_activation_functionals(activation_function, backend, io_type): @pytest.mark.parametrize('padds', padds_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) 
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_conv1d(padds, backend, io_type): n_in = 2 @@ -242,7 +242,7 @@ def test_conv1d(padds, backend, io_type): if io_type == 'io_stream': # Vivado inserts and additional layer for 'same' padding in io_stream - if backend == "Vivado" and padds == 1: + if (backend == "Vivado" or backend == "Vitis") and padds == 1: assert nNodes == len(hls_model.get_layers()) else: assert nNodes - 1 == len(hls_model.get_layers()) @@ -269,13 +269,13 @@ def test_conv1d(padds, backend, io_type): # if not (backend == 'Vivado' and io_type == 'io_stream' and padds == 1): conv_index = 2 act_index = 3 - if io_type == "io_stream" and not (backend == "Vivado" and padds == 1): + if io_type == "io_stream" and not ((backend == "Vivado" or backend == "Vitis") and padds == 1): conv_index = 1 act_index = 2 assert list(hls_model.get_layers())[conv_index].attributes['name'] == convNode.name assert list(hls_model.get_layers())[conv_index].attributes['class_name'] == 'Conv1D' assert list(hls_model.get_layers())[act_index].attributes['activation'] == class_object_relu.__class__.__name__ - if io_type == "io_stream" and backend == "Vivado" and padds == 1: + if io_type == "io_stream" and (backend == "Vivado" or backend == "Vitis") and padds == 1: assert list(hls_model.get_layers())[conv_index].attributes["in_width"] == size_in + 2 else: assert list(hls_model.get_layers())[conv_index].attributes["in_width"] == size_in @@ -287,7 +287,7 @@ def test_conv1d(padds, backend, io_type): padding = 0 else: padding = 1 - if io_type == "io_stream" and backend == "Vivado" and padds == 1: + if io_type == "io_stream" and (backend == "Vivado" or backend == "Vitis") and padds == 1: padding = 1 padds = 0 @@ -311,7 +311,7 @@ def test_conv1d(padds, backend, io_type): @pytest.mark.parametrize('padds', padds_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_conv2d(padds, backend, io_type): n_in = 2 @@ -409,7 +409,7 @@ def test_conv2d(padds, backend, io_type): # results are not very good at the moment np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=0, atol=5e-2) - if not (backend == 'Vivado' and io_type == 'io_stream' and padds == 1): + if not ((backend == 'Vivado' or backend == 'Vitis') and io_type == 'io_stream' and padds == 1): # Vivado inserts and additional layer for 'same' padding in io_stream conv_index = 2 act_index = 3 @@ -464,7 +464,7 @@ def test_conv2d(padds, backend, io_type): @pytest.mark.parametrize('pooling', pooling_layers) @pytest.mark.parametrize('padds', padds_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_pooling(pooling, padds, backend): assert '1d' in pooling.__name__ or '2d' in pooling.__name__ @@ -588,7 +588,7 @@ def forward(self, x): return self.bn(x) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_bn(backend, io_type): model = BatchNormModel() @@ -631,7 +631,7 @@ def forward(self, x): return x -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', 
['io_parallel', 'io_stream']) def test_squeeze(backend, io_type): model = SqueezeModel() @@ -667,7 +667,7 @@ def test_squeeze(backend, io_type): assert list(hls_model.get_layers())[3].attributes['target_shape'] == [3] -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_flatten(backend): input = torch.randn(1, 1, 5, 5) model = nn.Sequential(nn.Conv2d(1, 32, 5, 1, 1), nn.Flatten(), nn.ReLU()) @@ -711,7 +711,7 @@ def forward(self, x): return x -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_skipped_layers(backend, io_type): model = ModelSkippedLayers() @@ -743,7 +743,7 @@ def test_skipped_layers(backend, io_type): np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=0, atol=5e-2) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel']) # Only io_parallel for now @pytest.mark.parametrize('tensor_rank', [2, 3]) def test_remove_transpose(backend, io_type, tensor_rank): From 3fc0302ce6c1ffa407fee772116182e23eb7b4ec Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 19:13:54 +0000 Subject: [PATCH 108/272] [pre-commit.ci] auto fixes from pre-commit hooks --- test/pytest/test_pytorch_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pytest/test_pytorch_api.py b/test/pytest/test_pytorch_api.py index 03f2a7dc8e..1c18c67b8e 100644 --- a/test/pytest/test_pytorch_api.py +++ b/test/pytest/test_pytorch_api.py @@ -409,7 +409,7 @@ def test_conv2d(padds, backend, io_type): # results are not very good at the moment np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=0, atol=5e-2) - if not ((backend == 'Vivado' or backend == 'Vitis') and io_type == 'io_stream' and padds == 1): + if not ((backend == 'Vivado' or backend == 'Vitis') and io_type == 'io_stream' and padds == 1): # Vivado inserts and additional layer for 'same' padding in io_stream conv_index = 2 act_index = 3 From 54d7a34c28da10b0e68e8c9f93b03954f7b49b40 Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Tue, 23 Jul 2024 12:46:02 -0400 Subject: [PATCH 109/272] addressing Jovan's comments --- hls4ml/backends/quartus/passes/recurrent_templates.py | 4 ++-- hls4ml/backends/vivado/passes/recurrent_templates.py | 2 +- test/pytest/test_pytorch_api.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hls4ml/backends/quartus/passes/recurrent_templates.py b/hls4ml/backends/quartus/passes/recurrent_templates.py index e4faceaf5a..82dcc54f5a 100644 --- a/hls4ml/backends/quartus/passes/recurrent_templates.py +++ b/hls4ml/backends/quartus/passes/recurrent_templates.py @@ -93,7 +93,7 @@ def format(self, node): params['config_mult_h'] = f'config{node.index}_h_mult' params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), str(node.index) + '_act') params['act_recurrent_t'] = '{}_config{}'.format(node.get_attr('recurrent_activation'), str(node.index) + '_rec_act') - params['pytorch'] = 'true' if "pytorch" in node.attributes.keys() else 'false' + params['pytorch'] = 'true' if node.get_attr('pytorch', False) else 'false' gru_config = self.gru_template.format(**params) # Activation is on candidate hidden state, dimensionality (1, n_units) 
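The `node.get_attr('pytorch', False)` rewrite above (and in the hunks that follow) matters when the attribute is stored but False: a membership test on `node.attributes.keys()` is truthy whenever the key exists, while `get_attr` respects the stored value. A minimal standalone sketch of the difference, using a plain dict as a stand-in for the node's attribute store (illustrative only, not the hls4ml API):

    # 'attributes' stands in for node.attributes; the key is present but False.
    attributes = {'pytorch': False}

    def get_attr(name, default=None):
        # Mirrors the get_attr pattern used in the patch: stored value if present, else default.
        return attributes.get(name, default)

    old_style = 'true' if 'pytorch' in attributes.keys() else 'false'  # 'true' (key exists)
    new_style = 'true' if get_attr('pytorch', False) else 'false'      # 'false' (value respected)

    assert (old_style, new_style) == ('true', 'false')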
@@ -306,7 +306,7 @@ def __init__(self): def format(self, node): params = self._default_function_params(node) - if "pytorch" in node.attributes.keys(): + if node.get_attr('pytorch', False): self.template = simple_rnn_pytorch_function_template params['weights'] = 'w{0}, wr{0}, b{0}, br{0}'.format(str(node.index)) else: diff --git a/hls4ml/backends/vivado/passes/recurrent_templates.py b/hls4ml/backends/vivado/passes/recurrent_templates.py index 3c550a6dff..adf95defdc 100644 --- a/hls4ml/backends/vivado/passes/recurrent_templates.py +++ b/hls4ml/backends/vivado/passes/recurrent_templates.py @@ -98,7 +98,7 @@ def format(self, node): params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), node.index) params['strategy'] = node.get_attr('strategy') params['static'] = 'true' if node.attributes['static'] else 'false' - params['pytorch'] = 'true' if "pytorch" in node.attributes.keys() else 'false' + params['pytorch'] = 'true' if node.get_attr('pytorch', False) else 'false' params['recr_type'] = node.class_name.lower() params['RECR_TYPE'] = node.class_name diff --git a/test/pytest/test_pytorch_api.py b/test/pytest/test_pytorch_api.py index 74b0cdce2b..f380718c32 100644 --- a/test/pytest/test_pytorch_api.py +++ b/test/pytest/test_pytorch_api.py @@ -846,7 +846,7 @@ def forward(self, x): # X_input is channels last X_input = np.ascontiguousarray(X_input.transpose(0, 2, 1)) - config = config_from_pytorch_model(model, inputs_channel_last=True, transpose_outputs=False) + config = config_from_pytorch_model(model, channels_last_conversion="internal", transpose_outputs=False) output_dir = str(test_root_path / f'hls4mlprj_pytorch_view_{backend}_{io_type}') hls_model = convert_from_pytorch_model( From ac7021756b3373df27a4accadff8e5441a3c4ee1 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 23 Jul 2024 19:35:08 -0500 Subject: [PATCH 110/272] clean up mult-dimensional dense --- hls4ml/backends/catapult/passes/pointwise.py | 5 -- hls4ml/backends/quartus/passes/pointwise.py | 5 -- hls4ml/backends/vivado/passes/pointwise.py | 5 -- hls4ml/model/optimizer/__init__.py | 2 +- hls4ml/model/optimizer/passes/multi_dense.py | 21 ++++---- test/pytest/test_multi_dense.py | 50 +++++++------------- 6 files changed, 27 insertions(+), 61 deletions(-) diff --git a/hls4ml/backends/catapult/passes/pointwise.py b/hls4ml/backends/catapult/passes/pointwise.py index 2dd982b5d4..0141d7f108 100755 --- a/hls4ml/backends/catapult/passes/pointwise.py +++ b/hls4ml/backends/catapult/passes/pointwise.py @@ -1,7 +1,5 @@ from copy import copy -import numpy as np - from hls4ml.backends.catapult.passes.convolution_templates import ( Conv1DConfigTemplate, Conv1DFunctionTemplate, @@ -78,9 +76,6 @@ def match(self, node): def transform(self, model, node): dim = node.__class__.__name__[-2:] # '1D' or '2D' pw_node = model.make_node('PointwiseConv' + dim, node.name, copy(node.attributes), node.inputs.copy()) - if len(node.weights['weight'].data.shape) == 2: # This can happen if we assign weights of Dense layer to 1x1 Conv2D - expand_axis = tuple(range(int(dim[0]))) - pw_node.weights['weight'].data = np.expand_dims(node.weights['weight'].data, axis=expand_axis) pw_node.weights['bias'].data = node.weights['bias'].data # Set strategy to ensure lowercase string is passed to the template if model.config.is_resource_strategy(pw_node): diff --git a/hls4ml/backends/quartus/passes/pointwise.py b/hls4ml/backends/quartus/passes/pointwise.py index 84ae79e495..0f7f6821ae 100644 --- a/hls4ml/backends/quartus/passes/pointwise.py +++ 
b/hls4ml/backends/quartus/passes/pointwise.py
@@ -1,7 +1,5 @@
 from copy import copy

-import numpy as np
-
 from hls4ml.backends.fpga.fpga_layers import PointwiseConv1D, PointwiseConv2D
 from hls4ml.backends.quartus.passes.convolution_templates import (
     Conv1DConfigTemplate,
@@ -86,9 +84,6 @@ def transform(self, model, node):
         pw_node = model.make_node(
             'PointwiseConv' + dim, node.name, copy(node.attributes), node.inputs.copy(), outputs=node.outputs.copy()
         )
-        if len(node.weights['weight'].data.shape) == 2:  # This can happen if we assign weights of Dense layer to 1x1 Conv2D
-            expand_axis = tuple(range(int(dim[0])))
-            pw_node.weights['weight'].data = np.expand_dims(node.weights['weight'].data, axis=expand_axis)
         pw_node.weights['bias'].data = node.weights['bias'].data

         model.replace_node(node, pw_node)
diff --git a/hls4ml/backends/vivado/passes/pointwise.py b/hls4ml/backends/vivado/passes/pointwise.py
index c353a10604..85d2635cb8 100644
--- a/hls4ml/backends/vivado/passes/pointwise.py
+++ b/hls4ml/backends/vivado/passes/pointwise.py
@@ -1,7 +1,5 @@
 from copy import copy

-import numpy as np
-
 from hls4ml.backends.fpga.fpga_layers import PointwiseConv1D, PointwiseConv2D
 from hls4ml.backends.vivado.passes.convolution_templates import (
     Conv1DConfigTemplate,
@@ -78,9 +76,6 @@ def match(self, node):
     def transform(self, model, node):
         dim = node.__class__.__name__[-2:]  # '1D' or '2D'
         pw_node = model.make_node('PointwiseConv' + dim, node.name, copy(node.attributes), node.inputs.copy())
-        if len(node.weights['weight'].data.shape) == 2:  # This can happen if we assign weights of Dense layer to 1x1 Conv2D
-            expand_axis = tuple(range(int(dim[0])))
-            pw_node.weights['weight'].data = np.expand_dims(node.weights['weight'].data, axis=expand_axis)
         pw_node.weights['bias'].data = node.weights['bias'].data
         # Set strategy to ensure lowercase string is passed to the template
         if model.config.is_resource_strategy(pw_node):
diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py
index 3aa247d03f..7edc0fc519 100644
--- a/hls4ml/model/optimizer/__init__.py
+++ b/hls4ml/model/optimizer/__init__.py
@@ -44,6 +44,7 @@
         'qkeras_factorize_alpha',
         'extract_ternary_threshold',
         'fuse_consecutive_batch_normalization',
+        'replace_multidimensional_dense_with_conv',
     ],
 )  # TODO Maybe not all QKeras optmizers belong here?
@@ -53,7 +54,6 @@
         'eliminate_linear_activation',
         'fuse_consecutive_batch_normalization',
         'fuse_batch_normalization',
-        'replace_multidimensional_dense_with_conv',
         'infer_precision_types',
         'set_precision_concat',
     ],
diff --git a/hls4ml/model/optimizer/passes/multi_dense.py b/hls4ml/model/optimizer/passes/multi_dense.py
index 2b303ea5a4..0d16cb4c76 100644
--- a/hls4ml/model/optimizer/passes/multi_dense.py
+++ b/hls4ml/model/optimizer/passes/multi_dense.py
@@ -5,14 +5,14 @@


 class ReplaceMultidimensionalDenseWithConv(OptimizerPass):
+    """
+    This matches all multidimensional Dense layers and changes them to a convolution.
+    Note: the convolution may subsequently be changed to a pointwise convolution for
+    backends that implement special pointwise convolutions.
+ """ + def match(self, node): - return ( - isinstance(node, Dense) - and len(node.get_input_variable().shape) - sum(d == 1 for d in node.get_input_variable().shape) > 1 - ) - # The above sum checks for the number of dimensions in the Dense with size 1 - # The subtraction allows the check to only count the number of dimensions with non-1 size - # For example, this prevents matching for a Dense layer with shape (1,N) + return isinstance(node, Dense) and len(node.get_input_variable().shape) > 1 def transform(self, model, node): dim = len(node.get_input_variable().shape) - 1 @@ -23,7 +23,7 @@ def transform(self, model, node): 'padding': 'valid', 'n_chan': input_shape[-1], 'n_filt': node.get_attr('n_out'), - 'weight_data': node.get_attr('weight_data'), + 'weight_data': np.expand_dims(node.get_attr('weight_data'), axis=tuple(range(dim))), 'bias_data': node.get_attr('bias_data'), } @@ -58,11 +58,8 @@ def transform(self, model, node): else: raise Exception('Cannot replace Dense over {dim}D tensor with Conv{dim}D.'.format(dim=dim)) - class_name = 'PointwiseConv' + str(dim) + 'D' + class_name = 'Conv' + str(dim) + 'D' pw_node = model.make_node(class_name, node.name, pointwise_attrs, node.inputs.copy()) - if len(node.weights['weight'].data.shape) == 2: # This can happen if we assign weights of Dense layer to 1x1 Conv2D - pw_node.weights['weight'].data = np.expand_dims(node.weights['weight'].data, axis=tuple(range(dim))) - pw_node.weights['bias'].data = node.weights['bias'].data model.replace_node(node, pw_node) return True diff --git a/test/pytest/test_multi_dense.py b/test/pytest/test_multi_dense.py index 558ab2aece..43cadfe7b9 100644 --- a/test/pytest/test_multi_dense.py +++ b/test/pytest/test_multi_dense.py @@ -11,46 +11,32 @@ @pytest.mark.parametrize( - 'backend, io_type', + 'backend, strategy', [ - ('Quartus', 'io_parallel'), - ('Vivado', 'io_parallel'), - ('Vitis', 'io_parallel'), - ('Vivado', 'io_stream'), - ('Vivado', 'io_stream'), - ('Vitis', 'io_stream'), + ('Vitis', 'Latency'), + ('Vitis', 'Resource'), + ('Quartus', 'Resource'), + ('Catapult', 'Latency'), + ('Catapult', 'Resource'), ], ) -def test_multi_dense(backend, io_type): +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +@pytest.mark.parametrize('shape', [(4, 3), (4, 1), (2, 3, 2), (1, 3, 1)]) +def test_multi_dense(backend, strategy, io_type, shape): model = tf.keras.models.Sequential() - model.add( - Dense( - 4, - input_shape=( - 8, - 8, - ), - name='Dense', - use_bias=True, - kernel_initializer=tf.keras.initializers.RandomUniform(minval=1, maxval=10), - bias_initializer='zeros', - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - activation='relu', - ) - ) + model.add(Dense(7, input_shape=shape, activation='relu')) + model.add(Dense(2, activation='relu')) model.compile(optimizer='adam', loss='mse') - X_input = np.random.rand(100, 8, 8) + X_input = np.random.rand(100, *shape) + X_input = np.round(X_input * 2**10) * 2**-10 # make it an exact ap_fixed<16,6> keras_prediction = model.predict(X_input) - default_precision = 'ap_fixed<32, 16>' if backend in ['Vivado', 'Vitis'] else 'ac_fixed<32, 16, true>' - config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision) - output_dir = str(test_root_path / f'hls4mlprj_multi_dense_{backend}_{io_type}') + config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend=backend) + config['Model']['Strategy'] = strategy + shapestr = '_'.join(str(x) for x 
in shape) + output_dir = str(test_root_path / f'hls4mlprj_multi_dense_{backend}_{strategy}_{io_type}_{shapestr}') hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type @@ -61,5 +47,3 @@ def test_multi_dense(backend, io_type): hls_prediction = hls_model.predict(X_input).reshape(keras_prediction.shape) np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=1e-2, atol=0.01) - - assert list(hls_model.get_layers())[1].class_name == 'PointwiseConv1D' From d22a7f5670d7b673373190f0abf46b3149504583 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 24 Jul 2024 10:28:11 -0500 Subject: [PATCH 111/272] add back Vivado to tests, change pointwise to conv notation --- hls4ml/model/optimizer/passes/multi_dense.py | 10 +++++----- test/pytest/test_multi_dense.py | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/hls4ml/model/optimizer/passes/multi_dense.py b/hls4ml/model/optimizer/passes/multi_dense.py index 0d16cb4c76..23a4e24c4a 100644 --- a/hls4ml/model/optimizer/passes/multi_dense.py +++ b/hls4ml/model/optimizer/passes/multi_dense.py @@ -18,7 +18,7 @@ def transform(self, model, node): dim = len(node.get_input_variable().shape) - 1 input_shape = node.get_input_variable().shape - pointwise_attrs = { + conv_attrs = { 'data_format': 'channels_last', 'padding': 'valid', 'n_chan': input_shape[-1], @@ -28,7 +28,7 @@ def transform(self, model, node): } if dim == 1: - pointwise_attrs.update( + conv_attrs.update( { 'in_width': input_shape[0], 'out_width': input_shape[0], @@ -39,7 +39,7 @@ def transform(self, model, node): } ) elif dim == 2: - pointwise_attrs.update( + conv_attrs.update( { 'in_height': input_shape[0], 'in_width': input_shape[1], @@ -59,7 +59,7 @@ def transform(self, model, node): raise Exception('Cannot replace Dense over {dim}D tensor with Conv{dim}D.'.format(dim=dim)) class_name = 'Conv' + str(dim) + 'D' - pw_node = model.make_node(class_name, node.name, pointwise_attrs, node.inputs.copy()) - model.replace_node(node, pw_node) + conv_node = model.make_node(class_name, node.name, conv_attrs, node.inputs.copy()) + model.replace_node(node, conv_node) return True diff --git a/test/pytest/test_multi_dense.py b/test/pytest/test_multi_dense.py index 43cadfe7b9..e07dc119bd 100644 --- a/test/pytest/test_multi_dense.py +++ b/test/pytest/test_multi_dense.py @@ -13,6 +13,8 @@ @pytest.mark.parametrize( 'backend, strategy', [ + ('Vivado', 'Latency'), + ('Vivado', 'Resource'), ('Vitis', 'Latency'), ('Vitis', 'Resource'), ('Quartus', 'Resource'), From 253fabdcb0cf924d10c04845d3cc3b9ba1b8b79c Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 29 Jul 2024 11:25:17 -0700 Subject: [PATCH 112/272] Update keras-to-hls.sh --- test/keras-to-hls.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/keras-to-hls.sh b/test/keras-to-hls.sh index f8f63443dc..82de464da9 100755 --- a/test/keras-to-hls.sh +++ b/test/keras-to-hls.sh @@ -146,9 +146,9 @@ do fi # Write tarball echo "WriterConfig:" >> ${file} - echo " Namespace: None" >> ${file} - echo " WriteWeightsTxt: True" >> ${file} - echo " WriteTar: True" >> ${file} + echo " Namespace: null" >> ${file} + echo " WriteWeightsTxt: true" >> ${file} + echo " WriteTar: true" >> ${file} ${pycmd} ../scripts/hls4ml convert -c ${file} || exit 1 rm ${file} From d7acb61b22f2d79ceab33a8d9d53d9dc66dd2b22 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Jul 2024 20:12:37 
+0000 Subject: [PATCH 113/272] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/asottile/pyupgrade: v3.16.0 → v3.17.0](https://github.com/asottile/pyupgrade/compare/v3.16.0...v3.17.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f2f1be02cc..68fab57675 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,7 +30,7 @@ repos: args: ["--profile", "black", --line-length=125] - repo: https://github.com/asottile/pyupgrade - rev: v3.16.0 + rev: v3.17.0 hooks: - id: pyupgrade args: ["--py36-plus"] From 2f104310b849e2e6ae5f2ac7145ed95ccd2c3436 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 30 Jul 2024 17:47:09 -0500 Subject: [PATCH 114/272] update multi_dense to be compatible with upstream changes --- hls4ml/model/optimizer/passes/multi_dense.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/model/optimizer/passes/multi_dense.py b/hls4ml/model/optimizer/passes/multi_dense.py index 4cb840a030..008011bde2 100644 --- a/hls4ml/model/optimizer/passes/multi_dense.py +++ b/hls4ml/model/optimizer/passes/multi_dense.py @@ -28,7 +28,7 @@ def transform(self, model, node): } if (pf := node.get_attr('parallelization_factor', None)) is not None: - pointwise_attrs['parallelization_factor'] = pf + conv_attrs['parallelization_factor'] = pf if dim == 1: conv_attrs.update( From 1dd0f9e935186be4572bae43f0a7b2e2e7cd3b95 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 20:40:05 +0000 Subject: [PATCH 115/272] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 24.4.2 → 24.8.0](https://github.com/psf/black/compare/24.4.2...24.8.0) - [github.com/pycqa/flake8: 7.1.0 → 7.1.1](https://github.com/pycqa/flake8/compare/7.1.0...7.1.1) --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 68fab57675..1b3d872190 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ exclude: (^hls4ml\/templates\/(vivado|quartus)\/(ap_types|ac_types)\/|^test/pyte repos: - repo: https://github.com/psf/black - rev: 24.4.2 + rev: 24.8.0 hooks: - id: black language_version: python3 @@ -41,7 +41,7 @@ repos: - id: setup-cfg-fmt - repo: https://github.com/pycqa/flake8 - rev: 7.1.0 + rev: 7.1.1 hooks: - id: flake8 exclude: docs/conf.py From fd26eeac4e3bbc40c674111c197ba83b3a5a6a05 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Tue, 4 Jun 2024 16:10:00 -0700 Subject: [PATCH 116/272] rm slow mnist training in test --- test/pytest/test_cnn_mnist.py | 94 ----------------------------------- 1 file changed, 94 deletions(-) delete mode 100644 test/pytest/test_cnn_mnist.py diff --git a/test/pytest/test_cnn_mnist.py b/test/pytest/test_cnn_mnist.py deleted file mode 100644 index 2b0a53014a..0000000000 --- a/test/pytest/test_cnn_mnist.py +++ /dev/null @@ -1,94 +0,0 @@ -from pathlib import Path - -import numpy as np -import pytest -from sklearn.metrics import accuracy_score -from tensorflow.keras.datasets import mnist -from tensorflow.keras.layers import Activation, AveragePooling2D, Conv2D, Dense, Flatten, MaxPooling2D -from tensorflow.keras.models import Sequential -from tensorflow.keras.utils 
import to_categorical - -import hls4ml - -test_root_path = Path(__file__).parent - - -@pytest.fixture(scope='module') -def mnist_data(): - (x_train, y_train), (x_test, y_test) = mnist.load_data() - x_train = x_train.astype("float32") / 255.0 - x_test = x_test.astype("float32") / 255.0 - x_train = np.expand_dims(x_train, -1) - x_test = np.expand_dims(x_test, -1) - y_train = to_categorical(y_train, 10) - y_test = to_categorical(y_test, 10) - x_test, y_test = x_test[:1000], y_test[:1000] - return x_train, y_train, x_test, y_test - - -@pytest.fixture(scope='module') -def keras_model(mnist_data): - # Aim of this model is to test different CNN paramaters, including: - # The common filter sizes, 3x3 and 5x5 - # A non-power of 2 number of filters - # Both Average and Max Pooling - # Both Same and Valid Padding - x_train, y_train, x_test, y_test = mnist_data - keras_model = Sequential() - keras_model.add(Conv2D(4, (3, 3), input_shape=(28, 28, 1), padding='same')) - keras_model.add(Activation('relu')) - keras_model.add(MaxPooling2D(name='max_pooling2d')) - keras_model.add(Conv2D(6, (5, 5), padding='valid')) - keras_model.add(Activation('relu')) - keras_model.add(AveragePooling2D(name='average_pooling2d')) - keras_model.add(Flatten()) - keras_model.add(Dense(10, kernel_initializer='lecun_uniform')) - keras_model.add(Activation('softmax', name='softmax')) - keras_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) - keras_model.fit(x_train, y_train, batch_size=32, epochs=5, verbose=0) - return keras_model - - -@pytest.mark.parametrize( - 'backend,io_type,strategy', - [ - ('Quartus', 'io_parallel', 'resource'), - ('Quartus', 'io_stream', 'resource'), - ('Vivado', 'io_parallel', 'resource'), - ('Vivado', 'io_parallel', 'latency'), - ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource'), - ('Vitis', 'io_parallel', 'resource'), - ('Vitis', 'io_parallel', 'latency'), - ('Vitis', 'io_stream', 'latency'), - ('Vitis', 'io_stream', 'resource'), - ], -) -def test_mnist_cnn(keras_model, mnist_data, backend, io_type, strategy): - x_train, y_train, x_test, y_test = mnist_data - - hls_config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name', backend=backend) - hls_config['Model']['Strategy'] = strategy - hls_config['LayerName']['average_pooling2d']['Precision']['accum'] = 'auto' - hls_config['LayerName']['max_pooling2d']['Precision']['result'] = 'auto' - hls_config['LayerName']['softmax']['Implementation'] = 'stable' - output_dir = str(test_root_path / f'hls4mlprj_cnn_mnist_{backend}_{io_type}_{strategy}') - - hls_model = hls4ml.converters.convert_from_keras_model( - keras_model, hls_config=hls_config, output_dir=output_dir, backend=backend, io_type=io_type - ) - hls_model.compile() - - # Model under test predictions and accuracy - y_keras = keras_model.predict(x_test) - y_hls4ml = hls_model.predict(x_test) - - acc_keras = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_keras, axis=1)) - acc_hls4ml = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_hls4ml, axis=1)) - rel_diff = abs(acc_keras - acc_hls4ml) / acc_keras - - print(f'Accuracy keras: {acc_keras}') - print(f'Accuracy hls4ml: {acc_hls4ml}') - print(f'Relative difference: {rel_diff}') - - assert acc_keras > 0.95 and rel_diff < 0.03 From 63acf34569b669a3780b796dbc83c8eadb5a4ff3 Mon Sep 17 00:00:00 2001 From: sei-rquartiano <143530648+sei-rquartiano@users.noreply.github.com> Date: Sun, 18 Aug 2024 09:14:34 -0400 Subject: [PATCH 117/272] Bug Fix for Operand Shape Mismatch in 
BatchNorm Fusion (PyTorch) (#1045) * fixed operand dimension mismatch error in bn_fuse.py * moved test file to test/ * added channels_last_conversion to config * updating PR to match contribution guidelines. pre-commit has been run and test case has been moved from standalone file to existing pytests * fix shape of bias tensor in pytorch if zero, add additional batchnorm tests * consistent uses of ' * reverting changes in test_batchnorm_pytorch.py from merge * reverting changes in test_batchnorm_pytorch.py from merge --------- Co-authored-by: Jan-Frederik Schulte --- hls4ml/model/layers.py | 8 ++- test/pytest/test_batchnorm_pytorch.py | 90 ++++++++++++++++++++++++++- 2 files changed, 96 insertions(+), 2 deletions(-) diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index d1ae2b3893..d972787164 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -258,7 +258,13 @@ def add_bias(self, quantizer=None): precision = None type_name = None if data is None: - data = np.zeros(self.get_output_variable().shape[-1]) + if 'data_format' in self.attributes: + if self.attributes['data_format'] == 'channels_first': + data = np.zeros(self.get_output_variable().shape[0]) + elif self.attributes['data_format'] == 'channels_last': + data = np.zeros(self.get_output_variable().shape[-1]) + else: + data = np.zeros(self.get_output_variable().shape[-1]) precision = IntegerPrecisionType(width=1, signed=False) type_name = 'bias{index}_t' quantizer = None # Don't quantize non-existant bias diff --git a/test/pytest/test_batchnorm_pytorch.py b/test/pytest/test_batchnorm_pytorch.py index 93cda2729c..b2ef2f79b9 100644 --- a/test/pytest/test_batchnorm_pytorch.py +++ b/test/pytest/test_batchnorm_pytorch.py @@ -13,13 +13,22 @@ atol = 5e-3 -@pytest.fixture(scope='module') +@pytest.fixture def data(): np.random.seed(0) X = np.random.rand(100, in_shape) return X +@pytest.fixture(scope='module') +def fusion_data(): + n_batch = 2 + n_in = 2 + size_in_height = 32 + X = np.random.rand(n_batch, n_in, size_in_height) + return X + + @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) @pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) def test_batchnorm(data, backend, io_type): @@ -41,3 +50,82 @@ def test_batchnorm(data, backend, io_type): pytorch_prediction = model(torch.Tensor(data)).detach().numpy() hls_prediction = hls_model.predict(data) np.testing.assert_allclose(pytorch_prediction, hls_prediction, rtol=0, atol=atol, verbose=True) + + +atol = 5e-2 + + +class BatchNorm_w_Fusion(nn.Module): + def __init__(self, filters, momentum): + super().__init__() + self.conv1 = nn.Conv1d( + int(filters), + filters, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + self.bn1 = nn.BatchNorm1d(filters) + self.relu1 = nn.ReLU() + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu1(x) + return x + + +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) +def test_batchnorm_fusion(fusion_data, backend, io_type): + n_in = 2 + momentum = 0.99 + size_in_height = 32 + filters = n_in + + # see above for model definition + model = BatchNorm_w_Fusion(filters, momentum) + # Important to set model to eval to fix batchnorm behavior + model.eval() + # generating config + pytorch_prediction = model(torch.Tensor(fusion_data)).detach().numpy() + + # We do not have an implementation of a transpose for io_stream, need to transpose inputs and outputs outside of hls4ml + 
if io_type == 'io_stream': + fusion_data = np.ascontiguousarray(fusion_data.transpose(0, 2, 1)) + config = hls4ml.utils.config_from_pytorch_model(model, channels_last_conversion='internal', transpose_outputs=False) + else: + config = hls4ml.utils.config_from_pytorch_model(model, channels_last_conversion='full', transpose_outputs=True) + + config['Model']['Strategy'] = 'Resource' + + default_precision = 'ac_fixed<32, 1, true>' if backend == 'Quartus' else 'ac_fixed<32, 1>' + + config['Model']['Precision'] = default_precision + + # conversion + output_dir = str(test_root_path / f'hls4mlprj_block_{backend}_{io_type}') + hls_model = hls4ml.converters.convert_from_pytorch_model( + model, + (None, n_in, size_in_height), + hls_config=config, + output_dir=output_dir, + backend=backend, + io_type=io_type, + ) + + # compiling model + hls_model.compile() + + if io_type == 'io_stream': + hls_prediction = np.transpose( + np.reshape( + hls_model.predict(fusion_data), + (pytorch_prediction.shape[0], pytorch_prediction.shape[2], pytorch_prediction.shape[1]), + ), + (0, 2, 1), + ) + else: + hls_prediction = np.reshape(hls_model.predict(fusion_data), pytorch_prediction.shape) + np.testing.assert_allclose(pytorch_prediction, hls_prediction, rtol=0, atol=atol, verbose=True) From f1c722515fe541eccab131643e23b7a39af4dfe2 Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Mon, 19 Aug 2024 15:10:43 -0400 Subject: [PATCH 118/272] remove precision settings that make pytest for batchnorm in pytorch fail --- test/pytest/test_batchnorm_pytorch.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/pytest/test_batchnorm_pytorch.py b/test/pytest/test_batchnorm_pytorch.py index b2ef2f79b9..1e45e7ae0f 100644 --- a/test/pytest/test_batchnorm_pytorch.py +++ b/test/pytest/test_batchnorm_pytorch.py @@ -100,10 +100,6 @@ def test_batchnorm_fusion(fusion_data, backend, io_type): config['Model']['Strategy'] = 'Resource' - default_precision = 'ac_fixed<32, 1, true>' if backend == 'Quartus' else 'ac_fixed<32, 1>' - - config['Model']['Precision'] = default_precision - # conversion output_dir = str(test_root_path / f'hls4mlprj_block_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_pytorch_model( From 9ab6a2e5c2de020154493d9f32270277d1e69b0e Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 20 Aug 2024 17:22:09 -0500 Subject: [PATCH 119/272] fix pre-commit errors --- hls4ml/model/graph.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 68b8c74a5d..d0a1fdf7fc 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -198,7 +198,6 @@ def get_compression(self, layer): return compression - def parse_name_config(self, layer_name, layer_cfg): """This is used by _parse_hls_config below, but also in optimizers when a new layer config is created""" precision_cfg = layer_cfg.get('Precision') @@ -228,11 +227,9 @@ def parse_name_config(self, layer_name, layer_cfg): if compression is not None: self.layer_name_compression[layer_name.lower()] = bool(compression) - def get_writer_config(self): return self.writer_config - def _parse_hls_config(self): hls_config = self.config['HLSConfig'] From ef02b4f4a45ae4c032d8ea49fc9854e8d4de7bc7 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 21 Aug 2024 11:15:05 -0500 Subject: [PATCH 120/272] move multi_dense to conv above inferming precision types --- hls4ml/model/optimizer/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/model/optimizer/__init__.py 
b/hls4ml/model/optimizer/__init__.py index e311eb96cf..64be9903ad 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -63,6 +63,7 @@ 'eliminate_linear_activation', 'qkeras_factorize_alpha', 'extract_ternary_threshold', + 'replace_multidimensional_dense_with_conv', 'seperable_to_depthwise_and_conv', # The ones above here need to be before infer_precision_types 'infer_precision_types', @@ -74,7 +75,6 @@ 'expand_layer_group', 'output_rounding_saturation_mode', 'fuse_consecutive_batch_normalization', - 'replace_multidimensional_dense_with_conv', 'enforce_proxy_model_embedded_config', ], requires=['parse_qonnx'], From c3ffa7bf5fde0c54b4d514ff2a18c5c1228e9549 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 21 Aug 2024 11:38:40 -0500 Subject: [PATCH 121/272] fix the default reuse factor --- hls4ml/utils/config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py index 51e87244e4..78f033c28c 100644 --- a/hls4ml/utils/config.py +++ b/hls4ml/utils/config.py @@ -401,6 +401,8 @@ def make_layer_config(layer): precision_cfg[name] = 'auto' else: precision_cfg[name] = str(attr.default) + elif attr.name == 'reuse_factor': + layer_config[attr.config_name] = default_reuse_factor else: if attr.default is not None: layer_config[attr.config_name] = attr.default From 2ed0865032b3decea6af3a246c876ecc5cd2aa81 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 22 Aug 2024 19:04:57 +0200 Subject: [PATCH 122/272] Reorganize codegen of unrolled implementation --- hls4ml/backends/fpga/fpga_backend.py | 4 +- hls4ml/backends/fpga/passes/codegen.py | 230 +---------------- hls4ml/backends/vitis/passes/feature_check.py | 23 +- hls4ml/backends/vitis/vitis_backend.py | 3 +- .../vivado/passes/unrolled_codegen.py | 243 ++++++++++++++++++ hls4ml/backends/vivado/vivado_backend.py | 103 ++++++-- hls4ml/writer/vivado_writer.py | 17 +- 7 files changed, 359 insertions(+), 264 deletions(-) create mode 100644 hls4ml/backends/vivado/passes/unrolled_codegen.py diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 8d0ed64aad..ad8e917dd8 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -227,10 +227,12 @@ def get_closest_reuse_factor(self, valid_rf, chosen_rf): else: return before - def set_closest_reuse_factor(self, layer, n_in, n_out, attribute='reuse_factor'): + def set_closest_reuse_factor(self, layer, n_in, n_out, attribute='reuse_factor', include_max_rf=True): assert attribute is not None, 'Reuse factor attribute cannot be None' valid_rf = self.get_valid_reuse_factors(n_in, n_out) + if not include_max_rf: + valid_rf.pop() chosen_rf = layer.get_attr(attribute) if chosen_rf not in valid_rf: closest_rf = self.get_closest_reuse_factor(valid_rf, chosen_rf) diff --git a/hls4ml/backends/fpga/passes/codegen.py b/hls4ml/backends/fpga/passes/codegen.py index 3667680ed5..f1f1080996 100644 --- a/hls4ml/backends/fpga/passes/codegen.py +++ b/hls4ml/backends/fpga/passes/codegen.py @@ -1,8 +1,4 @@ -import math - -import numpy as np - -from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense +from hls4ml.model.layers import Conv1D, Conv2D from hls4ml.model.optimizer import OptimizerPass from hls4ml.model.types import Source @@ -53,227 +49,3 @@ def _generate_im2col_2d(self, node): ) node.set_attr('line_buffer_codegen', Source(code_str)) - - -class GenerateUnrolledDenseResource(OptimizerPass): - '''Generates C++ code for unrolled Dense resource''' - - 
def match(self, node): - # Only apply to layers use that use Dense Matrix Multiplication - # TODO - Extend (& test) for Separable Conv / Depthwise Conv / Recurrent layers - layers_with_dense = (Dense, Conv1D, Conv2D, LSTM, GRU) - - # Unrolled Dense mimicks the hardware implementation of Resource strategy -> apply after Resource optimizer - weights_transposed = node.get_attr('_weights_transposed', False) - - # RF = 1 will optimize DSPs anyway, so no need to unroll code - rf_gt_one = node.get_attr('reuse_factor', 1) > 1 - - # User requested unrolled implementation of Dense - is_unrolled = node.get_attr('strategy', 'latency') == 'unrolled' - - return isinstance(node, layers_with_dense) and weights_transposed and rf_gt_one and is_unrolled - - def transform(self, model, node): - if isinstance(node, (LSTM, GRU)): - n_in, n_out, n_in_recr, n_out_recr = node.model.config.backend.get_layer_mult_size(node) - - reuse_factor = node.get_attr('reuse_factor') - weights = node.weights['weight'] - code_str = self._generate_unrolled_function(n_in, n_out, reuse_factor, weights, str(node.index) + '_1') - node.set_attr('unrolled_dense_resource_codegen_1', Source(code_str)) - - recr_reuse_factor = node.get_attr('recurrent_reuse_factor') - recr_weights = node.weights['recurrent_weight'] - code_str = self._generate_unrolled_function( - n_in_recr, n_out_recr, recr_reuse_factor, recr_weights, str(node.index) + '_2' - ) - node.set_attr('unrolled_dense_resource_codegen_2', Source(code_str)) - - else: - n_in, n_out = node.model.config.backend.get_layer_mult_size(node) - reuse_factor = node.get_attr('reuse_factor') - weights = node.weights['weight'] - - code_str = self._generate_unrolled_function(n_in, n_out, reuse_factor, weights, node.index) - node.set_attr('unrolled_dense_resource_codegen', Source(code_str)) - - def _generate_unrolled_function(self, n_in, n_out, reuse_factor, weights, function_suffix): - """ - Generate a C++ function that mimics the Dense Resource implementation. - - The HLS compiler produces suboptimal designs for Dense Resource when the weights processed by the same DSP are zero. - Latency strategy can optimize zero multiplications - Resource strategy, on the other hand, cannot. 
- When all the weights in the same BRAM block are zero, Vivado is unable to optimize it - With this (and additional TCL scripts) zero BRAM are optimized - - Args: - node: Layer to generate code for - Returns: - generated_code: Generated C++ function (string) - """ - - # Variable instantiation and function pragmas - generated_code = ( - 'template\n' - 'class dense_unrolled_{suffix} : public DenseKernel {{\n' - ' public:\n' - ' static void dense(\n' - ' data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],\n' - ' typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],\n' - ' typename CONFIG_T::bias_t biases[CONFIG_T::n_out]\n' - ' ) {{\n' - ' #pragma HLS pipeline II=CONFIG_T::reuse_factor\n' - '\n' - ' constexpr int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);\n' - ' #pragma HLS function_instantiate variable=weights,biases\n' - ' #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor\n' - ' #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM\n' - ' #pragma HLS ARRAY_PARTITION variable=biases complete\n' - '\n' - ' typename CONFIG_T::accum_t acc[CONFIG_T::n_out];\n' - ' #pragma HLS ARRAY_PARTITION variable=acc complete\n' - '\n' - ' InitAccum:\n' - ' for (int i = 0; i < CONFIG_T::n_out; i++) {{\n' - ' #pragma HLS UNROLL\n' - ' acc[i] = (typename CONFIG_T::accum_t) biases[i];\n' - ' }}\n' - '\n' - ).format(suffix=function_suffix) - - # Unrolled multiplication, according to the three cases - if reuse_factor <= n_in: - mult_code = self._generate_unrolled_mult_code_rf_leq_nin(n_in, n_out, reuse_factor, weights) - elif reuse_factor > n_in and reuse_factor % n_in == 0: - mult_code = self._generate_unrolled_mult_code_rf_gt_nin_rem0(n_in, n_out, reuse_factor, weights) - else: - # This case shouldn't happen if my understanding of RF is correct - # The function fpga_backend._validate_reuse_factor() has assertion rf % n_in == 0 or rf < n_in - raise Exception('Not implemented...') - - # Write output - generated_code += mult_code + '\n' - generated_code += ( - ' Result:\n' - ' for (int i = 0; i < CONFIG_T::n_out; i++) {\n' - ' #pragma HLS UNROLL\n' - ' res[i] = cast(acc[i]);\n' - ' }\n' - ' }\n' - '};\n' - ) - - return generated_code - - def _generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, weights): - # Function constants - mult_factor = min(n_in, reuse_factor) - block_factor = int(math.ceil(n_in * n_out / reuse_factor)) - mult_limit = int(math.ceil(n_in * n_out / mult_factor)) - mult_scale = mult_limit // n_out - - # Zero DSPs are the DSP blocks that always have zero input - # In this case, it is the number of rows in the transposed and reshaped weight matrix - # The new shape is (parallel_mult, reuse_factor) - zeros = np.sum(~weights.data.reshape(block_factor, reuse_factor).any(1)) - - # Used to pad the code to make it human-readable - indent = ' ' - - # Generate unrolled multiplications - mult_code = f'{indent*2}#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n' - mult_code += f'{indent*2}MULT: {{\n' - mult_code += f'{indent*3}#pragma HLS protocol\n' - - for ir in range(reuse_factor): - acc_step = 0 - out_index = 0 - w_index = ir - in_index = ir - - mult_code += f'{indent*3}M{ir}: {{\n' - for _ in range(block_factor): - if weights.data.flatten()[w_index] != 0: - mult_code += ( - f'{indent*4}acc[{out_index}] += ' - 'static_cast' - '(CONFIG_T::template product::' - f'product(data[{in_index}], weights[{w_index}]));\n' - ) - - w_index += reuse_factor - in_index += reuse_factor - if 
in_index >= n_in: - in_index = ir - if acc_step + 1 >= mult_scale: - acc_step = 0 - out_index += 1 - else: - acc_step += 1 - - mult_code += f'{indent*3}}}\n' - - mult_code += f'{indent*2}}}\n' - - return mult_code - - def _generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor, weights): - # Function constants - mult_factor = min(n_in, reuse_factor) - block_factor = int(math.ceil(n_in * n_out / reuse_factor)) - mult_limit = int(math.ceil(n_in * n_out / mult_factor)) - - # Zero DSPs are the DSP blocks that always have zero input - # In this case, it is the number of rows in the transposed and reshaped weight matrix - # The new shape is (parallel_mult, reuse_factor) - zeros = np.sum(~weights.data.reshape(block_factor, reuse_factor).any(1)) - - # Used to pad the code to make it human-readable - indent = ' ' - - # Generate out indices - outidx = [0] * reuse_factor - outstep = 0 - outscale = reuse_factor // n_in - for ir in range(reuse_factor): - outidx[ir] = outstep - if (ir + 1) % n_in == 0: - outstep += 1 - - # Define variables - in_index = 0 - - # Generate unrolled multiplications - mult_code = f'{indent*2}#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n' - mult_code += f'{indent*2}MULT: {{\n' - mult_code += f'{indent*3}#pragma HLS protocol\n' - - for ir in range(reuse_factor): - w_index = ir - out_index = outidx[ir] - - mult_code += f'{indent*3}M{ir}: {{\n' - for _ in range(block_factor): - if weights.data.flatten()[w_index] != 0: - mult_code += ( - f'{indent*4}acc[{int(out_index)}] += ' - 'static_cast' - '(CONFIG_T::template product::' - f'product(data[{in_index}], weights[{w_index}]));\n' - ) - - w_index += reuse_factor - if w_index > n_in * n_out: - break - out_index += outscale - mult_code += f'{indent*3}}}\n' - - in_index += 1 - if in_index >= n_in: - in_index = 0 - - mult_code += f'{indent*2}}}\n' - - return mult_code diff --git a/hls4ml/backends/vitis/passes/feature_check.py b/hls4ml/backends/vitis/passes/feature_check.py index d7f9c2a7f5..7f0b832765 100644 --- a/hls4ml/backends/vitis/passes/feature_check.py +++ b/hls4ml/backends/vitis/passes/feature_check.py @@ -14,7 +14,7 @@ def transform(self, model, node): node.set_attr('implementation', 'linebuffer') -class ValidateStrategy(OptimizerPass): +class ValidateResourceStrategy(OptimizerPass): _resource_layer_cls = ['Conv1D', 'Conv2D', 'Dense'] def match(self, node): @@ -29,6 +29,23 @@ def transform(self, model, node): if rf > n_in and rf % n_in > 0: print( f'WARNING: "Resource" strategy in "{node.name}" ({node.class_name}) may have suboptimal QoR in Vitis ' - 'backend due to use of "urem" cores.\n' - 'Consider using a different ReuseFactor or switching to "Latency" strategy.' + 'backend due to use of "urem" cores in Vitis HLS <= 2022.1.\n' + 'Consider using a different ReuseFactor or switching to "Latency" strategy if using older versions ' + 'of Vitis HLS.' ) + + +class ValidateUnrolledStrategy(OptimizerPass): + _unrolled_layer_cls = ['Conv1D', 'Conv2D', 'Dense', 'GRU', 'LSTM'] + + def match(self, node): + is_unrolled_layer = len([layer_cls for layer_cls in self._unrolled_layer_cls if layer_cls in node.class_name]) > 0 + is_unrolled_strategy = node.get_attr('strategy', 'latency').lower() == 'unrolled' + + return is_unrolled_layer and is_unrolled_strategy + + def transform(self, model, node): + print( + f'WARNING: "Unrolled" strategy in "{node.name}" ({node.class_name}) may have unexpected II in Vitis backend.\n' + 'Verify that the final design satisfies the latency/II constraints.' 
+        )
diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py
index 2a0616a198..6e9cbbb10c 100644
--- a/hls4ml/backends/vitis/vitis_backend.py
+++ b/hls4ml/backends/vitis/vitis_backend.py
@@ -15,7 +15,8 @@ def __init__(self):
     def _register_flows(self):
         validation_passes = [
             'vitis:validate_conv_implementation',
-            'vitis:validate_strategy',
+            'vitis:validate_resource_strategy',
+            'vitis:validate_unrolled_strategy',
         ]
         validation_flow = register_flow('validation', validation_passes, requires=['vivado:init_layers'], backend=self.name)
diff --git a/hls4ml/backends/vivado/passes/unrolled_codegen.py b/hls4ml/backends/vivado/passes/unrolled_codegen.py
new file mode 100644
index 0000000000..6fd6c584af
--- /dev/null
+++ b/hls4ml/backends/vivado/passes/unrolled_codegen.py
@@ -0,0 +1,243 @@
+import math
+
+import numpy as np
+
+from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense
+from hls4ml.model.optimizer import OptimizerPass
+from hls4ml.model.types import Source
+
+
+class GenerateUnrolledDenseResource(OptimizerPass):
+    '''Generates C++ code for unrolled Dense resource'''
+
+    def match(self, node):
+        # Only apply to layers that use Dense Matrix Multiplication
+        # TODO - Extend (& test) for Separable Conv / Depthwise Conv / Recurrent layers
+        layers_with_dense = (Dense, Conv1D, Conv2D, LSTM, GRU)
+
+        # Unrolled Dense mimics the hardware implementation of Resource strategy -> apply after Resource optimizer
+        weights_transposed = node.get_attr('_weights_transposed', False)
+
+        # RF = 1 will optimize DSPs anyway, so no need to unroll code
+        rf_gt_one = node.get_attr('reuse_factor', 1) > 1
+
+        # User requested unrolled implementation of Dense
+        is_unrolled = node.get_attr('strategy', 'latency') == 'unrolled'
+
+        return isinstance(node, layers_with_dense) and weights_transposed and rf_gt_one and is_unrolled
+
+    def transform(self, model, node):
+        if isinstance(node, (LSTM, GRU)):
+            n_in, n_out, n_in_recr, n_out_recr = node.model.config.backend.get_layer_mult_size(node)
+
+            reuse_factor = node.get_attr('reuse_factor')
+            weights = node.weights['weight']
+            code_str = self._generate_unrolled_function(n_in, n_out, reuse_factor, weights, str(node.index) + '_1')
+            code_str = self._add_backend_specific_pragmas_to_generated_code(code_str, model.config.backend)
+            node.set_attr('unrolled_dense_resource_codegen_1', Source(code_str))
+
+            recr_reuse_factor = node.get_attr('recurrent_reuse_factor')
+            recr_weights = node.weights['recurrent_weight']
+            code_str = self._generate_unrolled_function(
+                n_in_recr, n_out_recr, recr_reuse_factor, recr_weights, str(node.index) + '_2'
+            )
+            code_str = self._add_backend_specific_pragmas_to_generated_code(code_str, model.config.backend)
+            node.set_attr('unrolled_dense_resource_codegen_2', Source(code_str))
+
+        else:
+            n_in, n_out = node.model.config.backend.get_layer_mult_size(node)
+            reuse_factor = node.get_attr('reuse_factor')
+            weights = node.weights['weight']
+
+            code_str = self._generate_unrolled_function(n_in, n_out, reuse_factor, weights, node.index)
+            code_str = self._add_backend_specific_pragmas_to_generated_code(code_str, model.config.backend)
+            node.set_attr('unrolled_dense_resource_codegen', Source(code_str))
+
+    def _generate_unrolled_function(self, n_in, n_out, reuse_factor, weights, function_suffix):
+        """
+        Generate a C++ function that mimics the Dense Resource implementation.
+
+        The HLS compiler produces suboptimal designs for Dense Resource when the weights processed by the same DSP are zero.
+ Latency strategy can optimize zero multiplications + Resource strategy, on the other hand, cannot. + When all the weights in the same BRAM block are zero, Vivado is unable to optimize it + With this (and additional TCL scripts) zero BRAM are optimized + + Args: + node: Layer to generate code for + Returns: + generated_code: Generated C++ function (string) + """ + + # Variable instantiation and function pragmas + generated_code = ( + 'template\n' + 'class dense_unrolled_{suffix} : public DenseKernel {{{{\n' + ' public:\n' + ' static void dense(\n' + ' data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],\n' + ' typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],\n' + ' typename CONFIG_T::bias_t biases[CONFIG_T::n_out]\n' + ' ) {{{{\n' + ' #pragma HLS pipeline II=CONFIG_T::reuse_factor\n' + '\n' + ' constexpr int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);\n' + ' #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor\n' + ' {{weights_resource_pragma}}\n' + ' #pragma HLS ARRAY_PARTITION variable=biases complete\n' + '\n' + ' typename CONFIG_T::accum_t acc[CONFIG_T::n_out];\n' + ' #pragma HLS ARRAY_PARTITION variable=acc complete\n' + '\n' + ' InitAccum:\n' + ' for (int i = 0; i < CONFIG_T::n_out; i++) {{{{\n' + ' #pragma HLS UNROLL\n' + ' acc[i] = (typename CONFIG_T::accum_t) biases[i];\n' + ' }}}}\n' + '\n' + ).format(suffix=function_suffix) + + # Unrolled multiplication, according to the three cases + if reuse_factor <= n_in: + mult_code = self._generate_unrolled_mult_code_rf_leq_nin(n_in, n_out, reuse_factor, weights) + elif reuse_factor > n_in and reuse_factor % n_in == 0: + mult_code = self._generate_unrolled_mult_code_rf_gt_nin_rem0(n_in, n_out, reuse_factor, weights) + else: + # This case shouldn't happen if my understanding of RF is correct + # The function fpga_backend._validate_reuse_factor() has assertion rf % n_in == 0 or rf < n_in + raise Exception('Not implemented...') + + # Write output + generated_code += mult_code + '\n' + generated_code += ( + ' Result:\n' + ' for (int i = 0; i < CONFIG_T::n_out; i++) {{\n' + ' #pragma HLS UNROLL\n' + ' res[i] = cast(acc[i]);\n' + ' }}\n' + ' }}\n' + '}};\n' + ) + + return generated_code + + def _generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, weights): + # Function constants + mult_factor = min(n_in, reuse_factor) + block_factor = int(math.ceil(n_in * n_out / reuse_factor)) + mult_limit = int(math.ceil(n_in * n_out / mult_factor)) + mult_scale = mult_limit // n_out + + # Zero DSPs are the DSP blocks that always have zero input + # In this case, it is the number of rows in the transposed and reshaped weight matrix + # The new shape is (parallel_mult, reuse_factor) + zeros = np.sum(~weights.data.reshape(block_factor, reuse_factor).any(1)) + + # Used to pad the code to make it human-readable + indent = ' ' + + # Generate unrolled multiplications + mult_code = f'{indent*2}#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n' + mult_code += f'{indent*2}MULT: {{{{\n' + + for ir in range(reuse_factor): + acc_step = 0 + out_index = 0 + w_index = ir + in_index = ir + + mult_code += f'{indent*3}M{ir}: {{{{\n' + for _ in range(block_factor): + if weights.data.flatten()[w_index] != 0: + mult_code += ( + f'{indent*4}acc[{out_index}] += ' + 'static_cast' + '(CONFIG_T::template product::' + f'product(data[{in_index}], weights[{w_index}]));\n' + ) + + w_index += reuse_factor + in_index += reuse_factor + if in_index >= n_in: + 
in_index = ir + if acc_step + 1 >= mult_scale: + acc_step = 0 + out_index += 1 + else: + acc_step += 1 + + mult_code += f'{indent*3}}}}}\n' + + mult_code += f'{indent*2}}}}}\n' + + return mult_code + + def _generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor, weights): + # Function constants + mult_factor = min(n_in, reuse_factor) + block_factor = int(math.ceil(n_in * n_out / reuse_factor)) + mult_limit = int(math.ceil(n_in * n_out / mult_factor)) + + # Zero DSPs are the DSP blocks that always have zero input + # In this case, it is the number of rows in the transposed and reshaped weight matrix + # The new shape is (parallel_mult, reuse_factor) + zeros = np.sum(~weights.data.reshape(block_factor, reuse_factor).any(1)) + + # Used to pad the code to make it human-readable + indent = ' ' + + # Generate out indices + outidx = [0] * reuse_factor + outstep = 0 + outscale = reuse_factor // n_in + for ir in range(reuse_factor): + outidx[ir] = outstep + if (ir + 1) % n_in == 0: + outstep += 1 + + # Define variables + in_index = 0 + + # Generate unrolled multiplications + mult_code = f'{indent*2}#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n' + mult_code += f'{indent*2}MULT: {{{{\n' + + for ir in range(reuse_factor): + w_index = ir + out_index = outidx[ir] + + mult_code += f'{indent*3}M{ir}: {{{{\n' + for _ in range(block_factor): + if weights.data.flatten()[w_index] != 0: + mult_code += ( + f'{indent*4}acc[{int(out_index)}] += ' + 'static_cast' + '(CONFIG_T::template product::' + f'product(data[{in_index}], weights[{w_index}]));\n' + ) + + w_index += reuse_factor + if w_index > n_in * n_out: + break + out_index += outscale + mult_code += f'{indent*3}}}}}\n' + + in_index += 1 + if in_index >= n_in: + in_index = 0 + + mult_code += f'{indent*2}}}}}\n' + + return mult_code + + def _add_backend_specific_pragmas_to_generated_code(self, code, backend): + if backend.name == 'Vivado': + weights_resource_pragma = '#pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM' + elif backend.name == 'Vitis': + weights_resource_pragma = '#pragma HLS BIND_STORAGE variable=weights type=ROM_NP impl=BRAM' + else: + raise Exception(f'Unexpected backend {backend.name} in GenerateUnrolledDenseResource optimizer.') + + code = code.format(weights_resource_pragma=weights_resource_pragma) + + return code diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 6c5deccc68..834dec9d5e 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -250,11 +250,22 @@ def init_dense(self, layer): index_t = layer.get_weights('weight').type.index_precision else: layer.set_attr('strategy', 'resource') - elif layer.model.config.get_strategy(layer).lower() == 'unrolled' and layer.get_attr('reuse_factor', 1) > 1: + elif layer.model.config.get_strategy(layer).lower() == 'unrolled': + use_resource_instead = False + if layer.get_attr('reuse_factor', 1) == 1: + print( + f'Unrolled strategy cannot be combined with reuse factor 1 in layer "{layer.name}". ' + 'Using "resource" strategy instead.' 
+                )
+                use_resource_instead = True
             n_in, n_out = self.get_layer_mult_size(layer)
             self.set_target_reuse_factor(layer)
-            self.set_closest_reuse_factor(layer, n_in, n_out)
-            layer.set_attr('strategy', 'unrolled')
+            if use_resource_instead:
+                self.set_closest_reuse_factor(layer, n_in, n_out)
+                layer.set_attr('strategy', 'resource')
+            else:
+                self.set_closest_reuse_factor(layer, n_in, n_out, include_max_rf=False)
+                layer.set_attr('strategy', 'unrolled')
         else:
             layer.set_attr('strategy', 'latency')
         layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', index_t))
@@ -271,11 +282,28 @@ def init_conv1d(self, layer):
             n_in, n_out = self.get_layer_mult_size(layer)
             self.set_target_reuse_factor(layer)
             self.set_closest_reuse_factor(layer, n_in, n_out)
-        elif layer.model.config.get_strategy(layer).lower() == 'unrolled' and layer.get_attr('reuse_factor', 1) > 1:
+        elif layer.model.config.get_strategy(layer).lower() == 'unrolled':
+            use_resource_instead = False
+            if layer.get_attr('reuse_factor', 1) == 1:
+                print(
+                    f'Unrolled strategy cannot be combined with reuse factor 1 in layer "{layer.name}". '
+                    'Using "resource" strategy instead.'
+                )
+                use_resource_instead = True
+            elif layer.model.config.get_config_value('IOType') == 'io_parallel':
+                print(
+                    f'Unrolled strategy cannot be combined with io_parallel in layer "{layer.name}". '
+                    'Using "resource" strategy instead.'
+                )
+                use_resource_instead = True
             n_in, n_out = self.get_layer_mult_size(layer)
             self.set_target_reuse_factor(layer)
-            self.set_closest_reuse_factor(layer, n_in, n_out)
-            layer.set_attr('strategy', 'unrolled')
+            if use_resource_instead:
+                self.set_closest_reuse_factor(layer, n_in, n_out)
+                layer.set_attr('strategy', 'resource')
+            else:
+                self.set_closest_reuse_factor(layer, n_in, n_out, include_max_rf=False)
+                layer.set_attr('strategy', 'unrolled')
         else:
             layer.set_attr('strategy', 'latency')
@@ -335,11 +363,28 @@ def init_conv2d(self, layer):
             self.set_target_reuse_factor(layer)
             n_in, n_out = self.get_layer_mult_size(layer)
             self.set_closest_reuse_factor(layer, n_in, n_out)
-        elif layer.model.config.get_strategy(layer).lower() == 'unrolled' and layer.get_attr('reuse_factor', 1) > 1:
+        elif layer.model.config.get_strategy(layer).lower() == 'unrolled':
+            use_resource_instead = False
+            if layer.get_attr('reuse_factor', 1) == 1:
+                print(
+                    f'Unrolled strategy cannot be combined with reuse factor 1 in layer "{layer.name}". '
+                    'Using "resource" strategy instead.'
+                )
+                use_resource_instead = True
+            elif layer.model.config.get_config_value('IOType') == 'io_parallel':
+                print(
+                    f'Unrolled strategy cannot be combined with io_parallel in layer "{layer.name}". '
+                    'Using "resource" strategy instead.'
+ ) + use_resource_instead = True n_in, n_out = self.get_layer_mult_size(layer) self.set_target_reuse_factor(layer) - self.set_closest_reuse_factor(layer, n_in, n_out) - layer.set_attr('strategy', 'unrolled') + if use_resource_instead: + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('strategy', 'resource') + else: + self.set_closest_reuse_factor(layer, n_in, n_out, include_max_rf=False) + layer.set_attr('strategy', 'unrolled') else: layer.set_attr('strategy', 'latency') @@ -459,11 +504,23 @@ def init_lstm(self, layer): self.set_closest_reuse_factor(layer, n_in, n_out) self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') layer.set_attr('strategy', 'resource') - elif layer.model.config.get_strategy(layer).lower() == 'unrolled' and layer.get_attr('reuse_factor', 1) > 1: + elif layer.model.config.get_strategy(layer).lower() == 'unrolled': + use_resource_instead = False + if layer.get_attr('reuse_factor', 1) == 1: + print( + f'Unrolled strategy cannot be combined with reuse factor 1 in layer "{layer.name}". ' + 'Using "resource" strategy instead.' + ) + use_resource_instead = True n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) - self.set_closest_reuse_factor(layer, n_in, n_out) - self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') - layer.set_attr('strategy', 'unrolled') + if use_resource_instead: + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('strategy', 'resource') + else: + self.set_closest_reuse_factor( + layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor', include_max_rf=False + ) + layer.set_attr('strategy', 'unrolled') else: layer.set_attr('strategy', 'latency') @@ -482,11 +539,23 @@ def init_gru(self, layer): self.set_closest_reuse_factor(layer, n_in, n_out) self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') layer.set_attr('strategy', 'resource') - elif layer.model.config.get_strategy(layer).lower() == 'unrolled' and layer.get_attr('reuse_factor', 1) > 1: + elif layer.model.config.get_strategy(layer).lower() == 'unrolled': + use_resource_instead = False + if layer.get_attr('reuse_factor', 1) == 1: + print( + f'Unrolled strategy cannot be combined with reuse factor 1 in layer "{layer.name}". ' + 'Using "resource" strategy instead.' + ) + use_resource_instead = True n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) - self.set_closest_reuse_factor(layer, n_in, n_out) - self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') - layer.set_attr('strategy', 'unrolled') + if use_resource_instead: + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('strategy', 'resource') + else: + self.set_closest_reuse_factor( + layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor', include_max_rf=False + ) + layer.set_attr('strategy', 'unrolled') else: layer.set_attr('strategy', 'latency') diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 4202ba9700..ab691912be 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -13,13 +13,6 @@ class VivadoWriter(Writer): - def __get_max_reuse_factor(self, model): - max_rf = 0 - for layer in model.get_layers(): - rf = int(layer.get_attr('reuse_factor')) - if rf > max_rf: - max_rf = rf - return max_rf def print_array_to_cpp(self, var, odir, write_txt_file=True): """Write a weights array to C++ header files. 
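
The same unrolled-to-resource fallback is now repeated in init_dense, init_conv1d, init_conv2d, init_lstm and init_gru above. A condensed sketch of the shared logic follows; the helper name is hypothetical (the patch keeps the copies inline), and the reading of include_max_rf is an assumption based on its name:

    def _resolve_unrolled_strategy(self, layer, n_in, n_out):
        if layer.get_attr('reuse_factor', 1) == 1:
            # nothing to unroll when every product is already computed in parallel
            self.set_closest_reuse_factor(layer, n_in, n_out)
            layer.set_attr('strategy', 'resource')
        else:
            # include_max_rf=False drops the largest valid reuse factor from the candidates
            self.set_closest_reuse_factor(layer, n_in, n_out, include_max_rf=False)
            layer.set_attr('strategy', 'unrolled')
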
@@ -181,11 +174,9 @@ def write_project_cpp(self, model): ) model_cfg = model.config.get_config_value('HLSConfig')['Model'] - if ( - 'DenseResourceImplementation' in model_cfg - and model_cfg['DenseResourceImplementation'].lower() == 'unrolled' - ): - newline += indent + f'#pragma HLS PIPELINE ii={self.__get_max_reuse_factor(model)} \n' + if model_cfg.get('Strategy', 'latency').lower() == 'unrolled': + max_rf = max([int(layer.get_attr('reuse_factor')) for layer in model.get_layers()]) + newline += indent + f'#pragma HLS PIPELINE II={max_rf} \n' else: if model.config.pipeline_style.lower() == 'dataflow': newline += indent + '#pragma HLS DATAFLOW \n' @@ -724,7 +715,7 @@ def write_tar(self, model): """ with tarfile.open(model.config.get_output_dir() + '.tar.gz', mode='w:gz') as archive: - archive.add(model.config.get_output_dir(), recursive=True) + archive.add(model.config.get_output_dir(), recursive=True, arcname='') def write_hls(self, model): print('Writing HLS project') From fbc4107948892d2332aac8e05349dc623a0d3fca Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 26 Aug 2024 00:54:10 +0200 Subject: [PATCH 123/272] Remove mentions of dense_resource_implementation --- hls4ml/backends/vivado/vivado_backend.py | 1 - hls4ml/model/graph.py | 20 -------------------- 2 files changed, 21 deletions(-) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index fb7377a655..b1e2ffddd8 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -292,7 +292,6 @@ def init_dense(self, layer): else: layer.set_attr('strategy', 'latency') layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', index_t)) - layer.set_attr('dense_resource_implementation', layer.model.config.get_dense_resource_implementation(layer).lower()) # TODO consolidate these functions into a single `init_conv` @layer_optimizer(Conv1D) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 0e2c3d33ae..d0a1fdf7fc 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -43,10 +43,6 @@ def __init__(self, config): self.layer_type_conv_implementation = {} self.layer_name_conv_implementation = {} - self.model_dense_resource_implementation = 'Standard' - self.layer_type_dense_resource_implementation = {} - self.layer_name_dense_resource_implementation = {} - self.model_compression = False self.layer_type_compression = {} self.layer_name_compression = {} @@ -190,17 +186,6 @@ def get_conv_implementation(self, layer): return conv_implementation - def get_dense_resource_implementation(self, layer): - dense_resource_implementation = self.layer_name_dense_resource_implementation.get(layer.name.lower()) - if dense_resource_implementation is None: - dense_resource_implementation = self.layer_type_dense_resource_implementation.get( - layer.__class__.__name__.lower() - ) - if dense_resource_implementation is None: - dense_resource_implementation = self.model_dense_resource_implementation - - return dense_resource_implementation - def is_resource_strategy(self, layer): return self.get_strategy(layer).lower() == 'resource' @@ -280,7 +265,6 @@ def _parse_hls_config(self): self.model_rf = model_cfg.get('ReuseFactor') self.model_targ_cycles = model_cfg.get('TargetCycles') self.model_conv_implementation = model_cfg.get('ConvImplementation', 'LineBuffer') - self.model_dense_resource_implementation = model_cfg.get('DenseResourceImplementation', 'Standard') self.model_strategy = model_cfg.get('Strategy', 'Latency') self.model_compression = 
bool(model_cfg.get('Compression', 0)) self.pipeline_style = model_cfg.get('PipelineStyle', 'pipeline') @@ -311,10 +295,6 @@ def _parse_hls_config(self): if conv_implementation is not None: self.layer_type_conv_implementation[layer_type.lower()] = conv_implementation - dense_resource_implementation = layer_cfg.get('DenseResourceImplementation') - if conv_implementation is not None: - self.layer_type_dense_resource_implementation[layer_type.lower()] = dense_resource_implementation - compression = layer_cfg.get('Compression') if compression is not None: self.layer_type_compression[layer_type.lower()] = bool(compression) From ecda5c946e6757b68579b590d09e9a9e6e0f3ac5 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 26 Aug 2024 01:02:13 +0200 Subject: [PATCH 124/272] Default to 'auto' for pipeline style and move check to an optimizer --- .../backends/vivado/passes/pipeline_style.py | 131 ++++++++++++++++++ hls4ml/backends/vivado/vivado_backend.py | 14 +- hls4ml/model/graph.py | 51 +------ hls4ml/writer/vivado_writer.py | 22 +-- test/pytest/test_dense_unrolled.py | 6 +- test/pytest/test_pipeline_style.py | 99 +++++++++++++ 6 files changed, 255 insertions(+), 68 deletions(-) create mode 100644 hls4ml/backends/vivado/passes/pipeline_style.py create mode 100755 test/pytest/test_pipeline_style.py diff --git a/hls4ml/backends/vivado/passes/pipeline_style.py b/hls4ml/backends/vivado/passes/pipeline_style.py new file mode 100644 index 0000000000..326745e455 --- /dev/null +++ b/hls4ml/backends/vivado/passes/pipeline_style.py @@ -0,0 +1,131 @@ +from hls4ml.model.layers import Conv1D, Conv2D +from hls4ml.model.optimizer import ModelOptimizerPass + + +class SetPipelineStyle(ModelOptimizerPass): + def __init__(self): + pass + + def transform(self, model): + if model.config.pipeline_style not in ['auto', 'pipeline', 'dataflow']: + print( + f'WARNING: Pipeline style set to {model.config.pipeline_style}, valid values: auto, pipeline, dataflow. ' + 'Using "auto".' 
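+                # a pipeline style of None would also fail this membership test and be normalized to 'auto'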
+ ) + self._set_pipeline_style(model, 'auto') + + if model.config.pipeline_style is None or model.config.pipeline_style == 'auto': + + if self._maybe_set_dataflow_io_stream(model): + return True + + if self._maybe_set_dataflow_conv_layers(model): + return True + + if self._maybe_set_dataflow_resource_strategy(model): + return True + + if self._maybe_set_pipeline_unrolled_strategy(model): + return True + + if self._maybe_set_pipeline_io_parallel(model): + return True + + self._set_safe_default_dataflow(model) + return True + else: + self._validate_hls_config(model) + + return False # No model changes made + + def _set_pipeline_style(self, model, pipeline_style): + # Could add logging here + model.config.pipeline_style = pipeline_style + + def _maybe_set_dataflow_io_stream(self, model): + if model.config.get_config_value('IOType') == 'io_stream': + self._set_pipeline_style(model, 'dataflow') + return True + + return False + + def _maybe_set_dataflow_conv_layers(self, model): + for layer in model.get_layers(): + if isinstance(layer, (Conv1D, Conv2D)): + self._set_pipeline_style(model, 'dataflow') + return True + + return False + + def _maybe_set_dataflow_resource_strategy(self, model): + for layer in model.get_layers(): + if model.config.is_resource_strategy(layer): + self._set_pipeline_style(model, 'dataflow') + return True + + return False + + def _maybe_set_pipeline_unrolled_strategy(self, model): + have_unrolled = False + for layer in model.get_layers(): + if model.config.get_strategy(layer).lower() == 'unrolled': + self._set_pipeline_style(model, 'pipeline') + have_unrolled = True + break + + if have_unrolled: + model.config.pipeline_ii = max([int(layer.get_attr('reuse_factor')) for layer in model.get_layers()]) + + return have_unrolled + + def _maybe_set_pipeline_io_parallel(self, model): + if model.config.get_config_value('IOType') == 'io_parallel': + self._set_pipeline_style(model, 'pipeline') + return True + + return False + + def _set_safe_default_dataflow(self, model): + print( + 'WARNING: Couldn\'t determine best pipeline style, defaulting to "DATAFLOW". ' + 'Use "PipelineStyle" property to override.' + ) + self._set_pipeline_style(model, 'dataflow') + + def _validate_hls_config(self, model): + if model.config.pipeline_style.lower() == 'pipeline': + if model.config.model_compression: + print('WARNING: Compression enabled while pipeline style set to "pipeline".') + if model.config.model_strategy.lower() == 'resource': + print( + 'WARNING: Model strategy "Resource" will lead to bad QoR in combination ' + 'with pipeline style set to "pipeline".' + ) + if any(isinstance(layer, (Conv1D, Conv2D)) for layer in model.get_layers()): + print('WARNING: Convolution layers require "dataflow" pipeline style.') + for layer_type, strategy in model.config.layer_type_strategy.items(): + if strategy.lower() == 'resource' and model.config.pipeline_style.lower() == 'pipeline': + print( + f'WARNING: Strategy for layer type {layer_type} set to "Resource", while pipeline style set to ' + '"pipeline". This will lead to bad QoR.' 
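+                    # the resource strategy reuses multipliers over several cycles, which conflicts
+                    # with a fully pipelined top-level function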
+ ) + + for layer_name, strategy in model.config.layer_name_strategy.items(): + if strategy.lower() == 'resource' and model.config.pipeline_style.lower() == 'pipeline': + print( + 'WARNING: Strategy for layer {} set to "Resource", while pipeline style set to "pipeline".'.format( + layer_name + ) + ) + + for layer_type, compression in model.config.layer_type_compression.items(): + if compression and model.config.pipeline_style.lower() == 'pipeline': + print( + 'WARNING: Compression enabled for layer type {}, while pipeline style set to "pipeline".'.format( + layer_type + ) + ) + + for layer_name, compression in model.config.layer_name_compression.items(): + if compression and model.config.pipeline_style.lower() == 'pipeline': + print(f'WARNING: Compression enabled for layer {layer_name}, while pipeline style set to "pipeline".') diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index b1e2ffddd8..17fd994598 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -114,6 +114,7 @@ def _register_flows(self): 'vivado:apply_resource_strategy', 'vivado:generate_conv_im2col', 'vivado:generate_unrolled_dense_resource', + 'vivado:set_pipeline_style', ] vivado_types_flow = register_flow('specific_types', vivado_types, requires=[init_flow], backend=self.name) @@ -247,11 +248,6 @@ def build( return parse_vivado_report(model.config.get_output_dir()) - def _validate_conv_strategy(self, layer): - if layer.model.config.pipeline_style.lower() != 'dataflow': - print(f'WARNING: Layer {layer.name} requires "dataflow" pipeline style. Switching to "dataflow" pipeline style.') - layer.model.config.pipeline_style = 'dataflow' - @layer_optimizer(Layer) def init_base_layer(self, layer): reuse_factor = layer.model.config.get_reuse_factor(layer) @@ -356,8 +352,6 @@ def init_conv1d(self, layer): layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) - self._validate_conv_strategy(layer) - @layer_optimizer(SeparableConv1D) def init_sepconv1d(self, layer): if layer.model.config.is_resource_strategy(layer): @@ -480,8 +474,6 @@ def init_conv2d(self, layer): layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) - self._validate_conv_strategy(layer) - @layer_optimizer(SeparableConv2D) def init_sepconv2d(self, layer): if layer.model.config.is_resource_strategy(layer): @@ -585,8 +577,10 @@ def init_lstm(self, layer): n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) if use_resource_instead: self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') layer.set_attr('strategy', 'resource') else: + self.set_closest_reuse_factor(layer, n_in, n_out, include_max_rf=False) self.set_closest_reuse_factor( layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor', include_max_rf=False ) @@ -617,8 +611,10 @@ def init_gru(self, layer): n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) if use_resource_instead: self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') layer.set_attr('strategy', 'resource') else: + self.set_closest_reuse_factor(layer, n_in, n_out, include_max_rf=False) self.set_closest_reuse_factor( layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor', include_max_rf=False ) diff --git a/hls4ml/model/graph.py 
b/hls4ml/model/graph.py index d0a1fdf7fc..609417f94a 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -49,7 +49,8 @@ def __init__(self, config): self.trace_output = self.get_config_value('TraceOutput', False) - self.pipeline_style = 'pipeline' + self.pipeline_style = 'auto' + self.pipeline_ii = None if 'WriterConfig' in self.config: self.writer_config = self.config['WriterConfig'] @@ -61,7 +62,6 @@ def __init__(self, config): } self._parse_hls_config() - self._validate_hls_config() def get_config_value(self, key, default=None): return self.config.get(key, default) @@ -267,7 +267,8 @@ def _parse_hls_config(self): self.model_conv_implementation = model_cfg.get('ConvImplementation', 'LineBuffer') self.model_strategy = model_cfg.get('Strategy', 'Latency') self.model_compression = bool(model_cfg.get('Compression', 0)) - self.pipeline_style = model_cfg.get('PipelineStyle', 'pipeline') + self.pipeline_style = model_cfg.get('PipelineStyle', 'auto') + self.pipeline_ii = model_cfg.get('PipelineInterval', None) layer_type_cfg = hls_config.get('LayerType') if layer_type_cfg is not None: @@ -304,50 +305,6 @@ def _parse_hls_config(self): for layer_name, layer_cfg in layer_name_cfg.items(): self.parse_name_config(layer_name, layer_cfg) - def _validate_hls_config(self): - use_dataflow = False - if self.pipeline_style.lower() == 'pipeline' and self.model_compression: - print('WARNING: Compression enabled while pipeline style set to "pipeline".') - use_dataflow = True - for layer_type, strategy in self.layer_type_strategy.items(): - if strategy.lower() == 'resource' and self.pipeline_style.lower() == 'pipeline': - print( - 'WARNING: Strategy for layer type {} set to "Resource", while pipeline style set to "pipeline".'.format( - layer_type - ) - ) - use_dataflow = True - - for layer_name, strategy in self.layer_name_strategy.items(): - if strategy.lower() == 'resource' and self.pipeline_style.lower() == 'pipeline': - print( - 'WARNING: Strategy for layer {} set to "Resource", while pipeline style set to "pipeline".'.format( - layer_name - ) - ) - use_dataflow = True - - for layer_type, compression in self.layer_type_compression.items(): - if compression and self.pipeline_style.lower() == 'pipeline': - print( - 'WARNING: Compression enabled for layer type {}, while pipeline style set to "pipeline".'.format( - layer_type - ) - ) - use_dataflow = True - - for layer_name, compression in self.layer_name_compression.items(): - if compression and self.pipeline_style.lower() == 'pipeline': - print(f'WARNING: Compression enabled for layer {layer_name}, while pipeline style set to "pipeline".') - use_dataflow = True - - if self.model_strategy.lower() == 'resource': - use_dataflow = True - - if use_dataflow: - print('WARNING: Changing pipeline style to "dataflow".') - self.pipeline_style = 'dataflow' - class ModelGraph: """The ModelGraph represents the network that is being processed by hls4ml. 
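
With the validation moved out of config parsing, pipeline style and interval become plain model-level configuration keys. A minimal usage sketch (key names taken from the parsing code above; the config dict is assumed to come from config_from_keras_model):

    config = hls4ml.utils.config_from_keras_model(model)
    config['Model']['PipelineStyle'] = 'pipeline'  # or 'auto' (the default) / 'dataflow'
    config['Model']['PipelineInterval'] = 4  # only takes effect with the 'pipeline' style
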
diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py
index fae3984e5f..1adee08093 100644
--- a/hls4ml/writer/vivado_writer.py
+++ b/hls4ml/writer/vivado_writer.py
@@ -199,7 +199,15 @@ def write_project_cpp(self, model):
                 all_inputs = [i.name for i in model_inputs]
                 all_outputs = [o.name for o in model_outputs]
                 all_brams = [b.name for b in model_brams]
-                io_type = model.config.get_config_value("IOType")
+                io_type = model.config.get_config_value('IOType')
+
+                pipeline_style = model.config.pipeline_style
+                pipeline_ii = model.config.pipeline_ii
+                pipeline_pragma = indent + f'#pragma HLS {pipeline_style.upper()}'
+                if pipeline_style == 'pipeline' and pipeline_ii is not None:
+                    pipeline_pragma += f' II={pipeline_ii}\n'
+                else:
+                    pipeline_pragma += '\n'

                 if io_type == 'io_parallel':
                     for i in model_inputs:
@@ -211,23 +219,15 @@
                     newline += indent + '#pragma HLS INTERFACE ap_vld port={},{} \n'.format(
                         ','.join(all_inputs), ','.join(all_outputs)
                     )
+                    newline += pipeline_pragma

-                    model_cfg = model.config.get_config_value('HLSConfig')['Model']
-                    if model_cfg.get('Strategy', 'latency').lower() == 'unrolled':
-                        max_rf = max([int(layer.get_attr('reuse_factor')) for layer in model.get_layers()])
-                        newline += indent + f'#pragma HLS PIPELINE II={max_rf} \n'
-                    else:
-                        if model.config.pipeline_style.lower() == 'dataflow':
-                            newline += indent + '#pragma HLS DATAFLOW \n'
-                        else:
-                            newline += indent + '#pragma HLS PIPELINE \n'
                 if io_type == 'io_stream':
                     newline += indent + '#pragma HLS INTERFACE axis port={},{} \n'.format(
                         ','.join(all_inputs), ','.join(all_outputs)
                     )
                     if all_brams:
                         newline += indent + '#pragma HLS INTERFACE bram port={} \n'.format(','.join(all_brams))
-                    newline += indent + '#pragma HLS DATAFLOW \n'
+                    newline += pipeline_pragma
             elif '// hls-fpga-machine-learning insert layers' in line:
                 newline = line + '\n'
diff --git a/test/pytest/test_dense_unrolled.py b/test/pytest/test_dense_unrolled.py
index 6b7503c543..69d024b84f 100644
--- a/test/pytest/test_dense_unrolled.py
+++ b/test/pytest/test_dense_unrolled.py
@@ -84,7 +84,7 @@ def test_dense_unrolled_streaming_conv(dim, io_type, reuse_factor):
 @pytest.mark.parametrize('backend', ['Vitis', 'Vivado'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 @pytest.mark.parametrize('static', [True, False])
-@pytest.mark.parametrize('reuse_factor', [1, 4, 32, 128])  # These should be enough
+@pytest.mark.parametrize('reuse_factor', [1, 4, 32, 128])  # RF=128 also tests if setting closest RF works well
 def test_rnn_unrolled(rnn_layer, backend, io_type, static, reuse_factor):
     # Subtract 0.5 to include negative values
     input_shape = (12, 8)
@@ -118,6 +118,10 @@ def test_rnn_unrolled(rnn_layer, backend, io_type, static, reuse_factor):
     hls_model = convert_from_keras_model(
         keras_model, hls_config=hls_config, output_dir=output_dir, backend=backend, io_type=io_type
     )
+
+    # Check that the strategy was not overridden
+    assert list(hls_model.get_layers())[1].get_attr('strategy') == ('unrolled' if reuse_factor > 1 else 'latency')
+
     hls_model.compile()

     keras_prediction = keras_model.predict(X)
diff --git a/test/pytest/test_pipeline_style.py b/test/pytest/test_pipeline_style.py
new file mode 100755
index 0000000000..f8706fa52c
--- /dev/null
+++ b/test/pytest/test_pipeline_style.py
@@ -0,0 +1,99 @@
+""" Test that pipeline style is properly handled by optimizers (respected if user-defined, correctly set if 'auto').
""" + +from pathlib import Path + +import pytest +import tensorflow as tf + +import hls4ml + +test_root_path = Path(__file__).parent + + +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) +@pytest.mark.parametrize( + 'param_group, pipeline_style, io_type, strategy, ii', + [ + (1, 'auto', 'io_stream', 'resource', None), # io_stream should result in DATAFLOW pragma regardless of other params + (2, 'auto', 'io_stream', 'latency', None), + (3, None, 'io_stream', 'unrolled', None), # None should be interpreted as 'auto' + (4, 'auto', 'io_parallel', 'resource', None), # Should end up with DATAFLOW pragma + (5, 'auto', 'io_parallel', 'latency', None), # Should end up with PIPELINE pragma + (6, 'auto', 'io_parallel', 'unrolled', None), # Should end up with PIPELINE pragma and II + (7, 'pipeline', 'io_stream', 'resource', None), # Should result in a warning + (8, 'pipeline', 'io_parallel', 'resource', None), # Should result in a warning + (9, 'pipeline', 'io_parallel', 'latency', None), # No warning + (10, 'pipeline', 'io_parallel', 'latency', 10), # No warning, should include II=10 + (11, 'dataflow', 'io_stream', 'latency', None), # No warning + (12, 'dataflow', 'io_parallel', 'latency', None), # No warning + (13, 'dataflow', 'io_parallel', 'latency', None), # No warning + (14, 'wrong', 'io_parallel', 'latency', None), # Incorrect settings should issue a warning and switch to 'auto' + (15, 'auto', 'io_parallel', 'resource', None), # Special case to test Conv layer. No warning + (16, 'pipeline', 'io_parallel', 'resource', None), # Special case to test Conv layer. Should result in two warnings + ], +) +def test_pipeline_style(capfd, backend, param_group, pipeline_style, io_type, strategy, ii): + def _check_top_hls_pragma(model, pragma, ii=None): + assert model.config.pipeline_style == pragma + + pragma_to_check = f'#pragma HLS {pragma.upper()}' + if ii is not None: + pragma_to_check += f' II={ii}' + + with open(model.config.get_output_dir() + '/firmware/myproject.cpp') as main_file: + contents = main_file.readlines() + for line in contents: + if pragma_to_check in line: + return True + + return False + + if param_group in [15, 16]: + model = tf.keras.models.Sequential([tf.keras.layers.Conv1D(8, 2, input_shape=(10, 4))]) + else: + model = tf.keras.models.Sequential([tf.keras.layers.Dense(8, input_shape=(10,))]) + + config = hls4ml.utils.config_from_keras_model(model) + if pipeline_style is not None: + config['Model']['PipelineStyle'] = pipeline_style + if ii is not None: + config['Model']['PipelineInterval'] = ii + config['Model']['Strategy'] = strategy + config['Model']['ReuseFactor'] = 2 + + prj_name = f'hls4mlprj_pipeline_style_{backend}_{param_group}' + output_dir = str(test_root_path / prj_name) + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend + ) + hls_model.write() + + captured_warnings = [line for line in capfd.readouterr().out.split('\n') if line.startswith('WARNING')] + + if param_group in [1, 2, 3, 4]: + assert _check_top_hls_pragma(hls_model, 'dataflow') + elif param_group == 5: + assert _check_top_hls_pragma(hls_model, 'pipeline') + elif param_group == 6: + assert _check_top_hls_pragma(hls_model, 'pipeline', ii=2) + elif param_group in [7, 8]: + assert _check_top_hls_pragma(hls_model, 'pipeline') + assert any('bad QoR' in warning for warning in captured_warnings) + elif param_group == 9: + assert _check_top_hls_pragma(hls_model, 'pipeline') + assert len(captured_warnings) == 0 + elif 
param_group == 10:
+        assert _check_top_hls_pragma(hls_model, 'pipeline', ii=ii)
+        assert len(captured_warnings) == 0
+    elif param_group in [11, 12, 13]:
+        assert _check_top_hls_pragma(hls_model, 'dataflow')
+        assert len(captured_warnings) == 0
+    elif param_group == 14:
+        assert _check_top_hls_pragma(hls_model, 'pipeline')
+        assert any('Using "auto"' in warning for warning in captured_warnings)
+    elif param_group == 15:
+        assert _check_top_hls_pragma(hls_model, 'dataflow')
+    elif param_group == 16:
+        assert _check_top_hls_pragma(hls_model, 'pipeline')
+        assert any('bad QoR' in warning for warning in captured_warnings)
+        assert any('Convolution' in warning for warning in captured_warnings)

From ce8431d51a8d96307cd1016c2eafd1150a33d498 Mon Sep 17 00:00:00 2001
From: Vladimir Loncar
Date: Mon, 26 Aug 2024 01:27:03 +0200
Subject: [PATCH 125/272] Pimp the docs a bit

---
 docs/advanced/model_optimization.rst | 4 ++--
 docs/api/configuration.rst           | 5 ++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/docs/advanced/model_optimization.rst b/docs/advanced/model_optimization.rst
index a75224b8cc..c1396b3d20 100644
--- a/docs/advanced/model_optimization.rst
+++ b/docs/advanced/model_optimization.rst
@@ -130,5 +130,5 @@ Note, to ensure DSPs are optimized, "unrolled" Dense multiplication must be used
 .. code-block:: Python

     hls_config = config_from_keras_model(optimized_model)
-    hls_config['Model']['DenseResourceImplementation'] = 'Unrolled'
-    # Any addition hls4ml config, such as strategy, reuse factor etc...
+    hls_config['Model']['Strategy'] = 'Unrolled'
+    # Any additional hls4ml config, reuse factor etc...
diff --git a/docs/api/configuration.rst b/docs/api/configuration.rst
index 091f88e619..9303e652c9 100644
--- a/docs/api/configuration.rst
+++ b/docs/api/configuration.rst
@@ -103,7 +103,10 @@ For Vivado backend the options are:

 * **IOType**\ : your options are ``io_parallel`` or ``io_stream`` which defines the type of data structure used for inputs, intermediate activations between layers, and outputs. For ``io_parallel``, arrays are used that, in principle, can be fully unrolled and are typically implemented in RAMs. For ``io_stream``, HLS streams are used, which are a more efficient/scalable mechanism to represent data that are produced and consumed in a sequential manner. Typically, HLS streams are implemented with FIFOs instead of RAMs. For more information see `here `__.
 * **HLSConfig**\: the detailed configuration of precision and parallelism, including:
   * **ReuseFactor**\ : in the case that you are pipelining, this defines the pipeline interval or initiation interval
-  * **Strategy**\ : Optimization strategy on FPGA, either "Latency" or "Resource". If none is supplied then hl4ml uses "Latency" as default. Note that a reuse factor larger than 1 should be specified when using "resource" strategy. An example of using larger reuse factor can be found `here. `__
+  * **ParallelizationFactor**\ : The number of output "pixels" to compute in parallel in convolutional layers. Increasing this parameter results in a significant increase in resources required on the FPGA.
+  * **Strategy**\ : Optimization strategy on FPGA, either "Latency", "Resource" or "Unrolled". If none is supplied then hls4ml uses "Latency" as default. Note that a reuse factor larger than 1 should be specified when using "resource" or "unrolled" strategy. An example of using larger reuse factor can be found `here. `__
+  * **PipelineStyle**\ : Set the top level pipeline style.
Valid options are "auto", "pipeline" and "dataflow". If unspecified, it defaults to "auto". + * **PipelineInterval**\ : Optionally override the desired initiation interval of the design. Only valid in combination with "pipeline" style. If unspecified, it is left to the compiler to decide, ideally matching the largest reuse factor of the network. * **Precision**\ : this defines the precsion of your inputs, outputs, weights and biases. It is denoted by ``ap_fixed``\ , where ``Y`` is the number of bits representing the signed number above the binary point (i.e. the integer part), and ``X`` is the total number of bits. Additionally, integers in fixed precision data type (\ ``ap_int``\ , where ``N`` is a bit-size from 1 to 1024) can also be used. You have a chance to further configure this more finely with per-layer configuration described below. From 2cb6fe1b433195280274a9e4aaf86cf3fbc75200 Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Tue, 27 Aug 2024 08:33:54 -0400 Subject: [PATCH 126/272] Add functionality to use granularity option also for pytorch models (#1051) * allow granularity options in pytorch parser * pre-commit * [pre-commit.ci] auto fixes from pre-commit hooks * add torch to setup? * add torch to setup2? * add torch to setup3? * add torch to requirements * fix failing pytest * adapat new batchnorm pytests to changes in interface * addressing comments from Vladimir and Jovan * remvoving torch from requirements --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- hls4ml/converters/__init__.py | 9 +-- hls4ml/converters/pytorch_to_hls.py | 44 ++++++++--- hls4ml/utils/config.py | 79 +++++++++++++++++++ test/pytest/test_backend_config.py | 33 +++++--- test/pytest/test_batchnorm_pytorch.py | 15 ++-- test/pytest/test_merge_pytorch.py | 8 +- test/pytest/test_pytorch_api.py | 79 ++++++++----------- test/pytest/test_recurrent_pytorch.py | 19 +++-- .../pytest/test_sequential_parsing_pytorch.py | 12 +-- test/pytest/test_upsampling_pytorch.py | 6 +- 10 files changed, 204 insertions(+), 100 deletions(-) diff --git a/hls4ml/converters/__init__.py b/hls4ml/converters/__init__.py index 3bd6d06c3b..092e53b3d3 100644 --- a/hls4ml/converters/__init__.py +++ b/hls4ml/converters/__init__.py @@ -10,6 +10,8 @@ from hls4ml.converters.keras_to_hls import get_supported_keras_layers # noqa: F401 from hls4ml.converters.keras_to_hls import parse_keras_model # noqa: F401 from hls4ml.converters.keras_to_hls import keras_to_hls, register_keras_layer_handler + +# from hls4ml.converters.pytorch_to_hls import parse_pytorch_model # noqa: F401 from hls4ml.model import ModelGraph from hls4ml.utils.config import create_config from hls4ml.utils.symbolic_utils import LUTFunction @@ -238,7 +240,6 @@ def convert_from_keras_model( def convert_from_pytorch_model( model, - input_shape, output_dir='my-hls-test', project_name='myproject', input_data_tb=None, @@ -251,7 +252,6 @@ def convert_from_pytorch_model( Args: model: PyTorch model to convert. - input_shape (list): The shape of the input tensor. First element is the batch size, needs to be None output_dir (str, optional): Output directory of the generated HLS project. Defaults to 'my-hls-test'. project_name (str, optional): Name of the HLS project. Defaults to 'myproject'. 
input_data_tb (str, optional): String representing the path of input data in .npy or .dat format that will be
@@ -293,7 +293,6 @@ def convert_from_pytorch_model(
     config = create_config(output_dir=output_dir, project_name=project_name, backend=backend, **kwargs)

     config['PytorchModel'] = model
-    config['InputShape'] = input_shape
     config['InputData'] = input_data_tb
     config['OutputPredictions'] = output_data_tb
     config['HLSConfig'] = {}
@@ -301,9 +300,9 @@ def convert_from_pytorch_model(
     if hls_config is None:
         hls_config = {}

-    model_config = hls_config.get('Model', None)
+    model_config = hls_config.get('Model')
     config['HLSConfig']['Model'] = _check_model_config(model_config)
-
+    config['InputShape'] = hls_config.get('InputShape')
     _check_hls_config(config, hls_config)

     return pytorch_to_hls(config)
diff --git a/hls4ml/converters/pytorch_to_hls.py b/hls4ml/converters/pytorch_to_hls.py
index bd483b3690..40336835a6 100644
--- a/hls4ml/converters/pytorch_to_hls.py
+++ b/hls4ml/converters/pytorch_to_hls.py
@@ -102,7 +102,7 @@ def decorator(function):
 # ----------------------------------------------------------------


-def pytorch_to_hls(config):
+def parse_pytorch_model(config, verbose=True):
     """Convert PyTorch model to hls4ml ModelGraph.

     Args:
@@ -118,14 +118,15 @@
     # This is a list of dictionaries to hold all the layer info we need to generate HLS
     layer_list = []

-    print('Interpreting Model ...')
-
+    if verbose:
+        print('Interpreting Model ...')
     reader = PyTorchFileReader(config) if isinstance(config['PytorchModel'], str) else PyTorchModelReader(config)
     if type(reader.input_shape) is tuple:
         input_shapes = [list(reader.input_shape)]
     else:
         input_shapes = list(reader.input_shape)
-    input_shapes = [list(shape) for shape in input_shapes]
+    # first element needs to be 'None' as placeholder for the batch size, insert it if not present
+    input_shapes = [[None] + list(shape) if shape[0] is not None else list(shape) for shape in input_shapes]

     model = reader.torch_model
@@ -151,7 +152,8 @@ def pytorch_to_hls(config):
     output_shape = None

     # Loop through layers
-    print('Topology:')
+    if verbose:
+        print('Topology:')
     layer_counter = 0

     n_inputs = 0
@@ -226,13 +228,14 @@
                 pytorch_class, layer_name, input_names, input_shapes, node, class_object, reader, config
             )

-            print(
-                'Layer name: {}, layer type: {}, input shape: {}'.format(
-                    layer['name'],
-                    layer['class_name'],
-                    input_shapes,
+            if verbose:
+                print(
+                    'Layer name: {}, layer type: {}, input shape: {}'.format(
+                        layer['name'],
+                        layer['class_name'],
+                        input_shapes,
+                    )
                 )
-            )
             layer_list.append(layer)

             assert output_shape is not None
@@ -288,7 +291,12 @@ def pytorch_to_hls(config):
                 operation, layer_name, input_names, input_shapes, node, None, reader, config
             )

-            print('Layer name: {}, layer type: {}, input shape: {}'.format(layer['name'], layer['class_name'], input_shapes))
+            if verbose:
+                print(
+                    'Layer name: {}, layer type: {}, input shape: {}'.format(
+                        layer['name'], layer['class_name'], input_shapes
+                    )
+                )
             layer_list.append(layer)

             assert output_shape is not None
@@ -342,7 +350,12 @@ def pytorch_to_hls(config):
                 operation, layer_name, input_names, input_shapes, node, None, reader, config
             )

-            print('Layer name: {}, layer type: {}, input shape: {}'.format(layer['name'], layer['class_name'], input_shapes))
+            if verbose:
+                print(
+                    'Layer name: {}, layer type: {}, input shape: {}'.format(
+                        layer['name'], layer['class_name'], input_shapes
+                    )
+                )
             layer_list.append(layer)

             assert output_shape is not None
@@ -351,6
+364,11 @@ def pytorch_to_hls(config): if len(input_layers) == 0: input_layers = None + return layer_list, input_layers + + +def pytorch_to_hls(config): + layer_list, input_layers = parse_pytorch_model(config) print('Creating HLS model') hls_model = ModelGraph(config, layer_list, inputs=input_layers) return hls_model diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py index 1a297787d6..6cba033de2 100644 --- a/hls4ml/utils/config.py +++ b/hls4ml/utils/config.py @@ -269,6 +269,7 @@ def make_layer_config(layer): def config_from_pytorch_model( model, + input_shape, granularity='model', backend=None, default_precision='ap_fixed<16,6>', @@ -284,6 +285,7 @@ def config_from_pytorch_model( Args: model: PyTorch model + input_shape (tuple or list of tuples): The shape of the input tensor, excluding the batch size. granularity (str, optional): Granularity of the created config. Defaults to 'model'. Can be set to 'model', 'type' and 'layer'. @@ -321,6 +323,83 @@ def config_from_pytorch_model( model_config['Strategy'] = 'Latency' config['Model'] = model_config + config['PytorchModel'] = model + if not (isinstance(input_shape, tuple) or (isinstance(input_shape, list) and isinstance(input_shape[0], tuple))): + raise Exception('Input shape must be tuple (single input) or list of tuples (multiple inputs)') + config['InputShape'] = input_shape + + if granularity.lower() not in ['model', 'type', 'name']: + raise Exception( + f'Invalid configuration granularity specified, expected "model", "type" or "name" got "{granularity}"' + ) + + if backend is not None: + backend = hls4ml.backends.get_backend(backend) + + from hls4ml.converters.pytorch_to_hls import parse_pytorch_model + + ( + layer_list, + _, + ) = parse_pytorch_model(config, verbose=False) + + def make_layer_config(layer): + cls_name = layer['class_name'] + if 'config' in layer.keys(): + if 'activation' in layer['config'].keys(): + if layer['config']['activation'] == 'softmax': + cls_name = 'Softmax' + + layer_cls = hls4ml.model.layers.layer_map[cls_name] + if backend is not None: + layer_cls = backend.create_layer_class(layer_cls) + + layer_config = {} + + config_attrs = [a for a in layer_cls.expected_attributes if a.configurable] + for attr in config_attrs: + if isinstance(attr, hls4ml.model.attributes.TypeAttribute): + precision_cfg = layer_config.setdefault('Precision', {}) + name = attr.name + if name.endswith('_t'): + name = name[:-2] + if attr.default is None: + precision_cfg[name] = default_precision + else: + precision_cfg[name] = str(attr.default) + elif attr.name == 'reuse_factor': + layer_config[attr.config_name] = default_reuse_factor + else: + if attr.default is not None: + layer_config[attr.config_name] = attr.default + + if layer['class_name'] == 'Input': + dtype = layer['config']['dtype'] + if dtype.startswith('int') or dtype.startswith('uint'): + typename = dtype[: dtype.index('int') + 3] + width = int(dtype[dtype.index('int') + 3 :]) + layer_config['Precision']['result'] = f'ap_{typename}<{width}>' + # elif bool, q[u]int, ... 
+
+        return layer_config
+
+    if granularity.lower() == 'type':
+        type_config = {}
+        for layer in layer_list:
+            if layer['class_name'] in type_config:
+                continue
+            layer_config = make_layer_config(layer)
+            type_config[layer['class_name']] = layer_config
+
+        config['LayerType'] = type_config
+
+    elif granularity.lower() == 'name':
+        name_config = {}
+        for layer in layer_list:
+            layer_config = make_layer_config(layer)
+            name_config[layer['name']] = layer_config
+
+        config['LayerName'] = name_config

     return config
diff --git a/test/pytest/test_backend_config.py b/test/pytest/test_backend_config.py
index 346402de13..c43a7c7680 100644
--- a/test/pytest/test_backend_config.py
+++ b/test/pytest/test_backend_config.py
@@ -31,7 +31,7 @@ def test_backend_config(framework, backend, part, clock_period, clock_unc):
         convert_fn = hls4ml.converters.convert_from_keras_model
     else:
         model = torch.nn.Sequential(torch.nn.Linear(1, 2), torch.nn.ReLU())
-        config = hls4ml.utils.config_from_pytorch_model(model)
+        config = hls4ml.utils.config_from_pytorch_model(model, input_shape=(None, 1))
         convert_fn = hls4ml.converters.convert_from_pytorch_model
@@ -42,16 +42,27 @@ def test_backend_config(framework, backend, part, clock_period, clock_unc):
     test_dir = f'hls4mlprj_backend_config_{framework}_{backend}_part_{part}_period_{clock_period}_unc_{unc_str}'
     output_dir = test_root_path / test_dir

-    hls_model = convert_fn(
-        model,
-        input_shape=(None, 1),  # This serves as a test of handling unexpected values by the backend in keras converer
-        hls_config=config,
-        output_dir=str(output_dir),
-        backend=backend,
-        part=part,
-        clock_period=clock_period,
-        clock_uncertainty=clock_unc,
-    )
+    if framework == "keras":
+        hls_model = convert_fn(
+            model,
+            input_shape=(None, 1),  # This serves as a test of handling unexpected values by the backend in keras converter
+            hls_config=config,
+            output_dir=str(output_dir),
+            backend=backend,
+            part=part,
+            clock_period=clock_period,
+            clock_uncertainty=clock_unc,
+        )
+    else:
+        hls_model = convert_fn(
+            model,
+            hls_config=config,
+            output_dir=str(output_dir),
+            backend=backend,
+            part=part,
+            clock_period=clock_period,
+            clock_uncertainty=clock_unc,
+        )

     hls_model.write()
diff --git a/test/pytest/test_batchnorm_pytorch.py b/test/pytest/test_batchnorm_pytorch.py
index 1e45e7ae0f..fd4efdf326 100644
--- a/test/pytest/test_batchnorm_pytorch.py
+++ b/test/pytest/test_batchnorm_pytorch.py
@@ -39,10 +39,12 @@ def test_batchnorm(data, backend, io_type):

     default_precision = 'ac_fixed<32, 1, true>' if backend == 'Quartus' else 'ac_fixed<32, 1>'

-    config = hls4ml.utils.config_from_pytorch_model(model, default_precision=default_precision, granularity='name')
+    config = hls4ml.utils.config_from_pytorch_model(
+        model, (in_shape,), default_precision=default_precision, granularity='name'
+    )
     output_dir = str(test_root_path / f'hls4mlprj_batchnorm_{backend}_{io_type}')
     hls_model = hls4ml.converters.convert_from_pytorch_model(
-        model, (None, in_shape), backend=backend, hls_config=config, io_type=io_type, output_dir=output_dir
+        model, backend=backend, hls_config=config, io_type=io_type, output_dir=output_dir
     )
     hls_model.compile()

@@ -94,9 +96,13 @@ def test_batchnorm_fusion(fusion_data, backend, io_type):
     # We do not have an implementation of a transpose for io_stream, need to transpose inputs and outputs outside of hls4ml
     if io_type == 'io_stream':
         fusion_data = np.ascontiguousarray(fusion_data.transpose(0, 2, 1))
-        config = hls4ml.utils.config_from_pytorch_model(model,
channels_last_conversion='internal', transpose_outputs=False) + config = hls4ml.utils.config_from_pytorch_model( + model, (n_in, size_in_height), channels_last_conversion='internal', transpose_outputs=False + ) else: - config = hls4ml.utils.config_from_pytorch_model(model, channels_last_conversion='full', transpose_outputs=True) + config = hls4ml.utils.config_from_pytorch_model( + model, (n_in, size_in_height), channels_last_conversion='full', transpose_outputs=True + ) config['Model']['Strategy'] = 'Resource' @@ -104,7 +110,6 @@ def test_batchnorm_fusion(fusion_data, backend, io_type): output_dir = str(test_root_path / f'hls4mlprj_block_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_pytorch_model( model, - (None, n_in, size_in_height), hls_config=config, output_dir=output_dir, backend=backend, diff --git a/test/pytest/test_merge_pytorch.py b/test/pytest/test_merge_pytorch.py index ac42a7bb42..1dc461e532 100644 --- a/test/pytest/test_merge_pytorch.py +++ b/test/pytest/test_merge_pytorch.py @@ -41,14 +41,16 @@ def test_merge(merge_op, io_type, backend): model = MergeModule(merge_op) model.eval() - batch_input_shape = (None,) + input_shape config = hls4ml.utils.config_from_pytorch_model( - model, default_precision='ap_fixed<32,16>', channels_last_conversion="internal", transpose_outputs=False + model, + [input_shape, input_shape], + default_precision='ap_fixed<32,16>', + channels_last_conversion="internal", + transpose_outputs=False, ) output_dir = str(test_root_path / f'hls4mlprj_merge_pytorch_{merge_op}_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_pytorch_model( model, - [batch_input_shape, batch_input_shape], hls_config=config, output_dir=output_dir, io_type=io_type, diff --git a/test/pytest/test_pytorch_api.py b/test/pytest/test_pytorch_api.py index 8d18c6a1d2..295867c4ff 100644 --- a/test/pytest/test_pytorch_api.py +++ b/test/pytest/test_pytorch_api.py @@ -32,12 +32,10 @@ def test_linear(backend, io_type): pytorch_prediction = model(torch.Tensor(X_input)).detach().numpy() - config = config_from_pytorch_model(model) + config = config_from_pytorch_model(model, (1,)) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_linear_{backend}_{io_type}') - hls_model = convert_from_pytorch_model( - model, (None, 1), hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type - ) + hls_model = convert_from_pytorch_model(model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type) hls_model.compile() @@ -83,13 +81,11 @@ def test_activations(activation_function, backend, io_type): pytorch_prediction = model(torch.Tensor(X_input)).detach().numpy() - config = config_from_pytorch_model(model) + config = config_from_pytorch_model(model, (1,)) output_dir = str( test_root_path / f'hls4mlprj_pytorch_api_activations_{activation_function.__class__.__name__}_{backend}_{io_type}' ) - hls_model = convert_from_pytorch_model( - model, (None, 1), hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type - ) + hls_model = convert_from_pytorch_model(model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type) hls_model.compile() hls_prediction = hls_model.predict(X_input) @@ -174,12 +170,10 @@ def test_activation_functionals(activation_function, backend, io_type): pytorch_prediction = model(torch.Tensor(X_input)).detach().numpy() - config = config_from_pytorch_model(model) + config = config_from_pytorch_model(model, (1,)) fn_name = activation_function.__class__.__name__ output_dir = 
str(test_root_path / f'hls4mlprj_pytorch_api_activations_functional_relu_{backend}_{io_type}_{fn_name}') - hls_model = convert_from_pytorch_model( - model, (None, 1), hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type - ) + hls_model = convert_from_pytorch_model(model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type) hls_model.compile() hls_prediction = hls_model.predict(X_input) @@ -217,14 +211,14 @@ def test_conv1d(padds, backend, io_type): if io_type == 'io_stream': X_input = np.ascontiguousarray(X_input.transpose(0, 2, 1)) - config = config_from_pytorch_model(model, channels_last_conversion="internal", transpose_outputs=False) + config = config_from_pytorch_model( + model, (n_in, size_in), channels_last_conversion="internal", transpose_outputs=False + ) else: - config = config_from_pytorch_model(model, channels_last_conversion="full", transpose_outputs=True) + config = config_from_pytorch_model(model, (n_in, size_in), channels_last_conversion="full", transpose_outputs=True) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_conv1d_{padds}_{backend}_{io_type}') - hls_model = convert_from_pytorch_model( - model, (None, n_in, size_in), hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type - ) + hls_model = convert_from_pytorch_model(model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type) hls_model.compile() from torch.fx import symbolic_trace @@ -328,14 +322,17 @@ def test_conv2d(padds, backend, io_type): if io_type == 'io_stream': X_input = np.ascontiguousarray(X_input.transpose(0, 2, 3, 1)) - config = config_from_pytorch_model(model, channels_last_conversion="internal", transpose_outputs=False) + config = config_from_pytorch_model( + model, (n_in, size_in_height, size_in_width), channels_last_conversion="internal", transpose_outputs=False + ) else: - config = config_from_pytorch_model(model, channels_last_conversion="full", transpose_outputs=True) + config = config_from_pytorch_model( + model, (n_in, size_in_height, size_in_width), channels_last_conversion="full", transpose_outputs=True + ) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_conv2d_{padds}_{backend}_{io_type}') hls_model = convert_from_pytorch_model( model, - (None, n_in, size_in_height, size_in_width), hls_config=config, output_dir=output_dir, backend=backend, @@ -478,20 +475,16 @@ def test_pooling(pooling, padds, backend): size_in_height = 0 input_shape = (1, n_in, size_in_height, size_in_width) if '2d' in pooling.__name__ else (1, n_in, size_in_width) - input_shape_forHLS = ( - (None, n_in, size_in_height, size_in_width) if '2d' in pooling.__name__ else (None, n_in, size_in_width) - ) + input_shape_forHLS = (n_in, size_in_height, size_in_width) if '2d' in pooling.__name__ else (n_in, size_in_width) X_input = np.random.rand(*input_shape) model = torch.nn.Sequential(pooling(2, padding=padds)).to() model.eval() pytorch_prediction = model(torch.Tensor(X_input)).detach().numpy() - config = config_from_pytorch_model(model) + config = config_from_pytorch_model(model, input_shape_forHLS) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_pooling_{pooling.__name__}_padds_{padds}_backend_{backend}') - hls_model = convert_from_pytorch_model( - model, input_shape_forHLS, hls_config=config, output_dir=output_dir, backend=backend - ) + hls_model = convert_from_pytorch_model(model, hls_config=config, output_dir=output_dir, backend=backend) hls_model.compile() from torch.fx import symbolic_trace @@ -598,12 +591,10 
@@ def test_bn(backend, io_type): pytorch_prediction = model(torch.Tensor(X_input)).detach().numpy().flatten() - config = config_from_pytorch_model(model) + config = config_from_pytorch_model(model, (5,)) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_bn_{backend}_{io_type}') - hls_model = convert_from_pytorch_model( - model, (None, 5), hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type - ) + hls_model = convert_from_pytorch_model(model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type) hls_model.compile() @@ -641,13 +632,11 @@ def test_squeeze(backend, io_type): pytorch_prediction = model(torch.Tensor(X_input)).detach().numpy().flatten() - config = config_from_pytorch_model(model) + config = config_from_pytorch_model(model, (5,)) del config['Model']['ChannelsLastConversion'] # We don't want anything touched for this test output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_squeeze_{backend}_{io_type}') - hls_model = convert_from_pytorch_model( - model, (None, 5), hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type - ) + hls_model = convert_from_pytorch_model(model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type) hls_model.compile() @@ -672,11 +661,11 @@ def test_flatten(backend): input = torch.randn(1, 1, 5, 5) model = nn.Sequential(nn.Conv2d(1, 32, 5, 1, 1), nn.Flatten(), nn.ReLU()) pytorch_prediction = model(input).detach().numpy() - input_shape = (None, 1, 5, 5) + input_shape = (1, 5, 5) - config = config_from_pytorch_model(model) + config = config_from_pytorch_model(model, input_shape) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_flatten_backend_{backend}') - hls_model = convert_from_pytorch_model(model, input_shape, hls_config=config, output_dir=output_dir, backend=backend) + hls_model = convert_from_pytorch_model(model, hls_config=config, output_dir=output_dir, backend=backend) hls_model.compile() pred = hls_model.predict(input.detach().numpy()) @@ -718,14 +707,16 @@ def test_skipped_layers(backend, io_type): model.eval() input_shape = (3, 8) - batch_input_shape = (None,) + input_shape config = config_from_pytorch_model( - model, default_precision='ap_fixed<32,16>', channels_last_conversion="full", transpose_outputs=False + model, + input_shape, + default_precision='ap_fixed<32,16>', + channels_last_conversion="full", + transpose_outputs=False, ) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_skipped_{backend}_{io_type}') hls_model = convert_from_pytorch_model( model, - batch_input_shape, hls_config=config, output_dir=output_dir, io_type=io_type, @@ -781,16 +772,15 @@ def forward(self, x): input_tensor = torch.randn(10, 1, 8, 8) hls_input = np.ascontiguousarray(torch.permute(input_tensor, (0, 2, 3, 1)).detach().numpy()) - batch_input_shape = (None,) + input_shape config = config_from_pytorch_model( model, + input_shape, default_precision='ap_fixed<32,16>', channels_last_conversion="full", # Crucial for testing if the first Transpose was removed ) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_transpose_nop_{tensor_rank}d_{backend}_{io_type}') hls_model = convert_from_pytorch_model( model, - batch_input_shape, hls_config=config, output_dir=output_dir, io_type=io_type, @@ -846,12 +836,11 @@ def forward(self, x): # X_input is channels last X_input = np.ascontiguousarray(X_input.transpose(0, 2, 1)) - config = config_from_pytorch_model(model, channels_last_conversion="internal", transpose_outputs=False) + config = 
config_from_pytorch_model(model, (n_in, size_in), channels_last_conversion="internal", transpose_outputs=False) output_dir = str(test_root_path / f'hls4mlprj_pytorch_view_{backend}_{io_type}') hls_model = convert_from_pytorch_model( model, - (None, n_in, size_in), hls_config=config, output_dir=output_dir, backend=backend, diff --git a/test/pytest/test_recurrent_pytorch.py b/test/pytest/test_recurrent_pytorch.py index c1672c73b9..e4737ea675 100644 --- a/test/pytest/test_recurrent_pytorch.py +++ b/test/pytest/test_recurrent_pytorch.py @@ -32,12 +32,12 @@ def test_gru(backend, io_type): pytorch_prediction = model(torch.Tensor(X_input), torch.Tensor(h0)).detach().numpy() - config = config_from_pytorch_model(model, channels_last_conversion="off", transpose_outputs=False) + config = config_from_pytorch_model( + model, [(None, 1, 10), (None, 1, 20)], channels_last_conversion="off", transpose_outputs=False + ) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_gru_{backend}_{io_type}') - hls_model = convert_from_pytorch_model( - model, [(None, 1, 10), (None, 1, 20)], hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type - ) + hls_model = convert_from_pytorch_model(model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type) hls_model.compile() @@ -69,12 +69,13 @@ def test_lstm(backend, io_type): pytorch_prediction = model(torch.Tensor(X_input), torch.Tensor(h0), torch.tensor(c0)).detach().numpy() - config = config_from_pytorch_model(model, channels_last_conversion="off", transpose_outputs=False) + config = config_from_pytorch_model( + model, [(None, 1, 10), (None, 1, 20), (None, 1, 20)], channels_last_conversion="off", transpose_outputs=False + ) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_lstm_{backend}_{io_type}') hls_model = convert_from_pytorch_model( model, - [(None, 1, 10), (None, 1, 20), (None, 1, 20)], hls_config=config, output_dir=output_dir, backend=backend, @@ -112,11 +113,13 @@ def test_rnn(backend, io_type): pytorch_prediction = model(torch.Tensor(X_input), torch.Tensor(h0)).detach().numpy() - config = config_from_pytorch_model(model, channels_last_conversion="off", transpose_outputs=False) + config = config_from_pytorch_model( + model, [(1, 10), (1, 20)], channels_last_conversion="off", transpose_outputs=False + ) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_rnn_{backend}_{io_type}') hls_model = convert_from_pytorch_model( - model, [(None, 1, 10), (None, 1, 20)], hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type + model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type ) hls_model.compile() diff --git a/test/pytest/test_sequential_parsing_pytorch.py b/test/pytest/test_sequential_parsing_pytorch.py index 569c6a5b1c..20b273400a 100644 --- a/test/pytest/test_sequential_parsing_pytorch.py +++ b/test/pytest/test_sequential_parsing_pytorch.py @@ -59,12 +59,10 @@ def test_unnamed_seq(backend, io_type, named_layers): model = seq_named else: model = seq_unnamed - config = config_from_pytorch_model(model) + config = config_from_pytorch_model(model, (1, 5, 5)) output_dir = str(test_root_path / f'hls4mlprj_pytorch_seq_unnamed_{backend}_{io_type}_{named_layers}') - convert_from_pytorch_model( - model, (None, 1, 5, 5), hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type - ) + convert_from_pytorch_model(model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type) @pytest.mark.parametrize('backend', ['Vivado']) @@ -75,9 +73,7 @@ 
def test_named_seq(backend, io_type, named_layers): model = SeqModelNamedLayers() else: model = SeqModelUnnamedLayers() - config = config_from_pytorch_model(model) + config = config_from_pytorch_model(model, (1, 5, 5)) output_dir = str(test_root_path / f'hls4mlprj_pytorch_seq_named_{backend}_{io_type}_{named_layers}') - convert_from_pytorch_model( - model, (None, 1, 5, 5), hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type - ) + convert_from_pytorch_model(model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type) diff --git a/test/pytest/test_upsampling_pytorch.py b/test/pytest/test_upsampling_pytorch.py index e881c39bbf..6e0d8f78ad 100644 --- a/test/pytest/test_upsampling_pytorch.py +++ b/test/pytest/test_upsampling_pytorch.py @@ -55,13 +55,14 @@ def test_pytorch_upsampling1d(data_1d, io_type, backend): config = hls4ml.utils.config_from_pytorch_model( model, + (None, in_feat, in_width), default_precision='ap_fixed<16,6>', channels_last_conversion="internal", transpose_outputs=False, ) odir = str(test_root_path / f'hls4mlprj_pytorch_upsampling_1d_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_pytorch_model( - model, (None, in_feat, in_width), hls_config=config, io_type=io_type, output_dir=odir, backend=backend + model, hls_config=config, io_type=io_type, output_dir=odir, backend=backend ) hls_model.compile() @@ -84,13 +85,14 @@ def test_pytorch_upsampling2d(data_2d, io_type, backend): config = hls4ml.utils.config_from_pytorch_model( model, + (in_feat, in_height, in_width), default_precision='ap_fixed<16,6>', channels_last_conversion="full", # With conversion to channels_last transpose_outputs=True, ) odir = str(test_root_path / f'hls4mlprj_pytorch_upsampling_2d_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_pytorch_model( - model, (None, in_feat, in_height, in_width), hls_config=config, io_type=io_type, output_dir=odir, backend=backend + model, hls_config=config, io_type=io_type, output_dir=odir, backend=backend ) hls_model.compile() From dd32d3b9ca83cd19e8adcfe7a28158935af3b388 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 30 Aug 2024 11:53:01 -0500 Subject: [PATCH 127/272] update vitis nnet_pooling with some changes from vivado backend --- hls4ml/templates/vitis/nnet_utils/nnet_pooling.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h b/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h index d8ac60a839..bb093f721a 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h @@ -70,6 +70,7 @@ struct pooling1d_config { static const unsigned n_out = (n_in - pool_width) / stride_width + 1; static const unsigned pad_left = 0; static const unsigned pad_right = 0; + static const bool count_pad = false; // Pooling function static const Pool_Op pool_op = Max; }; @@ -130,6 +131,7 @@ void global_pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T r for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { data_T pool[CONFIG_T::n_in]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 for (int jj = 0; jj < CONFIG_T::n_in; jj++) { pool[jj] = data[jj * CONFIG_T::n_filt + ff]; } @@ -154,6 +156,7 @@ struct pooling2d_config { static const unsigned pad_bottom = 0; static const unsigned pad_left = 0; static const unsigned pad_right = 0; + static const bool count_pad = false; // Pooling function static const Pool_Op pool_op = Max; // Reuse factor @@ -245,6 +248,7 @@ 
void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ // Loop over input image x in steps of stride for (int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width) { data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 // Keep track of number of pixels in image vs padding region unsigned img_overlap = 0; // Loop over pool window y @@ -255,10 +259,12 @@ void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ jj + ll < CONFIG_T::pad_left || jj + ll >= (padded_width - CONFIG_T::pad_right)) { // Add padding pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + if (CONFIG_T::count_pad) + img_overlap++; } else { pool[kk * CONFIG_T::stride_width + ll] = - data[(ii + kk) * CONFIG_T::in_width + ff * CONFIG_T::in_width * CONFIG_T::in_height + ll + - jj]; + data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + + ff * CONFIG_T::in_width * CONFIG_T::in_height + ll + jj - CONFIG_T::pad_left]; img_overlap++; } } From 0be1ef5fa44ccbff88ae10a24031969b9f421e00 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 30 Aug 2024 12:17:36 -0500 Subject: [PATCH 128/272] fix padding issues with pooling for Vivado, Vitis, Catapult --- .../catapult/nnet_utils/nnet_pooling.h | 52 ++++++++----------- .../templates/vitis/nnet_utils/nnet_pooling.h | 44 +++++++--------- .../vivado/nnet_utils/nnet_pooling.h | 20 ++++--- 3 files changed, 52 insertions(+), 64 deletions(-) diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_pooling.h b/hls4ml/templates/catapult/nnet_utils/nnet_pooling.h index 82e281023b..d6ab38a960 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_pooling.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_pooling.h @@ -107,22 +107,20 @@ void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONF // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit_1d(); #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit - // Add any necessary padding - unsigned padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; - if (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { - padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); - } + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { // Loop over input image x in steps of stride - for (int ii = 0; ii < padded_width; ii += CONFIG_T::stride_width) { + for (int ii = 0; ii < restricted_padded_width; ii += CONFIG_T::stride_width) { data_T pool[CONFIG_T::pool_width]; #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 // Keep track of number of pixels in image vs padding region unsigned img_overlap = 0; // Loop over pool window x for (int jj = 0; jj < CONFIG_T::stride_width; jj++) { - if (ii + jj < CONFIG_T::pad_left || ii + jj >= (padded_width - CONFIG_T::pad_right)) { + if (ii + jj < CONFIG_T::pad_left || ii + jj >= (full_padded_width - CONFIG_T::pad_right)) { // Add padding pool[jj] = pad_val(); if (CONFIG_T::count_pad) { @@ -211,19 +209,17 @@ void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit(); 
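Two ideas recur in these pooling fixes: the loop bound shrinks to the largest stride multiple that fits in the padded frame, and window coordinates in that frame are shifted back by the pad offsets before indexing the stored data. The equivalent arithmetic in Python, a sketch assuming the channels-first layout of pooling2d_cf:

    def restricted_width(in_width, pad_left, pad_right, stride):
        full = in_width + pad_left + pad_right
        return full // stride * stride  # floor to a multiple of the stride

    def source_index(ii, kk, jj, ll, ff, pad_top, pad_left, in_height, in_width):
        # (ii + kk, jj + ll) are coordinates in the padded frame; subtracting
        # the top/left pad recovers the row/column in the unpadded data array
        row = ii + kk - pad_top
        col = jj + ll - pad_left
        return ff * in_height * in_width + row * in_width + col

Dropping the pad offsets, as the old code did, reads from positions shifted down and right by the padding amounts, which is exactly the bug these patches correct.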
#pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit - // Add any necessary padding - unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; - unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; - if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { - padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); - padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); - } + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; + static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { // Loop over input image y in steps of stride - for (int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height) { + for (int ii = 0; ii < restricted_padded_height; ii += CONFIG_T::stride_height) { // Loop over input image x in steps of stride - for (int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width) { + for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) { data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 // Keep track of number of pixels in image vs padding region @@ -232,8 +228,8 @@ void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { // Loop over pool window x for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { - if (ii + kk < CONFIG_T::pad_top || ii + kk >= (padded_height - CONFIG_T::pad_bottom) || - jj + ll < CONFIG_T::pad_left || jj + ll >= (padded_width - CONFIG_T::pad_right)) { + if (ii + kk < CONFIG_T::pad_top || ii + kk >= (full_padded_height - CONFIG_T::pad_bottom) || + jj + ll < CONFIG_T::pad_left || jj + ll >= (full_padded_width - CONFIG_T::pad_right)) { // Add padding pool[kk * CONFIG_T::stride_width + ll] = pad_val(); if (CONFIG_T::count_pad) { @@ -275,19 +271,17 @@ void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit(); #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit - // Add any necessary padding - unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; - unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; - if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { - padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); - padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); - } + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + static 
constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; + static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { // Loop over input image y in steps of stride - for (int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height) { + for (int ii = 0; ii < restricted_padded_height; ii += CONFIG_T::stride_height) { // Loop over input image x in steps of stride - for (int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width) { + for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) { data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 // Keep track of number of pixels in image vs padding region @@ -296,8 +290,8 @@ void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { // Loop over pool window x for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { - if (ii + kk < CONFIG_T::pad_top || ii + kk >= (padded_height - CONFIG_T::pad_bottom) || - jj + ll < CONFIG_T::pad_left || jj + ll >= (padded_width - CONFIG_T::pad_right)) { + if (ii + kk < CONFIG_T::pad_top || ii + kk >= (full_padded_height - CONFIG_T::pad_bottom) || + jj + ll < CONFIG_T::pad_left || jj + ll >= (full_padded_width - CONFIG_T::pad_right)) { // Add padding pool[kk * CONFIG_T::stride_width + ll] = pad_val(); if (CONFIG_T::count_pad) { diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h b/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h index bb093f721a..93d23d2689 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_pooling.h @@ -89,14 +89,13 @@ void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONF CONFIG_T::pool_op, typename CONFIG_T::accum_t> limit=limit // Add any necessary padding - unsigned padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; - if (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { - padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); - } + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { // Loop over input image x in steps of stride - for (int ii = 0; ii < padded_width; ii += CONFIG_T::stride_width) { + for (int ii = 0; ii < restricted_padded_width; ii += CONFIG_T::stride_width) { unsigned overlap_pixel = 0; data_T pool[CONFIG_T::pool_width]; #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 @@ -179,18 +178,17 @@ void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ const int limit = pool_op_limit(); #pragma HLS ALLOCATION function instances=pool_op limit=limit - unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; - unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; - if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { - padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); - padded_width -= padded_width - (padded_width / 
CONFIG_T::stride_width * CONFIG_T::stride_width); - } + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; + static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { // Loop over input image y in steps of stride - for (int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height) { + for (int ii = 0; ii < restricted_padded_height; ii += CONFIG_T::stride_height) { // Loop over input image x in steps of stride - for (int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width) { + for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) { data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 @@ -234,19 +232,17 @@ void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ const int limit = pool_op_limit(); #pragma HLS ALLOCATION function instances=pool_op limit=limit - // Add any necessary padding - unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; - unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; - if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { - padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); - padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); - } + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; + static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { // Loop over input image y in steps of stride - for (int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height) { + for (int ii = 0; ii < restricted_padded_height; ii += CONFIG_T::stride_height) { // Loop over input image x in steps of stride - for (int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width) { + for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) { data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 // Keep track of number of pixels in image vs padding region @@ -255,8 +251,8 @@ void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { // Loop over pool window x for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { - if (ii + kk < CONFIG_T::pad_top || ii + kk >= (padded_height - CONFIG_T::pad_bottom) || - jj + ll < CONFIG_T::pad_left || jj + ll >= (padded_width - CONFIG_T::pad_right)) { + if (ii + kk < CONFIG_T::pad_top || ii + kk >= (full_padded_height - CONFIG_T::pad_bottom) || + jj + ll < 
CONFIG_T::pad_left || jj + ll >= (full_padded_width - CONFIG_T::pad_right)) { // Add padding pool[kk * CONFIG_T::stride_width + ll] = pad_val(); if (CONFIG_T::count_pad) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h index e6182d20db..78af07fc10 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h @@ -87,14 +87,13 @@ void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONF #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit // Add any necessary padding - unsigned padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; - if (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { - padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); - } + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { // Loop over input image x in steps of stride - for (int ii = 0; ii < padded_width; ii += CONFIG_T::stride_width) { + for (int ii = 0; ii < restricted_padded_width; ii += CONFIG_T::stride_width) { unsigned overlap_pixel = 0; data_T pool[CONFIG_T::pool_width]; #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 @@ -176,12 +175,11 @@ void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ const int limit = pool_op_limit(); #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit - unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; - unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; - if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { - padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); - padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); - } + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; + static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { From 88f072bb0d112e5a9bfd909b928af3d531287ea3 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 30 Aug 2024 12:18:48 -0500 Subject: [PATCH 129/272] fix pre-commit --- hls4ml/templates/vivado/nnet_utils/nnet_pooling.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h index 78af07fc10..60f974564c 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h @@ -175,7 +175,7 @@ void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ const int limit = pool_op_limit(); #pragma HLS ALLOCATION function 
instances=CONFIG_T::pool_op limit=limit - // Add padding and reduce input width to area covered by pooling function + // Add padding and reduce input width to area covered by pooling function static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; From 938eb5e6ba0740315910dcdf1efb5d0b8bea0d9e Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 30 Aug 2024 12:28:18 -0500 Subject: [PATCH 130/272] add missed Vivado backend pooling changes --- .../vivado/nnet_utils/nnet_pooling.h | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h index 60f974564c..bb9f0b3f05 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h @@ -184,9 +184,9 @@ void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { // Loop over input image y in steps of stride - for (int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height) { + for (int ii = 0; ii < restricted_padded_height; ii += CONFIG_T::stride_height) { // Loop over input image x in steps of stride - for (int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width) { + for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) { data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 @@ -229,19 +229,17 @@ void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ // TODO partition the arrays according to the reuse factor const int limit = pool_op_limit(); #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit - // Add any necessary padding - unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; - unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; - if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { - padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); - padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); - } + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; + static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { // Loop over input image y in steps of stride - for (int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height) { + for (int ii = 0; ii < restricted_padded_height; ii += CONFIG_T::stride_height) { // Loop over input image x in steps of stride - for (int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width) { + for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) { data_T 
pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 // Keep track of number of pixels in image vs padding region @@ -250,8 +248,8 @@ void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { // Loop over pool window x for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { - if (ii + kk < CONFIG_T::pad_top || ii + kk >= (padded_height - CONFIG_T::pad_bottom) || - jj + ll < CONFIG_T::pad_left || jj + ll >= (padded_width - CONFIG_T::pad_right)) { + if (ii + kk < CONFIG_T::pad_top || ii + kk >= (full_padded_height - CONFIG_T::pad_bottom) || + jj + ll < CONFIG_T::pad_left || jj + ll >= (full_padded_width - CONFIG_T::pad_right)) { // Add padding pool[kk * CONFIG_T::stride_width + ll] = pad_val(); if (CONFIG_T::count_pad) From cc7652de36847360b54c99c6fb9cad3665760943 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 3 Sep 2024 11:15:53 -0500 Subject: [PATCH 131/272] Pre-commit fix --- hls4ml/converters/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hls4ml/converters/__init__.py b/hls4ml/converters/__init__.py index 40515afea6..c5ff82703c 100644 --- a/hls4ml/converters/__init__.py +++ b/hls4ml/converters/__init__.py @@ -11,6 +11,7 @@ from hls4ml.converters.keras_to_hls import parse_keras_model # noqa: F401 from hls4ml.converters.keras_to_hls import keras_to_hls, register_keras_layer_handler from hls4ml.converters.onnx_to_hls import parse_onnx_model # noqa: F401 + # from hls4ml.converters.pytorch_to_hls import parse_pytorch_model # noqa: F401 from hls4ml.model import ModelGraph from hls4ml.utils.config import create_config From b36fe4ff2eadd2c023550dec580a64e0dbb2b5ef Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 3 Sep 2024 19:19:48 -0500 Subject: [PATCH 132/272] fix qonnx review suggestions --- hls4ml/converters/__init__.py | 2 -- hls4ml/converters/onnx/core.py | 14 ------------- .../model/optimizer/passes/batchnorm_opt.py | 21 ++++++++----------- hls4ml/model/optimizer/passes/bn_fuse.py | 11 ++++------ 4 files changed, 13 insertions(+), 35 deletions(-) diff --git a/hls4ml/converters/__init__.py b/hls4ml/converters/__init__.py index c5ff82703c..13e90df687 100644 --- a/hls4ml/converters/__init__.py +++ b/hls4ml/converters/__init__.py @@ -11,8 +11,6 @@ from hls4ml.converters.keras_to_hls import parse_keras_model # noqa: F401 from hls4ml.converters.keras_to_hls import keras_to_hls, register_keras_layer_handler from hls4ml.converters.onnx_to_hls import parse_onnx_model # noqa: F401 - -# from hls4ml.converters.pytorch_to_hls import parse_pytorch_model # noqa: F401 from hls4ml.model import ModelGraph from hls4ml.utils.config import create_config from hls4ml.utils.symbolic_utils import LUTFunction diff --git a/hls4ml/converters/onnx/core.py b/hls4ml/converters/onnx/core.py index c6aaa6009c..d84ba98a95 100644 --- a/hls4ml/converters/onnx/core.py +++ b/hls4ml/converters/onnx/core.py @@ -29,7 +29,6 @@ def parse_matmul_layer(node, input_names, input_shapes, graph): 'Softmax', 'Softsign', 'Softplus', - # 'Clip', ] activation_map = { @@ -45,7 +44,6 @@ def parse_matmul_layer(node, input_names, input_shapes, graph): 'Softmax': 'Softmax', 'Softsign': 'Activation', 'Softplus': 'Activation', - # 'Clip': 'Clip', } # --------- @@ -69,18 +67,6 @@ def parse_activation_layer(node, input_names, input_shapes, graph): layer['activation'] = layer['class_name'] layer['activ_param'] = get_onnx_attribute(node, 'alpha', 0.01) - # # Don't yet support Clip - # 
elif layer['class_name'] == 'Clip': - # clip_min_node = [x for x in graph.initializer if x.name in input_names] - # clip_min = clip_min_node[0].float_data[0] - - # # Check if it's relu or not - # if clip_min == 0.0: - # layer['class_name'] = 'Activation' - # layer['activation'] = 'ReLU' - # else: - # raise Exception('Clip with min != 0 is not supported yet!') - else: layer['activation'] = layer['class_name'] layer['class_name'] = 'Activation' diff --git a/hls4ml/model/optimizer/passes/batchnorm_opt.py b/hls4ml/model/optimizer/passes/batchnorm_opt.py index 94a9a32d70..50bbf96e04 100644 --- a/hls4ml/model/optimizer/passes/batchnorm_opt.py +++ b/hls4ml/model/optimizer/passes/batchnorm_opt.py @@ -30,7 +30,7 @@ def transform(self, model, node): gamma_node = node.get_input_node(node.inputs[1]) if not isinstance(gamma_node, Constant): - raise TypeError('Only consant gammas supported') + raise TypeError('Only constant gammas supported') gamma = gamma_node.attributes['value'] attributes['gamma_data'] = gamma attributes['gamma_quantizer'] = gamma_node.get_attr('quantizer') @@ -40,7 +40,7 @@ def transform(self, model, node): beta_node = node.get_input_node(node.inputs[2]) if not isinstance(beta_node, Constant): - raise TypeError('Only consant betas supported') + raise TypeError('Only constant betas supported') beta = beta_node.attributes['value'] attributes['beta_data'] = beta attributes['beta_quantizer'] = beta_node.get_attr('quantizer') @@ -49,7 +49,7 @@ def transform(self, model, node): moving_mean_node = node.get_input_node(node.inputs[3]) if not isinstance(moving_mean_node, Constant): - raise TypeError('Only consant moving_means supported') + raise TypeError('Only constant moving_means supported') moving_mean = moving_mean_node.attributes['value'] attributes['mean_data'] = moving_mean attributes['mean_quantizer'] = moving_mean_node.get_attr('quantizer') @@ -58,7 +58,7 @@ def transform(self, model, node): moving_variance_node = node.get_input_node(node.inputs[4]) if not isinstance(moving_variance_node, Constant): - raise TypeError('Only consant moving_variances supported') + raise TypeError('Only constant moving_variances supported') moving_variance = moving_variance_node.attributes['value'] attributes['variance_data'] = moving_variance attributes['variance_quantizer'] = moving_variance_node.get_attr('quantizer') @@ -147,12 +147,14 @@ def transform(self, model, node): class FuseConsecutiveBatchNormalization(OptimizerPass): """ - OptimizerPass to merge consecutive BatchNormalization layers, - only if the earlier one does not have quantization specified + OptimizerPass to merge consecutive BatchNormalization layers, only if the earlier one does not have the output type + specified. There is a further check on the compatibility to merge: except in cases when merging a scale of 1 or a + bias of 0, this does not merge when both scales or both biases are quantized. Note: Consider restricting this to ApplyAlpha. Batch Normalization-style quantization seems to be ignored. - Note: This optimizer may not be safe if weights are updateable. May need to turn off. + Note: This optimizer may not be safe if weights are updateable, in particular if a scale can go from ones to other + values or if a bias can go from zeros to other values. 
""" def match(self, node): @@ -190,11 +192,6 @@ def transform(self, model, node): if len(prev_map[prev_node.outputs[0]]) > 1: return False - # # Not sure why this part is needed - # node_map = node.get_output_use_map() - # if len(node_map[node.outputs[0]]) > 1: - # return False - s0 = prev_node.weights['scale'].data_unquantized b0 = prev_node.weights['bias'].data_unquantized s1 = node.weights['scale'].data_unquantized diff --git a/hls4ml/model/optimizer/passes/bn_fuse.py b/hls4ml/model/optimizer/passes/bn_fuse.py index b3e8e454c8..000d8380ce 100644 --- a/hls4ml/model/optimizer/passes/bn_fuse.py +++ b/hls4ml/model/optimizer/passes/bn_fuse.py @@ -7,8 +7,10 @@ class FuseBatchNormalization(OptimizerPass): """ - OptimizerPass to merge BatchNormalization layers, - only if the earlier one does not have quantization specified + OptimizerPass to merge a BatchNormalization layer with Dense or Conv layer, only if the Dense or Conv layer does not + have the output type specified. There is a further check on the compatibility to merge: except in cases when merging a + weight/scale of 1 or a bias of 0, this optimizer does not merge nodes when both the weight and scale or both biases + are quantized. Note: Consider restricting this to ApplyAlpha. Batch Normalization quantization seems to be ignored. @@ -49,11 +51,6 @@ def transform(self, model, node): if len(parent_map[parent_node.outputs[0]]) > 1: return False - # # Not sure why this part is needed - # node_map = node.get_output_use_map() - # if len(node_map[node.outputs[0]]) > 1: - # return False - parent_weight = parent_node.weights['weight'] parent_bias = parent_node.weights['bias'] From c37d953181f64396d079c5d4b5f51dabceae8e2e Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 3 Sep 2024 19:22:44 -0500 Subject: [PATCH 133/272] fix qonnx review suggestions (part 2) --- hls4ml/model/optimizer/passes/merge_const.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hls4ml/model/optimizer/passes/merge_const.py b/hls4ml/model/optimizer/passes/merge_const.py index 78591d203c..a75ed27aca 100644 --- a/hls4ml/model/optimizer/passes/merge_const.py +++ b/hls4ml/model/optimizer/passes/merge_const.py @@ -67,7 +67,7 @@ def transform(self, model, node): class MergeToApplyAlpha(OptimizerPass): - """Convert Add, Sub, Mul, or Div Merges with consant to ApplyAlpha""" + """Convert Add, Sub, Mul, or Div Merges with constant to ApplyAlpha""" def match(self, node): is_match = ( @@ -178,7 +178,7 @@ def transform(self, model, node): class MergeToApplyAlphaDiv(OptimizerPass): """ - Convert Div Merges with consant to ApplyAlpha + Convert Div Merges with constant to ApplyAlpha TODO: propagate precision """ From 23825ded13fb418516af29fa6a4768c97bc98ba8 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 4 Sep 2024 12:10:19 -0500 Subject: [PATCH 134/272] fix error message --- hls4ml/model/optimizer/passes/batchnorm_opt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/model/optimizer/passes/batchnorm_opt.py b/hls4ml/model/optimizer/passes/batchnorm_opt.py index 50bbf96e04..cd238092c8 100644 --- a/hls4ml/model/optimizer/passes/batchnorm_opt.py +++ b/hls4ml/model/optimizer/passes/batchnorm_opt.py @@ -24,7 +24,7 @@ def transform(self, model, node): """ if not (len(node.inputs) == 5 and all(node.inputs)): - raise ValueError(f'All {len.node.inputs} BatchNormOnnnx inputs need to be defined') + raise ValueError('All 5 BatchNormOnnnx inputs need to be defined') attributes = {k: node.attributes.get(k, None) for k in 
_base_attributes} From cad06fa9361810fb006061a65fbea2b49feee50b Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Mon, 9 Sep 2024 14:50:21 -0500 Subject: [PATCH 135/272] change order of qonnx optimizers --- hls4ml/model/optimizer/__init__.py | 2 +- hls4ml/model/optimizer/passes/batchnorm_opt.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 64be9903ad..fee180b0c5 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -37,8 +37,8 @@ 'quant_constant_parameters', 'quant_to_activation', 'fuse_quant_with_constant', - 'quant_to_alpha_activation_alpha', 'const_quant_to_const_alpha', + 'quant_to_alpha_activation_alpha', 'batch_norm_onnx_constant_parameters', 'constant_batch_norm_fusion', 'merge_two_constants', diff --git a/hls4ml/model/optimizer/passes/batchnorm_opt.py b/hls4ml/model/optimizer/passes/batchnorm_opt.py index cd238092c8..0dde6b77a9 100644 --- a/hls4ml/model/optimizer/passes/batchnorm_opt.py +++ b/hls4ml/model/optimizer/passes/batchnorm_opt.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np from hls4ml.model.layers import BatchNormalization, BatchNormOnnx, Constant @@ -94,11 +96,14 @@ def transform(self, model, node): """ Remove the batch norm """ + warnings.warn('ConstantBatchNormFusion should probably not be triggered. Check the optimizer order.', stacklevel=2) const_node = node.get_input_node(node.inputs[0]) const_prec = const_node.get_output_variable().type.precision - new_val = const_node.value * node.weights['scale'].data_unquantized + node.weights['bias'].data_unquantized + new_val = ( + const_node.attributes['value'] * node.weights['scale'].data_unquantized + node.weights['bias'].data_unquantized + ) const_node.set_attr('value', new_val) const_node.set_attr('quantizer', node.get_attr('quantizer')) # None if not defined From bb708ccb165e167462e61b41c081b405ec5bda2e Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 10 Sep 2024 18:03:31 -0500 Subject: [PATCH 136/272] remove padding attribute (#1061) * remove padding attribute * remove checking the padding attribute in pytorch tests --- .../backends/catapult/passes/conv_same_pad.py | 18 +++++++----------- hls4ml/backends/vivado/passes/conv_same_pad.py | 18 +++++++----------- hls4ml/converters/keras/convolution.py | 6 ++---- hls4ml/converters/keras/pooling.py | 6 ++---- hls4ml/converters/pytorch/convolution.py | 10 ---------- hls4ml/model/optimizer/passes/multi_dense.py | 1 - .../optimizer/passes/seperable_to_dw_conv.py | 1 - test/pytest/test_pytorch_api.py | 10 ++-------- 8 files changed, 20 insertions(+), 50 deletions(-) diff --git a/hls4ml/backends/catapult/passes/conv_same_pad.py b/hls4ml/backends/catapult/passes/conv_same_pad.py index bb8354a3d0..8946e493fc 100755 --- a/hls4ml/backends/catapult/passes/conv_same_pad.py +++ b/hls4ml/backends/catapult/passes/conv_same_pad.py @@ -6,10 +6,8 @@ class InsertZeroPaddingBeforeConv1D(OptimizerPass): name = 'insert_zero_padding_before_conv1d' def match(self, node): - is_match = ( - isinstance(node, (Conv1D, SeparableConv1D)) - and ((node.get_attr('padding') == 'same') or (node.get_attr('padding') == 'causal')) - and node.get_attr('filt_width') != 1 + is_match = isinstance(node, (Conv1D, SeparableConv1D)) and ( + (node.get_attr('pad_left') != 0) or (node.get_attr('pad_right') != 0) ) return is_match @@ -37,7 +35,6 @@ def transform(self, model, node): } # Switch Conv1D layer padding to 'valid' - node.set_attr('padding', 'valid')
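With the string attribute gone, the pass now fires whenever any computed pad amount is nonzero. For reference, 'same' padding amounts in the Keras/TensorFlow convention can be derived as in this sketch, a hypothetical helper mirroring what the existing compute_padding_1d utility is expected to return:

    import math

    def same_padding_1d(in_width, stride, kernel):
        out_width = math.ceil(in_width / stride)
        pad_total = max((out_width - 1) * stride + kernel - in_width, 0)
        pad_left = pad_total // 2
        pad_right = pad_total - pad_left
        return out_width, pad_left, pad_right

Under 'valid' padding both amounts come out to zero, which is exactly the condition the rewritten match methods test.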
node.set_attr('pad_left', 0) node.set_attr('pad_right', 0) node.set_attr('in_width', out_width) @@ -54,11 +51,11 @@ class InsertZeroPaddingBeforeConv2D(OptimizerPass): name = 'insert_zero_padding_before_conv2d' def match(self, node): - is_match = ( - isinstance(node, (Conv2D, SeparableConv2D)) - and node.get_attr('padding') == 'same' - and node.get_attr('filt_height') != 1 - and node.get_attr('filt_width') != 1 + is_match = isinstance(node, (Conv2D, SeparableConv2D)) and ( + (node.get_attr('pad_left') != 0) + or (node.get_attr('pad_right') != 0) + or (node.get_attr('pad_top') != 0) + or (node.get_attr('pad_bottom') != 0) ) return is_match @@ -93,7 +90,6 @@ def transform(self, model, node): } # Switch Conv2D layer padding to 'valid' - node.set_attr('padding', 'valid') node.set_attr('pad_top', 0) node.set_attr('pad_bottom', 0) node.set_attr('pad_left', 0) diff --git a/hls4ml/backends/vivado/passes/conv_same_pad.py b/hls4ml/backends/vivado/passes/conv_same_pad.py index bb8354a3d0..8946e493fc 100644 --- a/hls4ml/backends/vivado/passes/conv_same_pad.py +++ b/hls4ml/backends/vivado/passes/conv_same_pad.py @@ -6,10 +6,8 @@ class InsertZeroPaddingBeforeConv1D(OptimizerPass): name = 'insert_zero_padding_before_conv1d' def match(self, node): - is_match = ( - isinstance(node, (Conv1D, SeparableConv1D)) - and ((node.get_attr('padding') == 'same') or (node.get_attr('padding') == 'causal')) - and node.get_attr('filt_width') != 1 + is_match = isinstance(node, (Conv1D, SeparableConv1D)) and ( + (node.get_attr('pad_left') != 0) or (node.get_attr('pad_right') != 0) ) return is_match @@ -37,7 +35,6 @@ def transform(self, model, node): } # Switch Conv1D layer padding to 'valid' - node.set_attr('padding', 'valid') node.set_attr('pad_left', 0) node.set_attr('pad_right', 0) node.set_attr('in_width', out_width) @@ -54,11 +51,11 @@ class InsertZeroPaddingBeforeConv2D(OptimizerPass): name = 'insert_zero_padding_before_conv2d' def match(self, node): - is_match = ( - isinstance(node, (Conv2D, SeparableConv2D)) - and node.get_attr('padding') == 'same' - and node.get_attr('filt_height') != 1 - and node.get_attr('filt_width') != 1 + is_match = isinstance(node, (Conv2D, SeparableConv2D)) and ( + (node.get_attr('pad_left') != 0) + or (node.get_attr('pad_right') != 0) + or (node.get_attr('pad_top') != 0) + or (node.get_attr('pad_bottom') != 0) ) return is_match @@ -93,7 +90,6 @@ def transform(self, model, node): } # Switch Conv2D layer padding to 'valid' - node.set_attr('padding', 'valid') node.set_attr('pad_top', 0) node.set_attr('pad_bottom', 0) node.set_attr('pad_left', 0) diff --git a/hls4ml/converters/keras/convolution.py b/hls4ml/converters/keras/convolution.py index d223d55dfb..950a672692 100644 --- a/hls4ml/converters/keras/convolution.py +++ b/hls4ml/converters/keras/convolution.py @@ -30,10 +30,9 @@ def parse_conv1d_layer(keras_layer, input_names, input_shapes, data_reader): layer['n_filt'] = layer['n_chan'] * layer.get('depth_multiplier') layer['filt_width'] = keras_layer['config']['kernel_size'][0] layer['stride_width'] = keras_layer['config']['strides'][0] - layer['padding'] = keras_layer['config']['padding'] (layer['out_width'], layer['pad_left'], layer['pad_right']) = compute_padding_1d( - layer['padding'], layer['in_width'], layer['stride_width'], layer['filt_width'] + keras_layer['config']['padding'], layer['in_width'], layer['stride_width'], layer['filt_width'] ) if layer['data_format'] == 'channels_last': @@ -74,7 +73,6 @@ def parse_conv2d_layer(keras_layer, input_names, input_shapes, data_reader): 
layer['filt_width'] = keras_layer['config']['kernel_size'][1] layer['stride_height'] = keras_layer['config']['strides'][0] layer['stride_width'] = keras_layer['config']['strides'][1] - layer['padding'] = keras_layer['config']['padding'] ( layer['out_height'], @@ -84,7 +82,7 @@ def parse_conv2d_layer(keras_layer, input_names, input_shapes, data_reader): layer['pad_left'], layer['pad_right'], ) = compute_padding_2d( - layer['padding'], + keras_layer['config']['padding'], layer['in_height'], layer['in_width'], layer['stride_height'], diff --git a/hls4ml/converters/keras/pooling.py b/hls4ml/converters/keras/pooling.py index f0e00242b0..14d6a9236a 100644 --- a/hls4ml/converters/keras/pooling.py +++ b/hls4ml/converters/keras/pooling.py @@ -15,10 +15,9 @@ def parse_pooling_layer(keras_layer, input_names, input_shapes, data_reader): layer['pool_width'] = keras_layer['config']['pool_size'][0] layer['stride_width'] = keras_layer['config']['strides'][0] - layer['padding'] = keras_layer['config']['padding'] (layer['n_out'], layer['pad_left'], layer['pad_right']) = compute_padding_1d( - layer['padding'], layer['n_in'], layer['stride_width'], layer['pool_width'] + keras_layer['config']['padding'], layer['n_in'], layer['stride_width'], layer['pool_width'] ) if layer['data_format'] == 'channels_last': @@ -32,7 +31,6 @@ def parse_pooling_layer(keras_layer, input_names, input_shapes, data_reader): layer['stride_width'] = keras_layer['config']['strides'][1] layer['pool_height'] = keras_layer['config']['pool_size'][0] layer['pool_width'] = keras_layer['config']['pool_size'][1] - layer['padding'] = keras_layer['config']['padding'] ( layer['out_height'], @@ -42,7 +40,7 @@ def parse_pooling_layer(keras_layer, input_names, input_shapes, data_reader): layer['pad_left'], layer['pad_right'], ) = compute_padding_2d( - layer['padding'], + keras_layer['config']['padding'], layer['in_height'], layer['in_width'], layer['stride_height'], diff --git a/hls4ml/converters/pytorch/convolution.py b/hls4ml/converters/pytorch/convolution.py index 5c0d4d2d4c..40295e0865 100644 --- a/hls4ml/converters/pytorch/convolution.py +++ b/hls4ml/converters/pytorch/convolution.py @@ -35,11 +35,6 @@ def parse_conv1d_layer(operation, layer_name, input_names, input_shapes, node, c else: padding = class_object.padding - if padding == 0: # No padding, i.e., 'VALID' padding in Keras/Tensorflow - layer['padding'] = 'valid' - else: # Only 'valid' and 'same' padding are available in Keras - layer['padding'] = 'same' - # Ouput info (layer['out_width'], pad_left, pad_right) = compute_padding_1d_pytorch( padding, layer['in_width'], layer['stride_width'], layer['filt_width'], layer['dilation'] @@ -84,11 +79,6 @@ def parse_conv2d_layer(operation, layer_name, input_names, input_shapes, node, c layer['pad_top'] = layer['pad_bottom'] = class_object.padding[0] layer['pad_left'] = layer['pad_right'] = class_object.padding[1] - if all(x == 0 for x in class_object.padding): # No padding, i.e., 'VALID' padding in Keras/Tensorflow - layer['padding'] = 'valid' - else: # Only 'valid' and 'same' padding are available in Keras - layer['padding'] = 'same' - # Ouput info (layer['out_height'], layer['out_width'], _, _, _, _) = compute_padding_2d_pytorch( class_object.padding, diff --git a/hls4ml/model/optimizer/passes/multi_dense.py b/hls4ml/model/optimizer/passes/multi_dense.py index 008011bde2..4419abf9c8 100644 --- a/hls4ml/model/optimizer/passes/multi_dense.py +++ b/hls4ml/model/optimizer/passes/multi_dense.py @@ -20,7 +20,6 @@ def transform(self, model, node): 
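The PyTorch parser changes above keep the explicit pad_top/pad_left attributes and drop only the derived 'same'/'valid' string. PyTorch reports symmetric integer padding per spatial dimension, so the mapping onto explicit attributes is direct; a sketch, assuming integer padding and dilation 1:

    def pytorch_conv1d_shape(in_width, padding, stride, kernel):
        # torch.nn.Conv1d pads both sides by the same integer amount
        out_width = (in_width + 2 * padding - kernel) // stride + 1
        return out_width, padding, padding  # (out_width, pad_left, pad_right)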
conv_attrs = { 'data_format': 'channels_last', - 'padding': 'valid', 'n_chan': input_shape[-1], 'n_filt': node.get_attr('n_out'), 'weight_data': np.expand_dims(node.get_attr('weight_data'), axis=tuple(range(dim))), diff --git a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py index 7d3b71dc96..38eef1e7d0 100644 --- a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py +++ b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py @@ -33,7 +33,6 @@ class SeperableToDepthwiseAndConv(OptimizerPass): 'data_format', 'depthwise_data', 'depthwise_quantizer', - 'padding', ) _pw_attributes = ('out_width', 'n_filt', 'dilation_width', 'out_height', 'dilation_height', 'data_format', 'use_bias') diff --git a/test/pytest/test_pytorch_api.py b/test/pytest/test_pytorch_api.py index 295867c4ff..d49b5b8902 100644 --- a/test/pytest/test_pytorch_api.py +++ b/test/pytest/test_pytorch_api.py @@ -277,10 +277,7 @@ def test_conv1d(padds, backend, io_type): assert list(hls_model.get_layers())[conv_index].attributes['n_chan'] == class_object_conv.in_channels assert list(hls_model.get_layers())[conv_index].attributes['n_filt'] == class_object_conv.out_channels assert list(hls_model.get_layers())[conv_index].attributes['stride_width'] == class_object_conv.stride[0] - if list(hls_model.get_layers())[conv_index].attributes['padding'] == 'valid': - padding = 0 - else: - padding = 1 + padding = padds if io_type == "io_stream" and (backend == "Vivado" or backend == "Vitis") and padds == 1: padding = 1 padds = 0 @@ -424,10 +421,7 @@ def test_conv2d(padds, backend, io_type): assert list(hls_model.get_layers())[conv_index].attributes['n_filt'] == class_object_conv.out_channels assert list(hls_model.get_layers())[conv_index].attributes['stride_width'] == class_object_conv.stride[1] assert list(hls_model.get_layers())[conv_index].attributes['stride_height'] == class_object_conv.stride[0] - if list(hls_model.get_layers())[conv_index].attributes['padding'] == 'valid': - padding = 0 - else: - padding = 1 + padding = padds assert padding == class_object_conv.padding[0] assert list(hls_model.get_layers())[conv_index].attributes['data_format'] == 'channels_last' From 4a83abc1bc73de0363a0f713ddb2dc289a5aefae Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 11 Sep 2024 01:05:12 +0200 Subject: [PATCH 137/272] Run long-running pytests out of the bundle --- test/pytest/generate_ci_yaml.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/test/pytest/generate_ci_yaml.py b/test/pytest/generate_ci_yaml.py index f62d752ade..b130b43cef 100644 --- a/test/pytest/generate_ci_yaml.py +++ b/test/pytest/generate_ci_yaml.py @@ -20,8 +20,12 @@ n_test_files_per_yml = int(os.environ.get('N_TESTS_PER_YAML', 4)) +# Blacklisted tests will be skipped BLACKLIST = {'test_reduction'} +# Long-running tests will not be bundled with other tests +LONGLIST = {'test_hgq_layers'} + def path_to_name(test_path): path = Path(test_path) @@ -43,9 +47,7 @@ def uses_example_model(test_filename): def generate_test_yaml(test_root='.'): test_root = Path(test_root) - test_paths = [path for path in test_root.glob('**/test_*.py') if path.stem not in BLACKLIST] - for path in test_paths: - print(path.name) + test_paths = [path for path in test_root.glob('**/test_*.py') if path.stem not in (BLACKLIST | LONGLIST)] need_example_models = [uses_example_model(path) for path in test_paths] idxs = list(range(len(need_example_models))) @@ -63,6 +65,15 @@ def 
generate_test_yaml(test_root='.'): yml = diff_yml else: yml.update(diff_yml) + + test_paths = [path for path in test_root.glob('**/test_*.py') if path.stem in LONGLIST] + for path in test_paths: + name = path.stem.replace('test_', '') + test_file = str(path.relative_to(test_root)) + needs_examples = uses_example_model(path) + diff_yml = yaml.safe_load(template.format(name, test_file, needs_examples)) + yml.update(diff_yml) + return yml From d63033b3be4d3e76aa7e67e6641b6e72523dde25 Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Wed, 11 Sep 2024 11:38:10 -0400 Subject: [PATCH 138/272] Fix tanh activation in pytorch parser (#1055) * fix tanh activation in pytorch parser * simplify fix by making the activation attribute lower case --- hls4ml/converters/pytorch/core.py | 8 +++----- hls4ml/converters/pytorch_to_hls.py | 1 + test/pytest/test_pytorch_api.py | 20 ++++++++++++++++---- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/hls4ml/converters/pytorch/core.py b/hls4ml/converters/pytorch/core.py index 0262fdab03..d3ba470bf5 100644 --- a/hls4ml/converters/pytorch/core.py +++ b/hls4ml/converters/pytorch/core.py @@ -43,14 +43,12 @@ def parse_activation_layer(operation, layer_name, input_names, input_shapes, nod layer = {} layer['class_name'] = operation - layer['activation'] = layer['class_name'] + layer['activation'] = layer['class_name'].lower() layer['name'] = layer_name layer['inputs'] = input_names - # if layer['class_name'] != 'Activation': - # layer['activation'] = layer['class_name'] if node.op == 'call_module': - if layer['class_name'] == 'ReLU' or layer['class_name'] == 'Sigmoid': + if layer['class_name'] in ['ReLU', 'Sigmoid', 'Tanh']: layer['class_name'] = 'Activation' if layer['class_name'] == 'LeakyReLU': layer['activ_param'] = class_object.negative_slope @@ -68,7 +66,7 @@ def parse_activation_layer(operation, layer_name, input_names, input_shapes, nod if hasattr(node, 'dim'): layer['axis'] = class_object.dim else: - if layer['class_name'] == 'ReLU' or layer['class_name'] == 'Sigmoid': + if layer['class_name'] in ['ReLU', 'Sigmoid', 'Tanh']: layer['class_name'] = 'Activation' if layer['class_name'] == 'LeakyReLU': layer['activ_param'] = node.kwargs['negative_slope'] diff --git a/hls4ml/converters/pytorch_to_hls.py b/hls4ml/converters/pytorch_to_hls.py index 40336835a6..79ca1fa5c6 100644 --- a/hls4ml/converters/pytorch_to_hls.py +++ b/hls4ml/converters/pytorch_to_hls.py @@ -84,6 +84,7 @@ def decorator(function): # map names of operations between toch.nn and torch.nn.functionals layer_name_map = { 'relu': 'ReLU', + 'tanh': 'Tanh', 'leaky_relu': 'LeakyReLU', 'elu': 'ELU', 'prelu': 'PReLU', diff --git a/test/pytest/test_pytorch_api.py b/test/pytest/test_pytorch_api.py index d49b5b8902..fee7b9a3aa 100644 --- a/test/pytest/test_pytorch_api.py +++ b/test/pytest/test_pytorch_api.py @@ -64,6 +64,7 @@ def test_linear(backend, io_type): "activation_function", [ nn.ReLU(), + nn.Tanh(), nn.LeakyReLU(negative_slope=1.0), nn.ELU(alpha=1.0), nn.PReLU(init=0.25), @@ -102,7 +103,7 @@ def test_activations(activation_function, backend, io_type): assert nNodes - 1 == len(hls_model.get_layers()) - if activation_function.__class__.__name__ == 'ReLU' or activation_function.__class__.__name__ == 'Sigmoid': + if activation_function.__class__.__name__ in ['ReLU', 'Sigmoid', 'Tanh']: assert list(hls_model.get_layers())[2].attributes['class_name'] == 'Activation' elif activation_function.__class__.__name__ == 'Threshold': assert
list(hls_model.get_layers())[2].attributes['class_name'] == 'ThresholdedReLU' @@ -118,6 +119,14 @@ def forward(self, x): return nn.functional.relu(x) +class TanHModel(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return nn.functional.tanh(x) + + class LeakyReLuModel(nn.Module): def __init__(self): super().__init__() @@ -154,6 +163,7 @@ def forward(self, x): "activation_function", [ ReLuModel(), + TanHModel(), LeakyReLuModel(), EluModel(), SigmoidModel(), @@ -172,7 +182,7 @@ def test_activation_functionals(activation_function, backend, io_type): config = config_from_pytorch_model(model, (1,)) fn_name = activation_function.__class__.__name__ - output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_activations_functional_relu_{backend}_{io_type}_{fn_name}') + output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_activations_functional_{fn_name}_{backend}_{io_type}') hls_model = convert_from_pytorch_model(model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type) hls_model.compile() @@ -268,7 +278,7 @@ def test_conv1d(padds, backend, io_type): act_index = 2 assert list(hls_model.get_layers())[conv_index].attributes['name'] == convNode.name assert list(hls_model.get_layers())[conv_index].attributes['class_name'] == 'Conv1D' - assert list(hls_model.get_layers())[act_index].attributes['activation'] == class_object_relu.__class__.__name__ + assert list(hls_model.get_layers())[act_index].attributes['activation'] == class_object_relu.__class__.__name__.lower() if io_type == "io_stream" and (backend == "Vivado" or backend == "Vitis") and padds == 1: assert list(hls_model.get_layers())[conv_index].attributes["in_width"] == size_in + 2 else: @@ -412,7 +422,9 @@ def test_conv2d(padds, backend, io_type): act_index = 2 assert list(hls_model.get_layers())[conv_index].attributes['name'] == convNode.name assert list(hls_model.get_layers())[conv_index].attributes['class_name'] == 'Conv2D' - assert list(hls_model.get_layers())[act_index].attributes['activation'] == class_object_relu.__class__.__name__ + assert ( + list(hls_model.get_layers())[act_index].attributes['activation'] == class_object_relu.__class__.__name__.lower() + ) assert list(hls_model.get_layers())[conv_index].attributes["in_width"] == size_in_width assert list(hls_model.get_layers())[conv_index].attributes["in_height"] == size_in_height assert list(hls_model.get_layers())[conv_index].attributes['filt_width'] == class_object_conv.kernel_size[1] From 5d0bdb5cde0e59348f3a68b4a41579ac87e6db7e Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 11 Sep 2024 11:56:48 -0500 Subject: [PATCH 139/272] make auto the default for layer config (#1016) * make auto the default for layers * add max_precision, not currently used * add maximum precision in standard precision inference * minimal handling of other types in infer_precision (e.g. 
for binary) * add more checks for max precision * fix the incorrect setting of reuse factors * update tests to pass backend to config_from_* * fix parameters syntax error introduced in pytest commit * add basic type inference for embedding * add placeholder precision inference for rnn * fix syntax error in test_qkeras * fix up test_trace * don't pass auto in test_attributes * update documentation * update documentation (2) * move some optimizers before inferring precision type * move up the channels_last_converter * put missing precision_merge logic in infer_precision and delete, reorder optimizers * add type inference to catapult --------- Co-authored-by: Vladimir --- docs/api/configuration.rst | 38 ++- docs/setup.rst | 2 +- docs/status.rst | 2 +- hls4ml/backends/catapult/catapult_backend.py | 1 + hls4ml/model/optimizer/__init__.py | 15 +- .../model/optimizer/passes/infer_precision.py | 313 +++++++++++++----- .../model/optimizer/passes/precision_merge.py | 40 --- hls4ml/utils/config.py | 16 +- test/pytest/test_batchnorm.py | 4 +- test/pytest/test_batchnorm_pytorch.py | 2 +- test/pytest/test_binary_cnn.py | 4 +- test/pytest/test_causalpadding.py | 4 +- test/pytest/test_clone_flatten.py | 4 +- test/pytest/test_cnn_mnist_qkeras.py | 2 +- test/pytest/test_embed.py | 4 +- test/pytest/test_garnet.py | 4 +- test/pytest/test_globalpooling.py | 8 +- test/pytest/test_keras_api.py | 6 +- .../test_optimization/test_attributes.py | 6 + test/pytest/test_pointwiseconv.py | 2 +- test/pytest/test_pooling.py | 8 +- test/pytest/test_qkeras.py | 38 ++- test/pytest/test_rnn.py | 9 +- test/pytest/test_softmax.py | 4 +- test/pytest/test_softsign.py | 2 +- test/pytest/test_trace.py | 4 +- test/pytest/test_transpose_concat.py | 2 +- test/pytest/test_upsampling.py | 4 +- test/pytest/test_zeropadding.py | 4 +- 29 files changed, 363 insertions(+), 189 deletions(-) delete mode 100644 hls4ml/model/optimizer/passes/precision_merge.py diff --git a/docs/api/configuration.rst b/docs/api/configuration.rst index 091f88e619..72d677d196 100644 --- a/docs/api/configuration.rst +++ b/docs/api/configuration.rst @@ -9,6 +9,7 @@ We currently support two ways of setting hls4ml's model configuration. This page .. contents:: \ +The Python API approach is recommended for most users as there are more utilities to help create the configuration dictionaries. **NOTE:** @@ -16,8 +17,10 @@ We currently support two ways of setting hls4ml's model configuration. This page * One important part of ``hls4ml`` to remember is that the user is responsible for the format of the inputs. There is no automatic formatting or normalization so this must be done in the training. -* +.. + * For developers, you might also want to checkout this section: `Detailed configuration in converted hls codes <#detailed-configuration-in-converted-hls-codes>`_. + *Broken link* ---- @@ -31,11 +34,26 @@ Using hls4ml, you can quickly generate a simple configuration dictionary from a import hls4ml config = hls4ml.utils.config_from_keras_model(model, granularity='model') -For more advanced and detailed configuration, you can also set them through the created dictionary. For example, to change the reuse factor: +This Python dictionary can be edited as needed. A more advanced configuration can be generated by, for example: + +.. code-block:: python + + import hls4ml + config = hls4ml.utils.config_from_keras_model( + model, + granularity='name', + default_precision='fixed<16,6>', + backend='Vitis') + +This will include per-layer configuration based on the model.
Including the backend is recommended because some configuration options depend on the backend. Note that the precisions at the +higher granularities usually default to 'auto', which means that ``hls4ml`` will try to set them automatically. Note that higher granularity settings take precedence +over model-level settings. See :py:class:`~hls4ml.utils.config.config_from_keras_model` for more information on the various options. + +One can override specific values before using the configuration: .. code-block:: python - config['Model']['ReuseFactor'] = 2 + config['LayerName']['fc1']['ReuseFactor'] = 2 Or to set the precision of a specific layer's weight: @@ -45,6 +63,20 @@ Or to set the precision of a specific layer's weight: To better understand how the configuration hierachy works, refer to the next section for more details. +Finally, one then uses the configuration to create an hls model: + +.. code-block:: python + + hls_model = hls4ml.converters.convert_from_keras_model( + model, + hls_config=config, + output_dir="my_project_dir", + io_type='io_stream', + backend='Vitis' + ) + +See :py:class:`~hls4ml.converters.convert_from_keras_model` for more information on the various options. + ---- 2. YAML Configuration file diff --git a/docs/setup.rst b/docs/setup.rst index f99b2f2dcb..a735281c3f 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -57,7 +57,7 @@ To run FPGA synthesis, installation of following tools is required: * Xilinx Vivado HLS 2018.2 to 2020.1 for synthesis for Xilinx FPGAs - * Vitis HLS 2022.1 or newer is required for synthesis for Xilinx FPGAs using the experimental ``Vitis`` backend. + * Vitis HLS 2022.2 or newer is required for synthesis for Xilinx FPGAs using the ``Vitis`` backend. * Intel Quartus 20.1 to 21.4 for the synthesis for Intel FPGAs diff --git a/docs/status.rst b/docs/status.rst index e4cac5e735..4ff4d33282 100644 --- a/docs/status.rst +++ b/docs/status.rst @@ -81,7 +81,7 @@ Other feature notes: * ``hls4ml`` is tested on Linux, and supports * Vivado HLS versions 2018.2 to 2020.1 * Intel HLS versions 20.1 to 21.4 - * Vitis HLS versions 2020.2 to 2022.2 (experimentally) + * Vitis HLS versions 2022.2 to 2024.1 * Windows and macOS are not supported * BDT support has moved to the `Conifer `__ package diff --git a/hls4ml/backends/catapult/catapult_backend.py b/hls4ml/backends/catapult/catapult_backend.py index 0583e80dab..d939e1f30b 100644 --- a/hls4ml/backends/catapult/catapult_backend.py +++ b/hls4ml/backends/catapult/catapult_backend.py @@ -110,6 +110,7 @@ def _register_flows(self): 'catapult:inplace_stream_flatten', 'catapult:skip_softmax', 'catapult:fix_softmax_table_size', + 'infer_precision_types', ] optimization_flow = register_flow('optimize', optimization_passes, requires=[init_flow], backend=self.name) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index de3dffb46c..77e38b0c5b 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -33,9 +33,8 @@ register_flow( 'convert', [ - 'seperable_to_depthwise_and_conv', # has to be before precision inference - 'infer_precision_types', 'channels_last_converter', + 'seperable_to_depthwise_and_conv', 'remove_transpose_before_flatten', 'remove_nop_transpose', 'remove_single_channel_transpose', @@ -45,19 +44,17 @@ 'qkeras_factorize_alpha', 'extract_ternary_threshold', 'fuse_consecutive_batch_normalization', + 'fuse_batch_normalization', 'replace_multidimensional_dense_with_conv', 'enforce_proxy_model_embedded_config', + 'eliminate_linear_activation', + #
+        # many of the above optimizers need to be done before this
+        'infer_precision_types',
     ],
 )

 # TODO Maybe not all QKeras optmizers belong here?
 register_flow(
     'optimize',
-    [
-        'eliminate_linear_activation',
-        'fuse_consecutive_batch_normalization',
-        'fuse_batch_normalization',
-        'infer_precision_types',
-        'set_precision_concat',
-    ],
+    [],
     requires=['convert'],
 )
diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py
index 256e8a8152..bb24f2206e 100644
--- a/hls4ml/model/optimizer/passes/infer_precision.py
+++ b/hls4ml/model/optimizer/passes/infer_precision.py
@@ -1,9 +1,17 @@
 import math
+from typing import Iterable

 import numpy as np

 from hls4ml.model.optimizer import ConfigurableOptimizerPass
-from hls4ml.model.types import FixedPrecisionType, UnspecifiedPrecisionType
+from hls4ml.model.types import (
+    FixedPrecisionType,
+    IntegerPrecisionType,
+    PrecisionType,
+    RoundingMode,
+    SaturationMode,
+    UnspecifiedPrecisionType,
+)

 # TODO: The code assumes everything is Fixed or Integer precision. Need to add checks

@@ -70,6 +78,12 @@ def _infer_precision(self, node, types_to_infer):
         if node_class in ['Dot']:
             return self._infer_dot_precision(node, types_to_infer)

+        if node_class in ['Embedding']:
+            return self._infer_embedding_precision(node, types_to_infer)
+
+        if node_class in ['SimpleRNN', 'LSTM', 'GRU']:
+            return self._infer_rnn_precision(node, types_to_infer)
+
         # What about quantized activation layer? Setting it to 'auto' manually will break it here. We should prevent
         # this in config_from_* functions

@@ -79,6 +93,20 @@ def _get_default_precision(self, node):
         model_config = node.model.config
         return model_config.backend.convert_precision_string(model_config.model_precision['default'])

+    def _get_maximum_precision(self, node):
+        model_config = node.model.config
+        if 'maximum' in model_config.model_precision:
+            return model_config.backend.convert_precision_string(model_config.model_precision['maximum'])
+        else:
+            return None
+
+    def _all_supported_types(self, types: Iterable[PrecisionType]):
+        """Are all the types supported for inference--currently Integer or Fixed"""
+        for tp in types:
+            if not isinstance(tp, (IntegerPrecisionType, FixedPrecisionType)):
+                return False
+        return True
+
     def _infer_default_type(self, node, type_name):
         model_config = node.model.config
         default_precision = model_config.backend.convert_precision_string(model_config.model_precision['default'])
@@ -99,9 +127,6 @@ def _infer_common_precision(self, node, types_to_infer, n_ops):
         inferred_types = []

         input_precision = node.get_input_variable().type.precision
-        input_width = input_precision.width
-        input_integers = input_precision.integer
-        input_signed = input_precision.signed

         if 'weight_t' in types_to_infer:
             weight_quantizer = node.get_attr('weight_quantizer', None)
@@ -113,10 +138,6 @@
             node.weights['weight'].update_precision(node.types['weight_t'].precision)
             inferred_types.append('weight_t')

-        weight_width = node.types['weight_t'].precision.width
-        weight_integers = node.types['weight_t'].precision.integer
-        weight_signed = node.types['weight_t'].precision.signed
-
         if 'bias_t' in types_to_infer:
             bias_quantizer = node.get_attr('bias_quantizer', None)
             if bias_quantizer is not None:
@@ -127,25 +148,42 @@
             node.weights['bias'].update_precision(node.types['bias_t'].precision)
             inferred_types.append('bias_t')

-        bias_width = node.types['bias_t'].precision.width
-        bias_integers = node.types['bias_t'].precision.integer
-        bias_signed = node.types['bias_t'].precision.signed
-        no_bias = node.weights['bias'].nonzeros == 0 and self.infer_no_bias  # no bias
+        if self._all_supported_types((input_precision, node.types['weight_t'].precision, node.types['bias_t'].precision)):
+            input_width = input_precision.width
+            input_integers = input_precision.integer
+            input_signed = input_precision.signed

-        # using math.ceil instead of np.ceil because it returns an int
-        bitwidth = weight_width + input_width + math.ceil(np.log2(n_ops))
-        integers = weight_integers + input_integers + math.ceil(np.log2(n_ops))
-        signed = weight_signed or input_signed
+            weight_width = node.types['weight_t'].precision.width
+            weight_integers = node.types['weight_t'].precision.integer
+            weight_signed = node.types['weight_t'].precision.signed

-        frac = bitwidth - integers
+            bias_width = node.types['bias_t'].precision.width
+            bias_integers = node.types['bias_t'].precision.integer
+            bias_signed = node.types['bias_t'].precision.signed
+            no_bias = node.weights['bias'].nonzeros == 0 and self.infer_no_bias  # no bias
+
+            # using math.ceil instead of np.ceil because it returns an int
+            bitwidth = weight_width + input_width + math.ceil(np.log2(n_ops))
+            integers = weight_integers + input_integers + math.ceil(np.log2(n_ops))
+            signed = weight_signed or input_signed
+
+            frac = bitwidth - integers

-        if not no_bias:
-            integers = max(integers + (bias_signed and not signed), bias_integers + (signed and not bias_signed)) + 1
-            bitwidth = integers + max(frac, bias_width - bias_integers)
-            signed = signed or bias_signed
+            if not no_bias:
+                integers = max(integers + (bias_signed and not signed), bias_integers + (signed and not bias_signed)) + 1
+                bitwidth = integers + max(frac, bias_width - bias_integers)
+                signed = signed or bias_signed

-        # Note: this is guaranteed to not overflow or need rounding, so it's sufficient to use the simpler form.
-        new_type = FixedPrecisionType(bitwidth, integers, signed)
+            # if max_precision is specified, limit the size to be less than max precision
+            max_precision = self._get_maximum_precision(node)
+            if max_precision is not None:
+                bitwidth = min(bitwidth, max_precision.width)
+                integers = min(integers, max_precision.integer)
+
+            # Note: this is guaranteed to not overflow or need rounding, so it's sufficient to use the simpler form.
+            new_type = FixedPrecisionType(bitwidth, integers, signed)
+        else:
+            new_type = self._get_default_precision(node)

         if 'accum_t' in types_to_infer:
             node.types['accum_t'].name = node.name + '_accum_t'
@@ -173,6 +211,8 @@ def _infer_depthconv_precision(self, node, types_to_infer):
         n_ops = node.get_attr('filt_height', 1) * node.get_attr('filt_width')
         return self._infer_common_precision(node, types_to_infer, n_ops)

+    # This function should generally not be called because we split sepconv to depthwise and regular (pointwise).
+    # It has not been updated.
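+    # (That split is performed by the 'seperable_to_depthwise_and_conv' pass registered in the convert flow.)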
     def _infer_sepconv_precision(self, node, types_to_infer):
         inferred_types = []

@@ -272,24 +312,35 @@ def _infer_bn_precision(self, node, types_to_infer):
         scale_precision = node.types['scale_t'].precision
         bias_precision = node.types['bias_t'].precision

-        after_scale_signed = scale_precision.signed or input_precision.signed
-        after_scale_width = input_precision.width + scale_precision.width
-        after_scale_integer = input_precision.integer + scale_precision.integer
+        if self._all_supported_types((input_precision, scale_precision, bias_precision)):
+
+            after_scale_signed = scale_precision.signed or input_precision.signed
+            after_scale_width = input_precision.width + scale_precision.width
+            after_scale_integer = input_precision.integer + scale_precision.integer

-        out_precision_signed = after_scale_signed or bias_precision.signed
-        out_precision_integer = (
-            max(
-                after_scale_integer + (bias_precision.signed and not after_scale_signed),
-                bias_precision.integer + (after_scale_signed and not bias_precision.signed),
+            out_precision_signed = after_scale_signed or bias_precision.signed
+            out_precision_integer = (
+                max(
+                    after_scale_integer + (bias_precision.signed and not after_scale_signed),
+                    bias_precision.integer + (after_scale_signed and not bias_precision.signed),
+                )
+                + 1
+            )
+            out_precision_width = out_precision_integer + max(
+                after_scale_width - after_scale_integer, bias_precision.fractional
             )
-            + 1
-        )
-        out_precision_width = out_precision_integer + max(
-            after_scale_width - after_scale_integer, bias_precision.fractional
-        )

-        # Note: this is guaranteed to not overflow or need rounding, so it's sufficient to use the simpler form.
-        out_precision = FixedPrecisionType(out_precision_width, out_precision_integer, out_precision_signed)
+            # if max_precision is specified, limit the size to be less than max precision
+            max_precision = self._get_maximum_precision(node)
+            if max_precision is not None:
+                out_precision_width = min(out_precision_width, max_precision.width)
+                out_precision_integer = min(out_precision_integer, max_precision.integer)
+
+            # Note: this is guaranteed to not overflow or need rounding, so it's sufficient to use the simpler form.
+            out_precision = FixedPrecisionType(out_precision_width, out_precision_integer, out_precision_signed)
+
+        else:
+            out_precision = self._get_default_precision(node)

         node.types['result_t'].name = node.name + '_result_t'
         node.types['result_t'].precision = out_precision
@@ -305,20 +356,29 @@ def _infer_pooling_precision(self, node, types_to_infer):
         input_precision = node.get_input_variable().type.precision
         pool_op = node.attributes['pool_op'].lower()

-        width = input_precision.width
-        integer = input_precision.integer
-        signed = input_precision.signed
+        if pool_op == 'max':
+            # This has the benefit of working for xnor types. I don't think "copy" is needed
+            accum_type = input_precision
+
+        elif pool_op == 'average':
+            if self._all_supported_types((input_precision,)):
+                width = input_precision.width
+                integer = input_precision.integer
+                signed = input_precision.signed
+
+                pool_size = node.get_attr('pool_height', 1) * node.get_attr('pool_width')
+                extra_bits = int(np.ceil(np.log2(pool_size)))
+
+                # for now ignore max precision in this case
+                accum_type = FixedPrecisionType(
+                    width=width + extra_bits * 2, integer=integer + extra_bits, signed=signed
+                )
+            else:
+                accum_type = self._get_default_precision(node)

-        pool_size = node.get_attr('pool_height', 1) * node.get_attr('pool_width')
-        if pool_op == 'average':
-            extra_bits = int(np.ceil(np.log2(pool_size)))
-        elif pool_op == 'max':
-            extra_bits = 0
         else:
             raise ValueError(f'Unknown pooling operation: {pool_op}')

-        accum_type = FixedPrecisionType(width=width + extra_bits * 2, integer=integer + extra_bits, signed=signed)
-
         node.types['accum_t'].name = node.name + '_accum_t'
         node.types['accum_t'].precision = accum_type

@@ -338,22 +398,76 @@ def _infer_merge_precision(self, node, types_to_infer):
         op = node.get_attr('op').lower()
         if op in ('add', 'subtract', 'average'):
-            new_signed = input_1.signed or input_2.signed or op == 'subtract'
-            new_int = (
-                max(
-                    input_1.integer + (input_2.signed and not input_1.signed),
-                    input_2.integer + (input_1.signed and not input_2.signed),
+            if self._all_supported_types((input_1, input_2)):
+                new_signed = input_1.signed or input_2.signed or op == 'subtract'
+                new_int = (
+                    max(
+                        input_1.integer + (input_2.signed and not input_1.signed),
+                        input_2.integer + (input_1.signed and not input_2.signed),
+                    )
+                    + 1
                 )
-                + 1
-            )
-            new_width = new_int + max(input_1.fractional, input_2.fractional)
-            out_precision = FixedPrecisionType(new_width, new_int, new_signed)
+                new_width = new_int + max(input_1.fractional, input_2.fractional)
+                max_precision = self._get_maximum_precision(node)
+                if max_precision is not None:
+                    new_width = min(new_width, max_precision.width)
+                    new_int = min(new_int, max_precision.integer)
+                out_precision = FixedPrecisionType(new_width, new_int, new_signed)
+            else:
+                out_precision = self._get_default_precision(node)
         elif op == 'multiply':
-            new_signed = input_1.signed or input_2.signed
-            new_int = input_1.integer + input_2.integer
-            new_width = input_1.width + input_2.width
-            out_precision = FixedPrecisionType(new_width, new_int, new_signed)
+            if self._all_supported_types((input_1, input_2)):
+                new_signed = input_1.signed or input_2.signed
+                new_int = input_1.integer + input_2.integer
+                new_width = input_1.width + input_2.width
+                # if max_precision is specified, limit the size to be less than max precision
+                max_precision = self._get_maximum_precision(node)
+                if max_precision is not None:
+                    new_width = min(new_width, max_precision.width)
+                    new_int = min(new_int, max_precision.integer)
+                out_precision = FixedPrecisionType(new_width, new_int, new_signed)
+            else:
+                out_precision = self._get_default_precision(node)
         elif op in ('maximum', 'minimum'):
+            if input_1 == input_2:
+                # can handle binary and potentially others
+                out_precision = input_1  # I assume copy is not necessary
+            elif self._all_supported_types((input_1, input_2)):
+                new_signed = input_1.signed or input_2.signed
+
+                input_1_integer = input_1.integer
+                input_2_integer = input_2.integer
+
+                # add one to integer if unsigned while new is signed
+                if new_signed and not input_1.signed:
+                    input_1_integer += 1
+                if new_signed and not input_2.signed:
+                    input_2_integer += 1
+
+                new_width = max(input_1.fractional, input_2.fractional) + max(input_1_integer, input_2_integer)
+                new_int = max(input_1_integer, input_2_integer)
+                out_precision = FixedPrecisionType(new_width, new_int, new_signed)
+            else:
+                out_precision = self._get_default_precision(node)
+        else:
+            print(f'Warning: not propagating weights for type {op}')
+            out_precision = self._get_default_precision(node)
+
+        node.types['result_t'].name = node.name + '_result_t'
+        node.types['result_t'].precision = out_precision
+
+        return ['result_t']
+
+    def _infer_cat_precision(self, node, types_to_infer):
+        assert 'result_t' in types_to_infer and len(types_to_infer) == 1
+
+        input_1 = node.get_input_variable(node.inputs[0]).type.precision
+        input_2 = node.get_input_variable(node.inputs[1]).type.precision
+
+        if input_1 == input_2:
+            # can handle binary and potentially others
+            out_precision = input_1  # I assume copy is not necessary
+        elif self._all_supported_types((input_1, input_2)):
         new_signed = input_1.signed or input_2.signed

         input_1_integer = input_1.integer
@@ -367,9 +481,20 @@ def _infer_merge_precision(self, node, types_to_infer):
         new_width = max(input_1.fractional, input_2.fractional) + max(input_1_integer, input_2_integer)
         new_int = max(input_1_integer, input_2_integer)
-        out_precision = FixedPrecisionType(new_width, new_int, new_signed)
+
+        # if max_precision is specified, limit the size to be less than max precision
+        max_precision = self._get_maximum_precision(node)
+        if max_precision is not None:
+            new_width = min(new_width, max_precision.width)
+            new_int = min(new_int, max_precision.integer)
+
+        # some logic copied from former SetPrecisionConcat optimizer
+        newrmode = input_1.rounding_mode if input_1.rounding_mode != RoundingMode.TRN else input_2.rounding_mode
+        newsmode = input_1.saturation_mode if input_1.saturation_mode != SaturationMode.WRAP else input_2.saturation_mode
+        newsbits = input_1.saturation_bits if input_1.saturation_bits != 0 else input_2.saturation_bits
+
+        out_precision = FixedPrecisionType(new_width, new_int, new_signed, newrmode, newsmode, newsbits)
         else:
-            print(f'Warning: not propagating weights for type {op}')
             out_precision = self._get_default_precision(node)

         node.types['result_t'].name = node.name + '_result_t'
         node.types['result_t'].precision = out_precision

         return ['result_t']

-    def _infer_cat_precision(self, node, types_to_infer):
+    def _infer_dot_precision(self, node, types_to_infer):
         assert 'result_t' in types_to_infer and len(types_to_infer) == 1

         input_1 = node.get_input_variable(node.inputs[0]).type.precision
         input_2 = node.get_input_variable(node.inputs[1]).type.precision

-        new_signed = input_1.signed or input_2.signed
-
-        input_1_integer = input_1.integer
-        input_2_integer = input_2.integer
+        if self._all_supported_types((input_1, input_2)):
+            n_in = node.get_input_variable(node.inputs[0]).shape[0]

-        # add one to integer if unsigned while new is signed
-        if new_signed and not input_1.signed:
-            input_1_integer += 1
-        if new_signed and not input_2.signed:
-            input_2_integer += 1
+            new_signed = input_1.signed or input_2.signed
+            new_width = input_1.width + input_2.width + math.ceil(np.log2(n_in))
+            new_int = input_1.integer + input_2.integer + math.ceil(np.log2(n_in))

-        new_width = max(input_1.fractional, input_2.fractional) + max(input_1_integer, input_2_integer)
-        new_int = max(input_1_integer, input_2_integer)
+            # if max_precision is specified, limit the size to be less than max precision
+            max_precision =
self._get_maximum_precision(node) + if max_precision is not None: + new_width = min(new_width, max_precision.width) + new_int = min(new_int, max_precision.integer) - out_precision = FixedPrecisionType(new_width, new_int, new_signed) + out_precision = FixedPrecisionType(new_width, new_int, new_signed) + else: + out_precision = self._get_default_precision(node) node.types['result_t'].name = node.name + '_result_t' node.types['result_t'].precision = out_precision return ['result_t'] - def _infer_dot_precision(self, node, types_to_infer): - assert 'result_t' in types_to_infer and len(types_to_infer) == 1 + def _infer_embedding_precision(self, node, types_to_infer): + inferred_types = [] - input_1 = node.get_input_variable(node.inputs[0]).type.precision - input_2 = node.get_input_variable(node.inputs[1]).type.precision + if 'embeddings_t' in types_to_infer: + self._infer_default_type(node, 'embeddings_t') + node.weights['embeddings'].update_precision(node.types['embeddings_t'].precision) + inferred_types.append('embeddings_t') - n_in = node.get_input_variable(node.inputs[0]).shape[0] + if 'result_t' in types_to_infer: + out_precision = self._get_default_precision(node) + node.types['result_t'].name = node.name + '_result_t' + node.types['result_t'].precision = out_precision + inferred_types.append('result_t') - new_signed = input_1.signed or input_2.signed - new_width = input_1.width + input_2.width + math.ceil(np.log2(n_in)) - new_int = input_1.integer + input_2.integer + math.ceil(np.log2(n_in)) + return inferred_types - out_precision = FixedPrecisionType(new_width, new_int, new_signed) - node.types['result_t'].name = node.name + '_result_t' - node.types['result_t'].precision = out_precision + # TODO: This is just a placeholder + def _infer_rnn_precision(self, node, types_to_infer): + inferred_types = [] - return ['result_t'] + # for now just do the weights and leave the rest for the default catch + for weightvar in ('weight', 'bias', 'recurrent_weight', 'recurrent_bias'): + if f'{weightvar}_t' in types_to_infer: + self._infer_default_type(node, f'{weightvar}_t') + node.weights[weightvar].update_precision(node.types[f'{weightvar}_t'].precision) + inferred_types.append(f'{weightvar}_t') + + return inferred_types diff --git a/hls4ml/model/optimizer/passes/precision_merge.py b/hls4ml/model/optimizer/passes/precision_merge.py deleted file mode 100644 index 9e79b11000..0000000000 --- a/hls4ml/model/optimizer/passes/precision_merge.py +++ /dev/null @@ -1,40 +0,0 @@ -from hls4ml.model.optimizer import OptimizerPass -from hls4ml.model.types import FixedPrecisionType, RoundingMode, SaturationMode - - -def get_concat_type(itype1, itype2): - newwidth = max(itype1.width, itype2.width) - newint = max(itype1.integer, itype2.integer) - if itype1.signed ^ itype2.signed: # XOR - newint += 1 - newwidth += 1 - newrmode = itype1.rounding_mode if itype1.rounding_mode != RoundingMode.TRN else itype2.rounding_mode - newsmode = itype1.saturation_mode if itype1.saturation_mode != SaturationMode.WRAP else itype2.saturation_mode - newsbits = itype1.saturation_bits if itype1.saturation_bits != 0 else itype2.saturation_bits - - newtype = FixedPrecisionType(newwidth, newint, itype1.signed or itype2.signed, newrmode, newsmode, newsbits) - return newtype - - -class SetPrecisionConcat(OptimizerPass): - def match(self, node): - if node.__class__.__name__ == 'Concatenate': - otype = node.get_output_variable().type.precision - itype1 = node.get_input_variable(node.inputs[0]).type.precision - itype2 = 
node.get_input_variable(node.inputs[1]).type.precision - if isinstance(otype, FixedPrecisionType) and otype != get_concat_type(itype1, itype2): - return True - return False - - def transform(self, model, node): - """ - Set concat output precision - """ - otype = node.get_output_variable().type.precision - itype1 = node.get_input_variable(node.inputs[0]).type.precision - itype2 = node.get_input_variable(node.inputs[1]).type.precision - newtype = get_concat_type(itype1, itype2) - print(f"Found {node.name} in the model, optimizing {otype} to {newtype}...") - node.get_output_variable().type.precision = newtype - - return True diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py index 6cba033de2..5cd17d02e9 100644 --- a/hls4ml/utils/config.py +++ b/hls4ml/utils/config.py @@ -112,7 +112,7 @@ def _get_precision_from_quantizer(quantizer): def config_from_keras_model( - model, granularity='model', backend=None, default_precision='fixed<16,6>', default_reuse_factor=1 + model, granularity='model', backend=None, default_precision='fixed<16,6>', default_reuse_factor=1, max_precision=None ): """Create an HLS conversion config given the Keras model. @@ -132,8 +132,11 @@ def config_from_keras_model( will generate config keys for every layer separately, allowing for highly specific configuration tweaks. backend(str, optional): Name of the backend to use - default_precision (str, optional): Default precision to use. Defaults to 'fixed<16,6>'. + default_precision (str, optional): Default precision to use. Defaults to 'fixed<16,6>'. Note, this must + be an explicit precision: 'auto' is not allowed. default_reuse_factor (int, optional): Default reuse factor. Defaults to 1. + max_precision (str or None, optional): Maximum width precision to use. Defaults to None, meaning no maximum. + Note: Only integer and fixed precisions are supported Raises: Exception: If Keras model has layers not supported by hls4ml. 
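+
+    Example::
+
+        # a sketch of using the new max_precision cap; 'fixed<24,8>' is an
+        # arbitrary illustrative value, not a required setting
+        config = hls4ml.utils.config_from_keras_model(
+            model, granularity='name', backend='Vitis', max_precision='fixed<24,8>'
+        )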
@@ -182,9 +185,11 @@ def make_layer_config(layer): if name.endswith('_t'): name = name[:-2] if attr.default is None: - precision_cfg[name] = default_precision + precision_cfg[name] = 'auto' else: precision_cfg[name] = str(attr.default) + elif attr.name == 'reuse_factor': + layer_config[attr.config_name] = default_reuse_factor else: if attr.default is not None: layer_config[attr.config_name] = attr.default @@ -238,7 +243,10 @@ def make_layer_config(layer): config = {} model_config = {} - model_config['Precision'] = default_precision + model_config['Precision'] = {} + model_config['Precision']['default'] = default_precision + if max_precision is not None: + model_config['Precision']['maximum'] = max_precision model_config['ReuseFactor'] = default_reuse_factor model_config['Strategy'] = 'Latency' model_config['BramFactor'] = 1_000_000_000 diff --git a/test/pytest/test_batchnorm.py b/test/pytest/test_batchnorm.py index 727d2ee574..15774fa395 100644 --- a/test/pytest/test_batchnorm.py +++ b/test/pytest/test_batchnorm.py @@ -36,7 +36,9 @@ def test_batchnorm(model, data, backend, io_type): center = model.layers[0].center scale = model.layers[0].scale - config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision, granularity='name') + config = hls4ml.utils.config_from_keras_model( + model, default_precision=default_precision, granularity='name', backend=backend + ) output_dir = str(test_root_path / f'hls4mlprj_batchnorm_{backend}_{io_type}_center{center}_scale{scale}') hls_model = hls4ml.converters.convert_from_keras_model( model, backend=backend, hls_config=config, io_type=io_type, output_dir=output_dir diff --git a/test/pytest/test_batchnorm_pytorch.py b/test/pytest/test_batchnorm_pytorch.py index fd4efdf326..137aee8a1e 100644 --- a/test/pytest/test_batchnorm_pytorch.py +++ b/test/pytest/test_batchnorm_pytorch.py @@ -40,7 +40,7 @@ def test_batchnorm(data, backend, io_type): default_precision = 'ac_fixed<32, 1, true>' if backend == 'Quartus' else 'ac_fixed<32, 1>' config = hls4ml.utils.config_from_pytorch_model( - model, (in_shape,), default_precision=default_precision, granularity='name' + model, (in_shape,), default_precision=default_precision, granularity='name', backend=backend ) output_dir = str(test_root_path / f'hls4mlprj_batchnorm_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_pytorch_model( diff --git a/test/pytest/test_binary_cnn.py b/test/pytest/test_binary_cnn.py index 40af056df9..c1fa1b1551 100644 --- a/test/pytest/test_binary_cnn.py +++ b/test/pytest/test_binary_cnn.py @@ -66,7 +66,9 @@ def test_binary_cnn(backend, io_type, strategy): model2.summary() - hls_config = hls4ml.utils.config_from_keras_model(model2, granularity='name', default_precision='fixed<32,12>') + hls_config = hls4ml.utils.config_from_keras_model( + model2, granularity='name', default_precision='fixed<32,12>', backend=backend + ) hls_config['Model']['Strategy'] = strategy # hls_config['LayerName']['q_dense_7_softmax']['Implementation'] = 'legacy' diff --git a/test/pytest/test_causalpadding.py b/test/pytest/test_causalpadding.py index c076c99987..d91da35fac 100644 --- a/test/pytest/test_causalpadding.py +++ b/test/pytest/test_causalpadding.py @@ -23,7 +23,9 @@ def test_causalpadding(io_type, backend): data = np.expand_dims(data, axis=0) data = np.expand_dims(data, axis=-1) - config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32,16>', granularity='name') + config = hls4ml.utils.config_from_keras_model( + model, 
default_precision='ap_fixed<32,16>', granularity='name', backend=backend + ) odir = str(test_root_path / f'hls4mlprj_validpadding_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, io_type=io_type, output_dir=odir, backend=backend diff --git a/test/pytest/test_clone_flatten.py b/test/pytest/test_clone_flatten.py index 5f631d027f..d819af54e7 100644 --- a/test/pytest/test_clone_flatten.py +++ b/test/pytest/test_clone_flatten.py @@ -31,9 +31,7 @@ def keras_model(): @pytest.mark.parametrize('backend', ['Vivado', 'Quartus', 'Catapult']) def hls_model(keras_model, backend, io_type): hls_config = hls4ml.utils.config_from_keras_model( - keras_model, - default_precision='ap_int<6>', - granularity='name', + keras_model, default_precision='ap_int<6>', granularity='name', backend=backend ) output_dir = str(test_root_path / f'hls4mlprj_clone_flatten_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_keras_model( diff --git a/test/pytest/test_cnn_mnist_qkeras.py b/test/pytest/test_cnn_mnist_qkeras.py index b4c28c70d1..38489b5865 100644 --- a/test/pytest/test_cnn_mnist_qkeras.py +++ b/test/pytest/test_cnn_mnist_qkeras.py @@ -58,7 +58,7 @@ def mnist_model(): ) def hls_model(mnist_model, backend, io_type, strategy): keras_model = mnist_model - hls_config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name') + hls_config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name', backend=backend) hls_config['Model']['Strategy'] = strategy hls_config['LayerName']['softmax']['Strategy'] = 'Stable' output_dir = str(test_root_path / f'hls4mlprj_cnn_mnist_qkeras_{backend}_{io_type}_{strategy}') diff --git a/test/pytest/test_embed.py b/test/pytest/test_embed.py index a27fc45b93..c045629a40 100644 --- a/test/pytest/test_embed.py +++ b/test/pytest/test_embed.py @@ -28,7 +28,9 @@ def keras_model(): @pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def hls_model(keras_model, backend, io_type): - hls_config = hls4ml.utils.config_from_keras_model(keras_model, default_precision='ap_fixed<16,6>', granularity='name') + hls_config = hls4ml.utils.config_from_keras_model( + keras_model, default_precision='ap_fixed<16,6>', granularity='name', backend=backend + ) hls_config['LayerName']['embedding_input']['Precision']['result'] = 'ap_uint<4>' out_dir = str(test_root_path / 'hls4mlprj_embed_{}_{}').format(backend, io_type) hls_model = hls4ml.converters.convert_from_keras_model( diff --git a/test/pytest/test_garnet.py b/test/pytest/test_garnet.py index 67ddf77182..62bc82a8c0 100644 --- a/test/pytest/test_garnet.py +++ b/test/pytest/test_garnet.py @@ -33,7 +33,7 @@ def garnet_models(): model = Model(inputs=inputs, outputs=outputs) model.summary() - config = hls4ml.utils.config_from_keras_model(model, granularity='name') + config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend='Vivado') config['Model'] = {} config['Model']['ReuseFactor'] = 1 config['Model']['Strategy'] = 'Latency' @@ -68,7 +68,7 @@ def garnet_stack_models(): model = Model(inputs=inputs, outputs=outputs) model.summary() - config = hls4ml.utils.config_from_keras_model(model, granularity='name') + config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend='Vivado') config['Model'] = {} config['Model']['ReuseFactor'] = 1 config['Model']['Strategy'] = 'Latency' diff --git a/test/pytest/test_globalpooling.py 
b/test/pytest/test_globalpooling.py index b99f0d8212..d0b635595a 100644 --- a/test/pytest/test_globalpooling.py +++ b/test/pytest/test_globalpooling.py @@ -53,7 +53,9 @@ def keras_model_1d(request): def test_global_pool1d(backend, keras_model_1d, data_1d, io_type): model, model_type, keepdims = keras_model_1d - config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32,9>', granularity='name') + config = hls4ml.utils.config_from_keras_model( + model, default_precision='ap_fixed<32,9>', granularity='name', backend=backend + ) hls_model = hls4ml.converters.convert_from_keras_model( model, @@ -108,7 +110,9 @@ def keras_model_2d(request): def test_global_pool2d(backend, keras_model_2d, data_2d, io_type): model, model_type, keepdims = keras_model_2d - config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32,9>', granularity='name') + config = hls4ml.utils.config_from_keras_model( + model, default_precision='ap_fixed<32,9>', granularity='name', backend=backend + ) hls_model = hls4ml.converters.convert_from_keras_model( model, diff --git a/test/pytest/test_keras_api.py b/test/pytest/test_keras_api.py index b9f2d35f1a..6f00b2ec00 100644 --- a/test/pytest/test_keras_api.py +++ b/test/pytest/test_keras_api.py @@ -310,7 +310,9 @@ def test_depthwise2d(backend, io_type): model.add(DepthwiseConv2D(kernel_size=(3, 3), input_shape=(32, 32, 3))) model.compile() - config = hls4ml.utils.config_from_keras_model(model, granularity='name', default_precision='fixed<32,12>') + config = hls4ml.utils.config_from_keras_model( + model, granularity='name', default_precision='fixed<32,12>', backend=backend + ) output_dir = str(test_root_path / f'hls4mlprj_keras_api_depthwiseconv2d_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type @@ -336,7 +338,7 @@ def test_depthwise1d(backend, io_type): model.add(DepthwiseConv1D(kernel_size=3, input_shape=(32, 3))) model.compile() - config = hls4ml.utils.config_from_keras_model(model, granularity='name') + config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend=backend) output_dir = str(test_root_path / f'hls4mlprj_keras_api_depthwiseconv1d_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type diff --git a/test/pytest/test_optimization/test_attributes.py b/test/pytest/test_optimization/test_attributes.py index 2669321e09..3ba8d08d14 100644 --- a/test/pytest/test_optimization/test_attributes.py +++ b/test/pytest/test_optimization/test_attributes.py @@ -38,6 +38,12 @@ def test_attributes(): cfg['Model']['Strategy'] = strategy cfg['LayerName']['dense']['ReuseFactor'] = 1 + # optimization doesn't yet support auto precision + for layer in cfg['LayerName'].values(): + for key, prec in layer['Precision'].items(): + if prec == 'auto': + layer['Precision'][key] = default_precision + # Verify correct information for every layer model_attributes = get_attributes_from_keras_model_and_hls4ml_config(model, cfg) assert len(model_attributes) == 4 diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 060b9877de..d7f9281b38 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -154,7 +154,7 @@ def test_pointwise_config(strategy): model.compile(optimizer='adam', loss='mse') - config = hls4ml.utils.config_from_keras_model(model, 
granularity='name') + config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend='Vivado') config['Model']['Strategy'] = strategy config['LayerName']['conv2d_1x1']['Strategy'] = strategy # Will fail if the strategy is not lowercase output_dir = str(test_root_path / f'hls4mlprj_pointwise2d_config_{strategy}') diff --git a/test/pytest/test_pooling.py b/test/pytest/test_pooling.py index d7de80a5a7..7a10cd2733 100644 --- a/test/pytest/test_pooling.py +++ b/test/pytest/test_pooling.py @@ -53,7 +53,9 @@ def keras_model_1d(request): def test_pool1d(backend, keras_model_1d, data_1d, io_type): model, model_type, padding = keras_model_1d - config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32,9>', granularity='name') + config = hls4ml.utils.config_from_keras_model( + model, default_precision='ap_fixed<32,9>', granularity='name', backend=backend + ) hls_model = hls4ml.converters.convert_from_keras_model( model, @@ -108,7 +110,9 @@ def keras_model_2d(request): def test_pool2d(backend, keras_model_2d, data_2d, io_type): model, model_type, padding = keras_model_2d - config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32,9>', granularity='name') + config = hls4ml.utils.config_from_keras_model( + model, default_precision='ap_fixed<32,9>', granularity='name', backend=backend + ) hls_model = hls4ml.converters.convert_from_keras_model( model, diff --git a/test/pytest/test_qkeras.py b/test/pytest/test_qkeras.py index 45d015807b..a6cdaabcac 100644 --- a/test/pytest/test_qkeras.py +++ b/test/pytest/test_qkeras.py @@ -77,7 +77,7 @@ def convert(load_jettagging_model, strategy): ''' model = load_jettagging_model - config = hls4ml.utils.config_from_keras_model(model, granularity='name') + config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend='Vivado') config['Model']['Strategy'] = strategy config['LayerName']['softmax']['exp_table_t'] = 'ap_fixed<18,8>' config['LayerName']['softmax']['inv_table_t'] = 'ap_fixed<18,4>' @@ -156,7 +156,7 @@ def test_single_dense_activation_exact(randX_100_16, bits, alpha, backend, io_ty model.add(QActivation(activation=quantized_relu(bits, 0), name='relu1')) model.compile() - config = hls4ml.utils.config_from_keras_model(model, granularity='name') + config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend=backend) output_dir = str(test_root_path / f'hls4mlprj_qkeras_single_dense_activation_exact_{bits}_{alpha}_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type @@ -205,7 +205,7 @@ def test_quantizer_special(randX_1000_1, quantizer, backend, io_type): model.add(QActivation(input_shape=(1,), activation=quantizer, name='quantizer')) model.compile() - config = hls4ml.utils.config_from_keras_model(model, granularity='name') + config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend=backend) output_dir = str( test_root_path / f'hls4mlprj_qkeras_quantizer_{quantizer.__class__.__name__}_{quantizer.bits}_{backend}_{io_type}' ) @@ -289,7 +289,7 @@ def test_quantizer(randX_1000_1, quantizer, backend, io_type): model.add(QActivation(input_shape=(1,), activation=quantizer, name='quantizer')) model.compile() - config = hls4ml.utils.config_from_keras_model(model, granularity='name') + config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend=backend) output_dir = str( test_root_path / 
'hls4mlprj_qkeras_quantizer_{}_{}_{}_{}_{}'.format( @@ -328,7 +328,7 @@ def test_relu_negative_slope(randX_1000_1, quantizer, backend, io_type): model.add(QActivation(input_shape=(1,), activation=quantizer, name='quantizer')) model.compile() - config = hls4ml.utils.config_from_keras_model(model, granularity='name') + config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend=backend) output_dir = str( test_root_path / 'hls4mlprj_qkeras_leaky_relu_{}_{}_neg_slope_{}_{}_{}'.format( @@ -373,7 +373,7 @@ def test_qactivation_kwarg(randX_100_10, activation_quantizer, weight_quantizer) )(inputs) model = Model(inputs, outputs) - config = hls4ml.utils.config_from_keras_model(model, granularity='name') + config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend='Vivado') out_dir = str(test_root_path / f'hls4mlprj_qactivation_kwarg_{activation_quantizer}') @@ -418,7 +418,9 @@ def test_quantizer_parsing(randX_100_10, backend, io_type): ) model.compile() - config = hls4ml.utils.config_from_keras_model(model, granularity='name', default_precision='fixed<24,8>') + config = hls4ml.utils.config_from_keras_model( + model, granularity='name', default_precision='fixed<24,8>', backend=backend + ) output_dir = str(test_root_path / f'hls4mlprj_qkeras_quant_parse_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type @@ -459,7 +461,9 @@ def test_qconv2dbn(randX_100_8_8_1, backend, io_type): ) model.compile() - config = hls4ml.utils.config_from_keras_model(model, granularity='name', default_precision='fixed<24,8>') + config = hls4ml.utils.config_from_keras_model( + model, granularity='name', default_precision='fixed<24,8>', backend=backend + ) output_dir = str(test_root_path / f'hls4mlprj_qkeras_qconv2dbn_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type @@ -500,7 +504,9 @@ def test_qdepthwiseconv2d(randX_10_32_32_3, backend, io_type): ) model.compile() - config = hls4ml.utils.config_from_keras_model(model, granularity='name', default_precision='fixed<24,8>') + config = hls4ml.utils.config_from_keras_model( + model, granularity='name', default_precision='fixed<24,8>', backend=backend + ) output_dir = str(test_root_path / f'hls4mlprj_qkeras_qdepthwiseconv2d_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type @@ -538,7 +544,7 @@ def test_quantised_po2_bit_width(backend, io_type, strategy): y_keras = keras_model.predict(X) hls_config = hls4ml.utils.config_from_keras_model( - keras_model, granularity='name', default_precision='ap_fixed<64, 32>', default_reuse_factor=1 + keras_model, granularity='name', default_precision='ap_fixed<64, 32>', default_reuse_factor=1, backend=backend ) hls_config['Model']['Strategy'] = strategy output_dir = str(test_root_path / f'hls4mlprj_qkeras_quantised_po2_{backend}_{io_type}_{strategy}') @@ -573,7 +579,9 @@ def test_qsimplernn(backend): ) model.compile() - config = hls4ml.utils.config_from_keras_model(model, granularity='name', default_precision="ap_fixed<16,1>") + config = hls4ml.utils.config_from_keras_model( + model, granularity='name', default_precision="ap_fixed<16,1>", backend=backend + ) output_dir = str(test_root_path / f'hls4mlprj_qkeras_qsimplernn_{backend}') hls_model = 
hls4ml.converters.convert_from_keras_model(model, hls_config=config, output_dir=output_dir, backend=backend) hls_model.compile() @@ -607,7 +615,9 @@ def test_qlstm(backend): ) model.compile() - config = hls4ml.utils.config_from_keras_model(model, granularity='name', default_precision="ap_fixed<8,1>") + config = hls4ml.utils.config_from_keras_model( + model, granularity='name', default_precision="ap_fixed<8,1>", backend=backend + ) output_dir = str(test_root_path / f'hls4mlprj_qkeras_qsimplernn_{backend}') hls_model = hls4ml.converters.convert_from_keras_model(model, hls_config=config, output_dir=output_dir, backend=backend) hls_model.compile() @@ -642,7 +652,9 @@ def test_qgru(backend): ) model.compile() - config = hls4ml.utils.config_from_keras_model(model, granularity='name', default_precision="ap_fixed<8,1>") + config = hls4ml.utils.config_from_keras_model( + model, granularity='name', default_precision="ap_fixed<8,1>", backend=backend + ) output_dir = str(test_root_path / f'hls4mlprj_qkeras_qsimplernn_{backend}') hls_model = hls4ml.converters.convert_from_keras_model(model, hls_config=config, output_dir=output_dir, backend=backend) hls_model.compile() diff --git a/test/pytest/test_rnn.py b/test/pytest/test_rnn.py index 3e6e978011..dc991f7f55 100644 --- a/test/pytest/test_rnn.py +++ b/test/pytest/test_rnn.py @@ -25,7 +25,7 @@ def test_rnn_parsing(rnn_layer, return_sequences): model = Model(model_input, model_output) model.compile(optimizer='adam', loss='mse') - config = hls4ml.utils.config_from_keras_model(model, granularity='name') + config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend='Vivado') prj_name = f'hls4mlprj_rnn_{rnn_layer.__class__.__name__.lower()}_seq_{int(return_sequences)}' output_dir = str(test_root_path / prj_name) hls_model = hls4ml.converters.convert_from_keras_model(model, hls_config=config, output_dir=output_dir) @@ -90,7 +90,7 @@ def test_rnn_accuracy(rnn_layer, return_sequences, backend, io_type, strategy, s input_shape = (12, 8) X = np.random.rand(50, *input_shape) - 0.5 - layer_name = rnn_layer.__class__.__name__.lower() + layer_name = rnn_layer.__name__ keras_model = Sequential() keras_model.add( rnn_layer( @@ -111,8 +111,9 @@ def test_rnn_accuracy(rnn_layer, return_sequences, backend, io_type, strategy, s ) hls_config['LayerName'][layer_name]['static'] = static hls_config['LayerName'][layer_name]['Strategy'] = strategy - prj_name = 'hls4mlprj_rnn_accuracy_{}_static_{}_ret_seq_{}_{}_{}_{}'.format( - rnn_layer.__class__.__name__.lower(), int(static), int(return_sequences), backend, io_type, strategy + prj_name = ( + f'hls4mlprj_rnn_accuracy_{layer_name}_static_{int(static)}_ret_seq_{int(return_sequences)}_' + f'{backend}_{io_type}_{strategy}' ) output_dir = str(test_root_path / prj_name) diff --git a/test/pytest/test_softmax.py b/test/pytest/test_softmax.py index 19c9042465..048b6832ee 100644 --- a/test/pytest/test_softmax.py +++ b/test/pytest/test_softmax.py @@ -41,7 +41,7 @@ def test_softmax(backend, strategy, generate_data, input_bits, input_shape, tabl table_type = f'fixed<{table_bits}, RND, SAT>' - cfg = hls4ml.utils.config_from_keras_model(model, granularity='name') + cfg = hls4ml.utils.config_from_keras_model(model, granularity='name', backend=backend) cfg['LayerName']['softmax']['Strategy'] = strategy cfg['LayerName']['softmax']['inv_table_t'] = table_type cfg['LayerName']['softmax']['exp_table_t'] = table_type @@ -74,7 +74,7 @@ def test_softmax_skipped(backend, io_type): model = tf.keras.models.Sequential([dense, softmax]) 
model.compile() - cfg = hls4ml.utils.config_from_keras_model(model, granularity='name') + cfg = hls4ml.utils.config_from_keras_model(model, granularity='name', backend=backend) cfg['LayerName']['softmax']['skip'] = True odir = str(test_root_path / 'hls4mlprj_softmax_skipped_{}_{}').format(backend, io_type) diff --git a/test/pytest/test_softsign.py b/test/pytest/test_softsign.py index 31a2a1c2cf..f0089438a4 100644 --- a/test/pytest/test_softsign.py +++ b/test/pytest/test_softsign.py @@ -19,7 +19,7 @@ def test_softsign(backend, input_shape, io_type): model.add(tf.keras.layers.Activation(input_shape=input_shape, activation='softsign', name='softsign')) model.compile() - cfg = hls4ml.utils.config_from_keras_model(model, granularity='name', default_precision='fixed<20,4>') + cfg = hls4ml.utils.config_from_keras_model(model, granularity='name', default_precision='fixed<20,4>', backend=backend) # Since softsign implementation is lookup-based increasing the precision and size of the table helps with accuracy cfg['LayerName']['softsign']['table_t'] = 'fixed<20,4>' cfg['LayerName']['softsign']['table_size'] = 2048 diff --git a/test/pytest/test_trace.py b/test/pytest/test_trace.py index 14e218fd1c..b01cfcd010 100644 --- a/test/pytest/test_trace.py +++ b/test/pytest/test_trace.py @@ -39,11 +39,11 @@ def test_trace(backend, activation): keras_prediction = model.predict(X_input) - config = hls4ml.utils.config_from_keras_model(model, granularity='name') + config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend=backend) for layer in config['LayerName'].keys(): config['LayerName'][layer]['Trace'] = True - output_dir = str(test_root_path / f'hls4mlprj_trace_{backend}') + output_dir = str(test_root_path / f'hls4mlprj_trace_{backend}_{activation}') hls_model = hls4ml.converters.convert_from_keras_model(model, hls_config=config, output_dir=output_dir, backend=backend) diff --git a/test/pytest/test_transpose_concat.py b/test/pytest/test_transpose_concat.py index db3e03125f..7447545d2f 100644 --- a/test/pytest/test_transpose_concat.py +++ b/test/pytest/test_transpose_concat.py @@ -32,7 +32,7 @@ def keras_model(): @pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def hls_model(keras_model, backend, io_type): hls_config = hls4ml.utils.config_from_keras_model( - keras_model, default_precision='ap_fixed<16,3,AP_RND_CONV,AP_SAT>', granularity='name' + keras_model, default_precision='ap_fixed<16,3,AP_RND_CONV,AP_SAT>', granularity='name', backend=backend ) hls_config['LayerName']['relu']['Precision'] = 'ap_ufixed<17,3>' output_dir = str(test_root_path / f'hls4mlprj_transpose_{backend}_{io_type}') diff --git a/test/pytest/test_upsampling.py b/test/pytest/test_upsampling.py index 9051d582bd..c81be76933 100644 --- a/test/pytest/test_upsampling.py +++ b/test/pytest/test_upsampling.py @@ -56,7 +56,9 @@ def test_upsampling(keras_model_1d, keras_model_2d, data_1d, data_2d, model_type model = keras_model_2d data = data_2d - config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32,1>', granularity='name') + config = hls4ml.utils.config_from_keras_model( + model, default_precision='ap_fixed<32,1>', granularity='name', backend=backend + ) odir = str(test_root_path / f'hls4mlprj_upsampling_{model_type}_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, io_type=io_type, output_dir=odir, backend=backend diff --git a/test/pytest/test_zeropadding.py b/test/pytest/test_zeropadding.py index 95f7d79a7d..6a22a22472 
100644
--- a/test/pytest/test_zeropadding.py
+++ b/test/pytest/test_zeropadding.py
@@ -60,7 +60,9 @@ def test_zeropadding(keras_model_1d, keras_model_2d, data_1d, data_2d, model_typ
         model = keras_model_2d
         data = data_2d

-    config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32,1>', granularity='name')
+    config = hls4ml.utils.config_from_keras_model(
+        model, default_precision='ap_fixed<32,1>', granularity='name', backend=backend
+    )
     odir = str(test_root_path / f'hls4mlprj_zeropadding_{model_type}_{backend}_{io_type}')
     hls_model = hls4ml.converters.convert_from_keras_model(
         model, hls_config=config, io_type=io_type, output_dir=odir, backend=backend

From 51c80f96e4c6caf0f3c5310a6b334ddd008e9c1d Mon Sep 17 00:00:00 2001
From: Jovan Mitrevski
Date: Thu, 12 Sep 2024 12:16:43 -0500
Subject: [PATCH 140/272] make the optimizer order be more similar to main branch

---
 hls4ml/model/optimizer/__init__.py | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py
index fee180b0c5..840d42ebf2 100644
--- a/hls4ml/model/optimizer/__init__.py
+++ b/hls4ml/model/optimizer/__init__.py
@@ -57,25 +57,24 @@
 register_flow(
     'convert',
     [
-        'fuse_consecutive_batch_normalization',
+        'channels_last_converter',
         'merge_linear_activation',
-        'fuse_batch_normalization',
-        'eliminate_linear_activation',
-        'qkeras_factorize_alpha',
-        'extract_ternary_threshold',
-        'replace_multidimensional_dense_with_conv',
         'seperable_to_depthwise_and_conv',
-        # The ones above here need to be before infer_precision_types
-        'infer_precision_types',
-        'channels_last_converter',
         'remove_transpose_before_flatten',
         'remove_nop_transpose',
         'remove_single_channel_transpose',
         'fuse_bias_add',
         'expand_layer_group',
         'output_rounding_saturation_mode',
+        'qkeras_factorize_alpha',
+        'extract_ternary_threshold',
         'fuse_consecutive_batch_normalization',
+        'fuse_batch_normalization',
+        'replace_multidimensional_dense_with_conv',
         'enforce_proxy_model_embedded_config',
+        'eliminate_linear_activation',
+        # many of the above optimizers need to be done before this
+        'infer_precision_types',
     ],
     requires=['parse_qonnx'],
 )  # TODO Maybe not all QKeras optmizers belong here?
@@ -83,10 +82,7 @@ register_flow( 'optimize', [ - 'eliminate_linear_activation', 'remove_nop_batch_normalization', - 'infer_precision_types', - 'set_precision_concat', ], requires=['convert'], ) From 52411091beb871a80dff33f474cda6ce92310360 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 12 Sep 2024 17:24:49 -0500 Subject: [PATCH 141/272] remove checks on 'padding' that were missed in previous PR --- test/pytest/test_keras_api.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/test/pytest/test_keras_api.py b/test/pytest/test_keras_api.py index 6f00b2ec00..49ceb625a9 100644 --- a/test/pytest/test_keras_api.py +++ b/test/pytest/test_keras_api.py @@ -165,7 +165,6 @@ def test_conv1d(padds, backend, io_type): assert list(hls_model.get_layers())[1].attributes['n_chan'] == model.layers[0].input_shape[2] assert list(hls_model.get_layers())[1].attributes['n_filt'] == model.layers[0].filters assert list(hls_model.get_layers())[1].attributes['stride_width'] == model.layers[0].strides[0] - assert list(hls_model.get_layers())[1].attributes['padding'] == model.layers[0].padding assert list(hls_model.get_layers())[1].attributes['data_format'] == model.layers[0].data_format assert list(hls_model.get_layers())[1].attributes["out_width"] == list(model.layers[0].output_shape)[1] @@ -235,7 +234,6 @@ def test_conv2d(chans, padds, backend, io_type): assert list(hls_model.get_layers())[1].attributes['n_filt'] == model.layers[0].filters assert list(hls_model.get_layers())[1].attributes['stride_width'] == model.layers[0].strides[1] assert list(hls_model.get_layers())[1].attributes['stride_height'] == model.layers[0].strides[0] - assert list(hls_model.get_layers())[1].attributes['padding'] == model.layers[0].padding assert list(hls_model.get_layers())[1].attributes['data_format'] == model.layers[0].data_format if model.layers[0].data_format == 'channels_first': @@ -392,7 +390,6 @@ def test_pooling(pooling, padds, chans, backend): assert hls_pool.attributes['stride_width'] == ker_pool.strides[1] assert hls_pool.attributes['pool_height'] == ker_pool.pool_size[1] assert hls_pool.attributes['pool_width'] == ker_pool.pool_size[0] - assert hls_pool.attributes['padding'] == ker_pool.padding if hls_pool.attributes['data_format'] == 'channels_last': assert hls_pool.attributes['in_height'] == ker_pool.input_shape[1] @@ -403,7 +400,7 @@ def test_pooling(pooling, padds, chans, backend): assert hls_pool.attributes['in_width'] == ker_pool.input_shape[3] assert hls_pool.attributes['n_filt'] == ker_pool.input_shape[1] - if hls_pool.attributes['padding'] == 'same': + if ker_pool.padding == 'same': # Height in_height = ker_pool.input_shape[1] if ker_pool.data_format == 'channels_first': @@ -434,7 +431,7 @@ def test_pooling(pooling, padds, chans, backend): assert pad_left == hls_pool.attributes['pad_left'] assert pad_right == hls_pool.attributes['pad_right'] - elif hls_pool.attributes['padding'] == 'valid': + elif ker_pool.padding == 'valid': if hls_pool.attributes['data_format'] == 'channels_first': in_height = ker_pool.input_shape[2] in_width = ker_pool.input_shape[3] @@ -459,12 +456,11 @@ def test_pooling(pooling, padds, chans, backend): assert hls_pool.attributes['n_filt'] == ker_pool.input_shape[2] assert hls_pool.attributes['pool_width'] == ker_pool.pool_size[0] assert hls_pool.attributes['stride_width'] == ker_pool.strides[0] - assert hls_pool.attributes['padding'] == ker_pool.padding out_same = math.ceil(float(ker_pool.input_shape[1]) / float(ker_pool.strides[0])) out_valid = 
math.ceil(float(ker_pool.input_shape[1] - ker_pool.pool_size[0] + 1) / ker_pool.strides[0]) - if hls_pool.attributes['padding'] == 'same': + if ker_pool.padding == 'same': assert hls_pool.attributes['n_out'] == out_same if ker_pool.input_shape[1] % ker_pool.strides[0] == 0: pad_along_width = max(ker_pool.pool_size[0] - ker_pool.strides[0], 0) @@ -473,7 +469,7 @@ def test_pooling(pooling, padds, chans, backend): assert hls_pool.attributes['pad_left'] == pad_along_width // 2 assert hls_pool.attributes['pad_right'] == pad_along_width - pad_along_width // 2 - elif hls_pool.attributes['padding'] == 'valid': + elif ker_pool.padding == 'valid': assert hls_pool.attributes['n_out'] == out_valid assert hls_pool.attributes['pad_left'] == 0 assert hls_pool.attributes['pad_right'] == 0 From 8eaf10a1557fb56c8abd97966e86488357dde1b7 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 19 Sep 2024 16:16:16 -0500 Subject: [PATCH 142/272] fix dimensions when moving scales --- hls4ml/model/layers.py | 2 +- hls4ml/model/optimizer/passes/move_scales.py | 71 +++++++++++--------- hls4ml/model/optimizer/passes/quant_opt.py | 22 +++--- 3 files changed, 54 insertions(+), 41 deletions(-) diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index e5ceaca28d..bc3bc2b1c3 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -976,7 +976,7 @@ def initialize(self): class BatchNormalization(Layer): _expected_attributes = [ Attribute('n_in'), - Attribute('n_filt', default=0), + Attribute('n_filt', default=-1), WeightAttribute('scale'), WeightAttribute('bias'), TypeAttribute('scale'), diff --git a/hls4ml/model/optimizer/passes/move_scales.py b/hls4ml/model/optimizer/passes/move_scales.py index cec69af5e8..3776a6d202 100644 --- a/hls4ml/model/optimizer/passes/move_scales.py +++ b/hls4ml/model/optimizer/passes/move_scales.py @@ -67,15 +67,16 @@ def transform(self, model, node): bias = np.array(bias1d[0]) output = node.get_output_variable() + # to remove warning, since these get set again + new_attrs = {k: v for k, v in apply_alpha.attributes.items() if k not in ('trace', 'precision')} can_propagate = False if not bias.shape and bias == 0: # zero bias, propagate through, if possible # (always possible if scale is scalar) try: - np.broadcast_to(scale, output.shape) # check size compatibility - newscale = scale - newbias = np.array(0) + newscale = np.broadcast_to(scale, output.shape) # check size compatibility + newbias = np.zeros(output.shape) can_propagate = True except ValueError: can_propagate = False @@ -84,10 +85,9 @@ def transform(self, model, node): if not can_propagate and isinstance(inp[other_idx], Constant): # can handle nonzero bias in some cases if other value is a Constant try: - np.broadcast_to(scale, output.shape) # check size compatibility - newscale = scale - newbias = inp[other_idx].attributes['value'] * bias - np.broadcast_to(newbias, output.shape) + newscale = np.broadcast_to(scale, output.shape) # check size compatibility + newbias = np.broadcast_to(inp[other_idx].attributes['value'] * bias, output.shape) + new_attrs.pop('bias_precision', None) # remove special bias precision settings can_propagate = True except ValueError: can_propagate = False @@ -97,9 +97,10 @@ def transform(self, model, node): model.remove_node(apply_alpha) - new_node = model.make_node('ApplyAlpha', apply_alpha.name, apply_alpha.attributes, [x for x in node.outputs]) - new_node.add_weights(newscale) - new_node.add_bias(newbias) + new_attrs['scale_data'] = newscale + new_attrs['bias_data'] = newbias + + new_node 
= model.make_node('ApplyAlpha', apply_alpha.name, new_attrs, [x for x in node.outputs]) model.insert_node(new_node) return True @@ -136,9 +137,11 @@ def transform(self, model, node): model.remove_node(in0) model.remove_node(in1) - new_node = model.make_node('ApplyAlpha', in0.name, in0.attributes, [x for x in node.outputs]) - new_node.add_weights(scale) - new_node.add_bias(bias) + new_attrs = in0.attributes + new_attrs['scale_data'] = scale + new_attrs['bias_data'] = bias + + new_node = model.make_node('ApplyAlpha', in0.name, new_attrs, [x for x in node.outputs]) model.insert_node(new_node) return True @@ -170,15 +173,16 @@ def transform(self, model, node): bias = np.array(bias1d[0]) output = node.get_output_variable() + # to remove warning, since these get set again + new_attrs = {k: v for k, v in apply_alpha.attributes.items() if k not in ('trace', 'precision')} can_propagate = False if not bias.shape and bias == 0: # zero bias, propagate through, if possible # (always possible if scale is scalar) try: - np.broadcast_to(scale, output.shape) # check broadcastable - newscale = scale - newbias = np.array(0) + newscale = np.broadcast_to(scale, output.shape) # check broadcastable + newbias = np.zeros(output.shape) can_propagate = True except ValueError: can_propagate = False @@ -188,9 +192,10 @@ def transform(self, model, node): model.remove_node(apply_alpha) - new_node = model.make_node('ApplyAlpha', apply_alpha.name, apply_alpha.attributes, [x for x in node.outputs]) - new_node.add_weights(newscale) - new_node.add_bias(newbias) + new_attrs['scale_data'] = newscale + new_attrs['bias_data'] = newbias + + new_node = model.make_node('ApplyAlpha', apply_alpha.name, new_attrs, [x for x in node.outputs]) model.insert_node(new_node) return True @@ -224,15 +229,16 @@ def transform(self, model, node): bias = np.array(bias1d[0]) output = node.get_output_variable() + # to remove warning, since these get set again + new_attrs = {k: v for k, v in apply_alpha.attributes.items() if k not in ('trace', 'precision')} can_propagate = False if not bias.shape and bias == 0: # zero bias, propagate through, if possible # (always possible if scale is scalar) try: - np.broadcast_to(scale, output.shape) # make sure broadcastable - newscale = scale - newbias = np.array(0) + newscale = np.broadcast_to(scale, output.shape) # make sure broadcastable + newbias = np.zeros(output.shape) can_propagate = True except ValueError: can_propagate = False @@ -242,9 +248,10 @@ def transform(self, model, node): model.remove_node(apply_alpha) - new_node = model.make_node('ApplyAlpha', apply_alpha.name, apply_alpha.attributes, [x for x in node.outputs]) - new_node.add_weights(newscale) - new_node.add_bias(newbias) + new_attrs['scale_data'] = newscale + new_attrs['bias_data'] = newbias + + new_node = model.make_node('ApplyAlpha', apply_alpha.name, new_attrs, [x for x in node.outputs]) model.insert_node(new_node) return True @@ -278,14 +285,15 @@ def transform(self, model, node): bias = np.array(bias1d[0]) output = node.get_output_variable() + # to remove warning, since these get set again + new_attrs = {k: v for k, v in apply_alpha.attributes.items() if k not in ('trace', 'precision')} can_propagate = False if not scale.shape and scale == 1: # No scale, just additional bias try: - np.broadcast_to(bias, output.shape) - newscale = np.array(1) - newbias = bias + newscale = np.ones(output.shape) + newbias = np.broadcast_to(bias, output.shape) can_propagate = True except ValueError: can_propagate = False @@ -295,8 +303,9 @@ def 
transform(self, model, node): model.remove_node(apply_alpha) - new_node = model.make_node('ApplyAlpha', apply_alpha.name, apply_alpha.attributes, [x for x in node.outputs]) - new_node.add_weights(newscale) - new_node.add_bias(newbias) + new_attrs['scale_data'] = newscale + new_attrs['bias_data'] = newbias + + new_node = model.make_node('ApplyAlpha', apply_alpha.name, new_attrs, [x for x in node.outputs]) model.insert_node(new_node) return True diff --git a/hls4ml/model/optimizer/passes/quant_opt.py b/hls4ml/model/optimizer/passes/quant_opt.py index ed7f9701a2..69e9ca7685 100644 --- a/hls4ml/model/optimizer/passes/quant_opt.py +++ b/hls4ml/model/optimizer/passes/quant_opt.py @@ -252,11 +252,13 @@ def transform(self, model, node): # but now add the ApplyAlhpas before and after + inshape = node.get_input_variable().shape + scale = node.get_attr('scale') bias = node.get_attr('zeropt') - attributes_scale = {} - attributes_rescale = {} + attributes_scale = {'n_filt': -1} + attributes_rescale = {'n_filt': -1} scale_config = copy.deepcopy(config) scale_name = f'{node.name}_scale' @@ -270,16 +272,16 @@ def transform(self, model, node): firstscale = 1 / scale firstbias = bias - attributes_scale['scale_data'] = firstscale - attributes_scale['bias_data'] = firstbias + attributes_scale['scale_data'] = np.broadcast_to(firstscale, inshape) + attributes_scale['bias_data'] = np.broadcast_to(firstbias, inshape) scale_node = model.make_node(ApplyAlpha, scale_name, attributes_scale, [node.inputs[0]]) model.insert_node(scale_node) rescale = scale rebias = -bias * scale - attributes_rescale['scale_data'] = rescale - attributes_rescale['bias_data'] = rebias + attributes_rescale['scale_data'] = np.broadcast_to(rescale, inshape) + attributes_rescale['bias_data'] = np.broadcast_to(rebias, inshape) rescale_node = model.make_node(ApplyAlpha, rescale_name, attributes_rescale, [new_node.outputs[0]]) model.insert_node(rescale_node) @@ -332,7 +334,9 @@ def transform(self, model, node): const_node.types['result_t'].precision = precision const_node.get_output_variable().type.precision = precision - attributes_rescale = {} + inshape = node.get_input_variable().shape + + attributes_rescale = {'n_filt': -1} rescale_config = copy.deepcopy(model.config.get_layer_config(node)) rescale_name = f'{node.name}_rescale' @@ -341,8 +345,8 @@ def transform(self, model, node): rescale = scale rebias = -bias * scale - attributes_rescale['scale_data'] = rescale - attributes_rescale['bias_data'] = rebias + attributes_rescale['scale_data'] = np.broadcast_to(rescale, inshape) + attributes_rescale['bias_data'] = np.broadcast_to(rebias, inshape) rescale_node = model.make_node( ApplyAlpha, rescale_name, attributes_rescale, [x for x in node.inputs], [x for x in node.outputs] From d80dc3b410d2a2578a79aae905530a92e7b732a1 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 19 Sep 2024 21:44:22 -0500 Subject: [PATCH 143/272] Added support and some missing parts for `Depthwise` and `Pointwise` Convolutions from QONNX --- hls4ml/converters/onnx/convolution.py | 11 ++- hls4ml/model/optimizer/__init__.py | 1 + .../model/optimizer/passes/conv_to_convxd.py | 16 ++-- .../passes/conv_to_depthwiseconvxd.py | 94 +++++++++++++++++++ 4 files changed, 113 insertions(+), 9 deletions(-) create mode 100644 hls4ml/model/optimizer/passes/conv_to_depthwiseconvxd.py diff --git a/hls4ml/converters/onnx/convolution.py b/hls4ml/converters/onnx/convolution.py index 85dc0ca804..d84fb855a8 100644 --- a/hls4ml/converters/onnx/convolution.py +++ 
b/hls4ml/converters/onnx/convolution.py @@ -21,13 +21,18 @@ def parse_conv_layer(node, input_names, input_shapes, graph): if dilations is None: dilations = [1] * len(layer['kernel_shape']) - if get_onnx_attribute(node, 'group') != 1: - raise ValueError("Only 1 group supported corrently") - layer['in_width'] = input_shapes[0][-2] layer['n_chan'] = input_shapes[0][-1] layer['n_filt'] = input_shapes[1][0] + layer['group'] = int(get_onnx_attribute(node, 'group')) + if layer['group'] != 1: + layer['depth_multiplier'] = get_onnx_attribute(node, 'group') / layer['n_chan'] + if not layer['depth_multiplier'].is_integer(): + raise ValueError('Depth multiplier must be an integer') + else: + layer['depth_multiplier'] = int(layer['depth_multiplier']) + layer['n_dim'] = len(input_shapes[0]) - 2 # 2 comes from channels and batch dimentions if layer['n_dim'] not in (1, 2): raise ValueError("Only 1D and 2D convolutions are supported") diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 840d42ebf2..10f652345f 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -51,6 +51,7 @@ 'merge_to_apply_alpha_div', 'matmul_const_to_dense', 'conv_to_conv_x_d', + 'conv_to_depthwise_conv_x_d', ], ) diff --git a/hls4ml/model/optimizer/passes/conv_to_convxd.py b/hls4ml/model/optimizer/passes/conv_to_convxd.py index 6fb88ad0d0..25ac50ba40 100644 --- a/hls4ml/model/optimizer/passes/conv_to_convxd.py +++ b/hls4ml/model/optimizer/passes/conv_to_convxd.py @@ -29,12 +29,16 @@ class ConvToConvXD(OptimizerPass): """Convert Conv with constant to a Conv1D or Conv2D layer""" def match(self, node): - is_match = isinstance(node, Conv) and ( - (len(node.inputs) == 2 and isinstance(node.get_input_node(node.inputs[1]), Constant)) - or ( - len(node.inputs) == 3 - and isinstance(node.get_input_node(node.inputs[1]), Constant) - and isinstance(node.get_input_node(node.inputs[2]), Constant) + is_match = ( + isinstance(node, Conv) + and node.get_attr('group') == 1 + and ( + (len(node.inputs) == 2 and isinstance(node.get_input_node(node.inputs[1]), Constant)) + or ( + len(node.inputs) == 3 + and isinstance(node.get_input_node(node.inputs[1]), Constant) + and isinstance(node.get_input_node(node.inputs[2]), Constant) + ) ) ) diff --git a/hls4ml/model/optimizer/passes/conv_to_depthwiseconvxd.py b/hls4ml/model/optimizer/passes/conv_to_depthwiseconvxd.py new file mode 100644 index 0000000000..26603c6a64 --- /dev/null +++ b/hls4ml/model/optimizer/passes/conv_to_depthwiseconvxd.py @@ -0,0 +1,94 @@ +import numpy as np + +from hls4ml.model.layers import Constant, Conv, DepthwiseConv1D, DepthwiseConv2D +from hls4ml.model.optimizer import OptimizerPass + +# these are attributes to copy +_base_attributes = ( + 'in_width', + 'out_width', + 'n_chan', + 'n_filt', + 'pad_left', + 'pad_right', + 'filt_width', + 'stride_width', + 'dilation_width', + 'in_height', + 'out_height', + 'pad_top', + 'pad_bottom', + 'filt_height', + 'stride_height', + 'dilation_height', + 'data_format', +) + + +class ConvToDepthwiseConvXD(OptimizerPass): + """Convert Conv with constant to a DepthwiseConv1D or DepthwiseConv2D layer""" + + def match(self, node): + is_match = ( + isinstance(node, Conv) + and node.get_attr('group') == node.get_attr('n_chan') + and (node.get_attr('group') != 1) + and ( + (len(node.inputs) == 2 and isinstance(node.get_input_node(node.inputs[1]), Constant)) + or ( + len(node.inputs) == 3 + and isinstance(node.get_input_node(node.inputs[1]), Constant) + and 
isinstance(node.get_input_node(node.inputs[2]), Constant)
+                )
+            )
+        )
+
+        return is_match
+
+    def transform(self, model, node):
+        """Convert Conv with constant to a DepthwiseConv1D or DepthwiseConv2D layer"""
+
+        weight_node = node.get_input_node(node.inputs[1])
+        weight_data = weight_node.attributes['value']
+        bias_node = None
+        if len(node.inputs) == 3:
+            bias_node = node.get_input_node(node.inputs[2])
+
+        # creating the attributes
+        attributes = {k: node.attributes.get(k, None) for k in _base_attributes}
+
+        # The ConvxD nodes expect the weight data to be in a different format, not (M, k1.., C)
+        if node.attributes['n_dim'] == 1:
+            newtype = DepthwiseConv1D
+            attributes['depthwise_data'] = np.transpose(weight_data, (1, 2, 0))
+        else:
+            newtype = DepthwiseConv2D
+            attributes['depthwise_data'] = np.transpose(weight_data, (1, 2, 3, 0))
+        attributes['depthwise_quantizer'] = weight_node.get_attr('quantizer')
+
+        if bias_node:
+            attributes['bias_data'] = bias_node.attributes['value']
+            attributes['bias_quantizer'] = bias_node.get_attr('quantizer')
+            attributes['use_bias'] = True
+        else:
+            attributes['bias_data'] = np.zeros(attributes['n_filt'])
+            attributes['use_bias'] = False
+
+        # get the configuration name
+        config = model.config.get_layer_config(node)
+        new_name = f'{newtype.__name__}_{node.name}'
+        model.config.set_name_config(new_name, config)
+        model.config.parse_name_config(new_name, config)
+
+        # making new node
+        new_node = model.make_node(newtype, new_name, attributes, [node.inputs[0]], [x for x in node.outputs])
+
+        # removing and replacing old nodes
+        if bias_node:
+            model.remove_node(bias_node, rewire=False)
+            del node.inputs[2]
+        model.remove_node(weight_node, rewire=False)
+        del node.inputs[1]
+        model.replace_node(node, new_node)
+
+        return True
From fae647d6df5e2256591eed362ebb22375e4f4efc Mon Sep 17 00:00:00 2001
From: Jovan Mitrevski
Date: Mon, 23 Sep 2024 16:51:00 -0500
Subject: [PATCH 144/272] add separable conv to test

---
 example-models            |  2 +-
 test/pytest/test_qonnx.py | 44 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/example-models b/example-models
index ff74f73dbc..a81e36eb16 160000
--- a/example-models
+++ b/example-models
@@ -1 +1 @@
-Subproject commit ff74f73dbc253d1aa7de1603ee10ede551919548
+Subproject commit a81e36eb16593450d7661e7b9686666ddb397208
diff --git a/test/pytest/test_qonnx.py b/test/pytest/test_qonnx.py
index 5b7b9d95c9..e9ef37578f 100644
--- a/test/pytest/test_qonnx.py
+++ b/test/pytest/test_qonnx.py
@@ -14,6 +14,23 @@
 import hls4ml
 
 test_root_path = Path(__file__).parent
+example_model_path = (test_root_path / '../../example-models').resolve()
+
+
+@pytest.fixture(scope='module')
+def sep_conv_model():
+    """
+    Load separable conv model
+    """
+    dl_file = str(example_model_path / "onnx/separable_conv_model_ch_last.onnx")
+    assert os.path.isfile(dl_file)
+    out_file = str(test_root_path / "separable_conv_model_ch_last_clean.onnx")
+
+    # cleanup
+    qonnx.util.cleanup.cleanup(dl_file, out_file=out_file)
+    model = ModelWrapper(out_file)
+
+    return model
 
 
 @pytest.fixture(scope='module')
@@ -83,6 +100,33 @@ def jettagging_model():
     return model
 
 
+@pytest.mark.parametrize('backend', ['Vitis'])
+def test_sep_conv(sep_conv_model, backend):
+    model = sep_conv_model
+    ishape = tuple(model.get_tensor_shape(model.graph.input[0].name))
+    X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape)
+    # X = (np.round(X * 2**16) * 2**-16).astype(np.float32)
+    idict = {model.graph.input[0].name: X}
+    
y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + + config = hls4ml.utils.config.config_from_onnx_model( + model, granularity='name', backend=backend, default_precision='fixed<16,6>' + ) + + hls_model = hls4ml.converters.convert_from_onnx_model( + model, + output_dir=str(test_root_path / f'hls4mlprj_qonnx_sep_conv_{backend}'), + io_type='io_stream', + backend=backend, + hls_config=config, + ) + hls_model.compile() + y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) + + np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) + print('test') + + @pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) def test_tfc_2w2a(tfc_2w2a_model, backend): model = tfc_2w2a_model From 56c85a442e0aee27fae8fa457fa273e0ec111a95 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 24 Sep 2024 00:15:47 -0500 Subject: [PATCH 145/272] fix pointwise with naming, quant_opt --- hls4ml/backends/catapult/passes/pointwise.py | 8 ++++---- hls4ml/backends/quartus/passes/pointwise.py | 6 ++---- hls4ml/backends/vivado/passes/pointwise.py | 9 +++++---- hls4ml/model/optimizer/passes/quant_opt.py | 2 +- test/pytest/test_qonnx.py | 7 ++----- 5 files changed, 14 insertions(+), 18 deletions(-) diff --git a/hls4ml/backends/catapult/passes/pointwise.py b/hls4ml/backends/catapult/passes/pointwise.py index 0141d7f108..fd464ef172 100755 --- a/hls4ml/backends/catapult/passes/pointwise.py +++ b/hls4ml/backends/catapult/passes/pointwise.py @@ -1,5 +1,3 @@ -from copy import copy - from hls4ml.backends.catapult.passes.convolution_templates import ( Conv1DConfigTemplate, Conv1DFunctionTemplate, @@ -75,8 +73,10 @@ def match(self, node): def transform(self, model, node): dim = node.__class__.__name__[-2:] # '1D' or '2D' - pw_node = model.make_node('PointwiseConv' + dim, node.name, copy(node.attributes), node.inputs.copy()) - pw_node.weights['bias'].data = node.weights['bias'].data + new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')} + pw_node = model.make_node( + 'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy() + ) # Set strategy to ensure lowercase string is passed to the template if model.config.is_resource_strategy(pw_node): pw_node.set_attr('strategy', 'resource') diff --git a/hls4ml/backends/quartus/passes/pointwise.py b/hls4ml/backends/quartus/passes/pointwise.py index 0f7f6821ae..d65ab22569 100644 --- a/hls4ml/backends/quartus/passes/pointwise.py +++ b/hls4ml/backends/quartus/passes/pointwise.py @@ -1,5 +1,3 @@ -from copy import copy - from hls4ml.backends.fpga.fpga_layers import PointwiseConv1D, PointwiseConv2D from hls4ml.backends.quartus.passes.convolution_templates import ( Conv1DConfigTemplate, @@ -81,10 +79,10 @@ def match(self, node): def transform(self, model, node): dim = node.__class__.__name__[-2:] # '1D' or '2D' + new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')} pw_node = model.make_node( - 'PointwiseConv' + dim, node.name, copy(node.attributes), node.inputs.copy(), outputs=node.outputs.copy() + 'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy() ) - pw_node.weights['bias'].data = node.weights['bias'].data model.replace_node(node, pw_node) return True diff --git a/hls4ml/backends/vivado/passes/pointwise.py b/hls4ml/backends/vivado/passes/pointwise.py index 85d2635cb8..34568b09f7 100644 --- a/hls4ml/backends/vivado/passes/pointwise.py +++ 
b/hls4ml/backends/vivado/passes/pointwise.py
@@ -1,5 +1,3 @@
-from copy import copy
-
 from hls4ml.backends.fpga.fpga_layers import PointwiseConv1D, PointwiseConv2D
 from hls4ml.backends.vivado.passes.convolution_templates import (
     Conv1DConfigTemplate,
@@ -75,8 +73,11 @@ def match(self, node):
 
     def transform(self, model, node):
         dim = node.__class__.__name__[-2:]  # '1D' or '2D'
-        pw_node = model.make_node('PointwiseConv' + dim, node.name, copy(node.attributes), node.inputs.copy())
-        pw_node.weights['bias'].data = node.weights['bias'].data
+        # to remove warning, since these get set again
+        new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')}
+        pw_node = model.make_node(
+            'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
+        )
         # Set strategy to ensure lowercase string is passed to the template
         if model.config.is_resource_strategy(pw_node):
             pw_node.set_attr('strategy', 'resource')
diff --git a/hls4ml/model/optimizer/passes/quant_opt.py b/hls4ml/model/optimizer/passes/quant_opt.py
index 69e9ca7685..cac29b5040 100644
--- a/hls4ml/model/optimizer/passes/quant_opt.py
+++ b/hls4ml/model/optimizer/passes/quant_opt.py
@@ -187,7 +187,7 @@ def transform(self, model, node):
             integer = bitwidth
         scale = node.get_attr('scale')
         if _ALSO_MATCH_PO2 and not (scale == np.ones_like(scale)).all():
-            _, exp = np.frexp(np.squeeze(scale))
+            _, exp = np.frexp(scale[0])  # know that np.all(scale[0] == scale) must be true
             integer = bitwidth + exp - 1
 
         precision, quantizer = _calculate_precision_quantizer(bitwidth, integer, signed, narrow, rounding_mode)
diff --git a/test/pytest/test_qonnx.py b/test/pytest/test_qonnx.py
index e9ef37578f..58d8b68fe2 100644
--- a/test/pytest/test_qonnx.py
+++ b/test/pytest/test_qonnx.py
@@ -20,15 +20,12 @@
 @pytest.fixture(scope='module')
 def sep_conv_model():
     """
-    Load separable conv model
+    Load separable conv model, already channels-last and cleaned
     """
     dl_file = str(example_model_path / "onnx/separable_conv_model_ch_last.onnx")
     assert os.path.isfile(dl_file)
-    out_file = str(test_root_path / "separable_conv_model_ch_last_clean.onnx")
 
-    # cleanup
-    qonnx.util.cleanup.cleanup(dl_file, out_file=out_file)
-    model = ModelWrapper(out_file)
+    model = ModelWrapper(dl_file)
 
     return model
From b0efdd6275a02eb9c18b82c29f90f30f380ac693 Mon Sep 17 00:00:00 2001
From: Jovan Mitrevski
Date: Tue, 24 Sep 2024 10:45:59 -0500
Subject: [PATCH 146/272] fix ConstantBatchNormFusion

---
 hls4ml/model/optimizer/passes/batchnorm_opt.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hls4ml/model/optimizer/passes/batchnorm_opt.py b/hls4ml/model/optimizer/passes/batchnorm_opt.py
index 0dde6b77a9..b6c21c7267 100644
--- a/hls4ml/model/optimizer/passes/batchnorm_opt.py
+++ b/hls4ml/model/optimizer/passes/batchnorm_opt.py
@@ -89,6 +89,9 @@ def match(self, node):
             isinstance(node, BatchNormalization)
             and not any(node.inputs[1:])
             and isinstance(node.get_input_node(node.inputs[0]), Constant)
+            and isinstance(
+                node.get_input_node(node.inputs[0]).get_output_variable().type.precision, UnspecifiedPrecisionType
+            )
         )
         return is_match
From 2a7248d2b689af5115bfc87064c14f41fe7cbea3 Mon Sep 17 00:00:00 2001
From: Vladimir Loncar
Date: Tue, 24 Sep 2024 21:16:21 +0200
Subject: [PATCH 147/272] Remove extras flow

---
 hls4ml/backends/catapult/catapult_backend.py | 7 ++-----
 hls4ml/backends/quartus/quartus_backend.py   | 7 ++-----
 hls4ml/backends/vivado/vivado_backend.py     | 7 ++-----
 3 files changed, 6 insertions(+), 15 deletions(-)

diff --git 
a/hls4ml/backends/catapult/catapult_backend.py b/hls4ml/backends/catapult/catapult_backend.py index d939e1f30b..295d285df7 100644 --- a/hls4ml/backends/catapult/catapult_backend.py +++ b/hls4ml/backends/catapult/catapult_backend.py @@ -153,9 +153,8 @@ def _register_flows(self): ] if len(extras) > 0: - extras_flow = register_flow('extras', extras, requires=[init_flow], backend=self.name) - else: - extras_flow = None + for opt in extras: + print(f'WARNING: Optimizer "{opt}" is not part of any flow and will not be executed.') ip_flow_requirements = [ 'optimize', @@ -164,10 +163,8 @@ def _register_flows(self): quantization_flow, optimization_flow, catapult_types_flow, - extras_flow, template_flow, ] - ip_flow_requirements = list(filter(None, ip_flow_requirements)) self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) diff --git a/hls4ml/backends/quartus/quartus_backend.py b/hls4ml/backends/quartus/quartus_backend.py index b6080a8c95..9dceab9469 100644 --- a/hls4ml/backends/quartus/quartus_backend.py +++ b/hls4ml/backends/quartus/quartus_backend.py @@ -103,9 +103,8 @@ def _register_flows(self): ] if len(extras) > 0: - extras_flow = register_flow('extras', extras, requires=[init_flow], backend=self.name) - else: - extras_flow = None + for opt in extras: + print(f'WARNING: Optimizer "{opt}" is not part of any flow and will not be executed.') ip_flow_requirements = [ 'optimize', @@ -114,10 +113,8 @@ def _register_flows(self): quantization_flow, optimization_flow, quartus_types_flow, - extras_flow, template_flow, ] - ip_flow_requirements = list(filter(None, ip_flow_requirements)) self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index ffd9d84e43..b9c79f9a26 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -147,9 +147,8 @@ def _register_flows(self): ] if len(extras) > 0: - extras_flow = register_flow('extras', extras, requires=[init_flow], backend=self.name) - else: - extras_flow = None + for opt in extras: + print(f'WARNING: Optimizer "{opt}" is not part of any flow and will not be executed.') ip_flow_requirements = [ 'optimize', @@ -158,10 +157,8 @@ def _register_flows(self): quantization_flow, optimization_flow, vivado_types_flow, - extras_flow, template_flow, ] - ip_flow_requirements = list(filter(None, ip_flow_requirements)) self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) From 14da6f5d2be0feb6a65b1c0c626631a19b70041e Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 25 Sep 2024 09:13:01 -0500 Subject: [PATCH 148/272] update broadcasting for moving scales for conv --- hls4ml/model/optimizer/passes/move_scales.py | 27 ++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/hls4ml/model/optimizer/passes/move_scales.py b/hls4ml/model/optimizer/passes/move_scales.py index 3776a6d202..1197480eaa 100644 --- a/hls4ml/model/optimizer/passes/move_scales.py +++ b/hls4ml/model/optimizer/passes/move_scales.py @@ -237,9 +237,21 @@ def transform(self, model, node): # zero bias, propagate through, if possible # (always possible if scale is scalar) try: - newscale = np.broadcast_to(scale, output.shape) # make sure broadcastable + if scale.ndim > 1: + # undo any broadcast_to + reduced_scale = _remove_redundant_dims(scale) + if reduced_scale.shape[-1] == 1: + reduced_scale = reduced_scale[..., 
0] + if node.attributes['n_dim'] == 1: + scale_trans = np.transpose(reduced_scale, (1, 0)) + else: + scale_trans = np.transpose(reduced_scale, (1, 2, 0)) + newscale = np.broadcast_to(scale_trans, output.shape) # make sure broadcastable + can_propagate = True + else: + newscale = np.broadcast_to(scale, output.shape) # make sure broadcastable + can_propagate = True newbias = np.zeros(output.shape) - can_propagate = True except ValueError: can_propagate = False @@ -309,3 +321,14 @@ def transform(self, model, node): new_node = model.make_node('ApplyAlpha', apply_alpha.name, new_attrs, [x for x in node.outputs]) model.insert_node(new_node) return True + + +def _remove_redundant_dims(X): + """This is somewhat of the inverse of broadcast-to. It sets the dimension size to 1 if all values are identical""" + + shape = X.shape + for i in range(len(shape)): + reduced = np.expand_dims(np.take(X, 0, axis=i), axis=i) + if np.all(reduced == X): + X = reduced + return X From 66c07501268f6e797bbb39913eb0a47e16666dac Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 26 Sep 2024 00:34:58 +0200 Subject: [PATCH 149/272] Use warnings.warn() instead of print() --- hls4ml/backends/catapult/catapult_backend.py | 3 ++- hls4ml/backends/quartus/quartus_backend.py | 2 +- hls4ml/backends/vivado/vivado_backend.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/hls4ml/backends/catapult/catapult_backend.py b/hls4ml/backends/catapult/catapult_backend.py index 295d285df7..aa9df375af 100644 --- a/hls4ml/backends/catapult/catapult_backend.py +++ b/hls4ml/backends/catapult/catapult_backend.py @@ -1,5 +1,6 @@ import os import sys +from warnings import warn import numpy as np @@ -154,7 +155,7 @@ def _register_flows(self): if len(extras) > 0: for opt in extras: - print(f'WARNING: Optimizer "{opt}" is not part of any flow and will not be executed.') + warn(f'WARNING: Optimizer "{opt}" is not part of any flow and will not be executed.') ip_flow_requirements = [ 'optimize', diff --git a/hls4ml/backends/quartus/quartus_backend.py b/hls4ml/backends/quartus/quartus_backend.py index 9dceab9469..0ce1123fff 100644 --- a/hls4ml/backends/quartus/quartus_backend.py +++ b/hls4ml/backends/quartus/quartus_backend.py @@ -104,7 +104,7 @@ def _register_flows(self): if len(extras) > 0: for opt in extras: - print(f'WARNING: Optimizer "{opt}" is not part of any flow and will not be executed.') + warn(f'WARNING: Optimizer "{opt}" is not part of any flow and will not be executed.') ip_flow_requirements = [ 'optimize', diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index b9c79f9a26..2112a8db04 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -148,7 +148,7 @@ def _register_flows(self): if len(extras) > 0: for opt in extras: - print(f'WARNING: Optimizer "{opt}" is not part of any flow and will not be executed.') + warn(f'WARNING: Optimizer "{opt}" is not part of any flow and will not be executed.') ip_flow_requirements = [ 'optimize', From 97ad08c9df17fea021b43fdb67709a706d4f845e Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 26 Sep 2024 00:45:09 +0200 Subject: [PATCH 150/272] Put optimizers from extras flow to their proper place (in Quartus + Catapult) --- hls4ml/backends/catapult/catapult_backend.py | 2 ++ hls4ml/backends/quartus/quartus_backend.py | 1 + 2 files changed, 3 insertions(+) diff --git a/hls4ml/backends/catapult/catapult_backend.py b/hls4ml/backends/catapult/catapult_backend.py index 
aa9df375af..5c85bf9b7e 100644
--- a/hls4ml/backends/catapult/catapult_backend.py
+++ b/hls4ml/backends/catapult/catapult_backend.py
@@ -111,6 +111,7 @@ def _register_flows(self):
            'catapult:inplace_stream_flatten',
            'catapult:skip_softmax',
            'catapult:fix_softmax_table_size',
+            'catapult:process_fixed_point_quantizer_layer',
            'infer_precision_types',
        ]
        optimization_flow = register_flow('optimize', optimization_passes, requires=[init_flow], backend=self.name)
@@ -121,6 +122,7 @@ def _register_flows(self):
            'catapult:generate_conv_streaming_instructions',
            'catapult:apply_resource_strategy',
            'catapult:generate_conv_im2col',
+            'catapult:apply_winograd_kernel_transformation',
        ]
        catapult_types_flow = register_flow('specific_types', catapult_types, requires=[init_flow], backend=self.name)
diff --git a/hls4ml/backends/quartus/quartus_backend.py b/hls4ml/backends/quartus/quartus_backend.py
index 0ce1123fff..aecad642c6 100644
--- a/hls4ml/backends/quartus/quartus_backend.py
+++ b/hls4ml/backends/quartus/quartus_backend.py
@@ -55,6 +55,7 @@ def _register_flows(self):
            'quartus:transform_types',
            'quartus:register_bram_weights',
            'quartus:apply_resource_strategy',
+            'quartus:generate_conv_im2col',
            'quartus:apply_winograd_kernel_transformation',
        ]
        quartus_types_flow = register_flow('specific_types', quartus_types, requires=[init_flow], backend=self.name)
From 0333d36894e4378081639c76f0c0d7ac0f9d3d52 Mon Sep 17 00:00:00 2001
From: Jovan Mitrevski
Date: Thu, 26 Sep 2024 09:18:38 -0500
Subject: [PATCH 151/272] snapshot of current development

---
 hls4ml/model/optimizer/__init__.py           |   1 +
 hls4ml/model/optimizer/passes/move_scales.py | 274 +++++++++++--------
 2 files changed, 168 insertions(+), 107 deletions(-)

diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py
index 10f652345f..d82d45668d 100644
--- a/hls4ml/model/optimizer/__init__.py
+++ b/hls4ml/model/optimizer/__init__.py
@@ -43,6 +43,7 @@
    'constant_batch_norm_fusion',
    'merge_two_constants',
    'scale_down_add',
+    'bias_down_add',
    'scale_down_mat_mul',
    'scale_down_weight_conv',
    'scale_down_bias_conv',
diff --git a/hls4ml/model/optimizer/passes/move_scales.py b/hls4ml/model/optimizer/passes/move_scales.py
index 1197480eaa..ecf1099ee5 100644
--- a/hls4ml/model/optimizer/passes/move_scales.py
+++ b/hls4ml/model/optimizer/passes/move_scales.py
@@ -56,15 +56,7 @@ def transform(self, model, node):
         scale = apply_alpha.weights['scale'].data_unquantized
         bias = apply_alpha.weights['bias'].data_unquantized
 
-        scale1d = np.ravel(scale)
-        if (scale1d[0] == scale).all():
-            # scalar scale
-            scale = np.array(scale1d[0])
-
-        bias1d = np.ravel(bias)
-        if (bias1d[0] == bias).all():
-            # scalar bias
-            bias = np.array(bias1d[0])
+        scale, bias = _make_scalar(scale, bias)
 
         output = node.get_output_variable()
         # to remove warning, since these get set again
@@ -146,73 +138,54 @@ def transform(self, model, node):
         return True
 
 
-class ScaleDownConv(OptimizerPass):
-    '''Shift an ApplyAlpha on input below a Conv'''
+class BiasDownAdd(OptimizerPass):
+    '''Shift an ApplyAlpha with only bias below a Merge (Add)'''
 
     def match(self, node):
-        '''Shift an ApplyAlpha from the Weight'''
-        is_match = isinstance(node, Conv) and isinstance(node.get_input_node(node.inputs[0]), ApplyAlpha)
-
+        '''Match if there is only one ApplyAlpha. If there are two and the scale of both is 0, they would
+        match the ScaleDownAdd, so this optimizer does not need to handle that case.
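+        (A bias-only ApplyAlpha commutes with the addition, (x + b) + y == (x + y) + b,
+        which is why the node can be re-created on the output side of the Add.)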
+ ''' + is_match = isinstance(node, Merge) and len(node.inputs) == 2 and node.attributes["op"] == "add" + if is_match: + in0 = node.get_input_node(node.inputs[0]) + in1 = node.get_input_node(node.inputs[1]) + is_match = ( + (isinstance(in0, ApplyAlpha) + or isinstance(in1, ApplyAlpha)) + and not (isinstance(in0, ApplyAlpha) + and isinstance(in1, ApplyAlpha)) + ) # only one ApplyAlpha return is_match def transform(self, model, node): - apply_alpha = node.get_input_node(node.inputs[0]) - - # Check if we can move - scale = apply_alpha.weights['scale'].data_unquantized - bias = apply_alpha.weights['bias'].data_unquantized - - scale1d = np.ravel(scale) - if (scale1d[0] == scale).all(): - # scalar scale - scale = np.array(scale1d[0]) - - bias1d = np.ravel(bias) - if (bias1d[0] == bias).all(): - # scalar bias - bias = np.array(bias1d[0]) + in0 = node.get_input_node(node.inputs[0]) + in1 = node.get_input_node(node.inputs[1]) - output = node.get_output_variable() - # to remove warning, since these get set again - new_attrs = {k: v for k, v in apply_alpha.attributes.items() if k not in ('trace', 'precision')} + alpha_node = in0 if isinstance(in0, ApplyAlpha) else in1 - can_propagate = False - if not bias.shape and bias == 0: - # zero bias, propagate through, if possible - # (always possible if scale is scalar) - try: - newscale = np.broadcast_to(scale, output.shape) # check broadcastable - newbias = np.zeros(output.shape) - can_propagate = True - except ValueError: - can_propagate = False + # Check if we can move + scale = alpha_node.weights['scale'].data_unquantized - if not can_propagate: + if (scale == 0).all(): + model.remove_node(alpha_node) + new_node = model.make_node('ApplyAlpha', alpha_node.name, alpha_node.attributes, [x for x in node.outputs]) + model.insert_node(new_node) + return True + else: return False - model.remove_node(apply_alpha) - - new_attrs['scale_data'] = newscale - new_attrs['bias_data'] = newbias - - new_node = model.make_node('ApplyAlpha', apply_alpha.name, new_attrs, [x for x in node.outputs]) - model.insert_node(new_node) - return True - -class ScaleDownWeightConv(OptimizerPass): - '''Shift an ApplyAlpha weight (from conv side) below a Conv''' +class ScaleDownConv(OptimizerPass): + '''Shift an ApplyAlpha on input below a Conv''' def match(self, node): '''Shift an ApplyAlpha from the Weight''' - is_match = ( - isinstance(node, Conv) and len(node.inputs) > 1 and isinstance(node.get_input_node(node.inputs[1]), ApplyAlpha) - ) + is_match = isinstance(node, Conv) and isinstance(node.get_input_node(node.inputs[0]), ApplyAlpha) return is_match def transform(self, model, node): - apply_alpha = node.get_input_node(node.inputs[1]) + apply_alpha = node.get_input_node(node.inputs[0]) # Check if we can move scale = apply_alpha.weights['scale'].data_unquantized @@ -237,21 +210,9 @@ def transform(self, model, node): # zero bias, propagate through, if possible # (always possible if scale is scalar) try: - if scale.ndim > 1: - # undo any broadcast_to - reduced_scale = _remove_redundant_dims(scale) - if reduced_scale.shape[-1] == 1: - reduced_scale = reduced_scale[..., 0] - if node.attributes['n_dim'] == 1: - scale_trans = np.transpose(reduced_scale, (1, 0)) - else: - scale_trans = np.transpose(reduced_scale, (1, 2, 0)) - newscale = np.broadcast_to(scale_trans, output.shape) # make sure broadcastable - can_propagate = True - else: - newscale = np.broadcast_to(scale, output.shape) # make sure broadcastable - can_propagate = True + newscale = np.broadcast_to(scale, output.shape) # check 
broadcastable newbias = np.zeros(output.shape) + can_propagate = True except ValueError: can_propagate = False @@ -268,57 +229,141 @@ def transform(self, model, node): return True -class ScaleDownBiasConv(OptimizerPass): - '''Shift an ApplyAlpha bias (from conv side) below a Conv''' +class ScaleDownConv(OptimizerPass): + '''Shift an ApplyAlpha on a Conv with 2-3 inputs''' def match(self, node): '''Shift an ApplyAlpha from the Weight''' is_match = ( - isinstance(node, Conv) and len(node.inputs) > 2 and isinstance(node.get_input_node(node.inputs[2]), ApplyAlpha) + isinstance(node, Conv) and len(node.inputs) > 1 and + (isinstance(node.get_input_node(node.inputs[0]), ApplyAlpha) + or isinstance(node.get_input_node(node.inputs[1]), ApplyAlpha) + or (len(node.inputs) == 3 and isinstance(node.get_input_node(node.inputs[2]), ApplyAlpha))) ) - return is_match def transform(self, model, node): - apply_alpha = node.get_input_node(node.inputs[2]) - - # Check if we can move - scale = apply_alpha.weights['scale'].data_unquantized - bias = apply_alpha.weights['bias'].data_unquantized - - scale1d = np.ravel(scale) - if (scale1d[0] == scale).all(): - # scalar scale - scale = np.array(scale1d[0]) - - bias1d = np.ravel(bias) - if (bias1d[0] == bias).all(): - # scalar bias - bias = np.array(bias1d[0]) + in0 = node.get_input_node(node.inputs[0]) + in1 = node.get_input_node(node.inputs[1]) + in2 = node.get_input_node(node.inputs[2]) if len(node.inputs) == 3 else None + + aa0 = isinstance(in0, ApplyAlpha) + aa1 = isinstance(in1, ApplyAlpha) + aa2 = isinstance(in2, ApplyAlpha) if len(node.inputs) == 3 else False + + if not isinstance(in1, (Constant, ApplyAlpha)): + raise RuntimeError("The weight node needs to be ApplyAlpha or Constant") + if len(node.inputs) == 3 and not isinstance(in2, (Constant, ApplyAlpha)): + raise RuntimeError("The bias node needs to be ApplyAlpha or Constant") + + scale0 = in0.weights['scale'].data_unquantized if aa0 else None + bias0 = in0.weights['bias'].data_unquantized if aa0 else None + scale1 = in1.weights['scale'].data_unquantized if aa1 else None + bias1 = in1.weights['bias'].data_unquantized if aa1 else None + scale2 = in2.weights['scale'].data_unquantized if aa2 else None + bias2 = in2.weights['bias'].data_unquantized if aa2 else None + + # If possible, make scale and bias have scalar values + if aa0: + scale0, bias0 = _make_scalar(scale0, bias0) + if aa1: + scale1, bias1 = _make_scalar(scale1, bias1) + if aa2: + scale2, bias2 = _make_scalar(scale2, bias2) output = node.get_output_variable() - # to remove warning, since these get set again - new_attrs = {k: v for k, v in apply_alpha.attributes.items() if k not in ('trace', 'precision')} - - can_propagate = False - if not scale.shape and scale == 1: - # No scale, just additional bias - try: - newscale = np.ones(output.shape) - newbias = np.broadcast_to(bias, output.shape) - can_propagate = True - except ValueError: - can_propagate = False + if (aa0 and not aa1 and not aa2): + # only datapath has a scale + bias = in2.attributes['value'] if len(node.inputs) == 3 else 0 + conv_nobias = np.all(bias == 0) + + can_propagate = False + if not bias0.shape and bias0 == 0: + # zero bias, propagate through, if possible + # (always possible if scale is scalar) + if conv_nobias: + try: + newscale = np.broadcast_to(_remove_redundant_dims(scale0), output.shape) # check broadcastable + newbias = np.zeros(output.shape) + can_propagate = True + except ValueError: + can_propagate = False + elif not scale0.shape: + # scalar scale0 + try: + newscale = 
np.broadcast_to(scale0, output.shape) # check broadcastable + newbias = np.broadcast_to(bias * (1 - scale0), output.shape) + can_propagate = True + except ValueError: + can_propagate = False + if not can_propagate: + return False + + # to remove warning, since these get set again + new_attrs = {k: v for k, v in in0.attributes.items() if k not in ('trace', 'precision')} + new_name = in0.name + model.remove_node(in0) + elif (not aa0 and aa1 and not aa2): + # only weights have a scale + bias = in2.attributes['value'] if len(node.inputs) == 3 else 0 + conv_nobias = np.all(bias == 0) + + can_propagate = False + if not bias1.shape and bias1 == 0: + # zero bias, propagate through, if possible + # (always possible if scale is scalar) + try: + if scale1.ndim > 1: + # undo any broadcast_to + reduced_scale = _remove_redundant_dims(scale1) + if reduced_scale.shape[-1] == 1: + reduced_scale = reduced_scale[..., 0] + if node.attributes['n_dim'] == 1: + scale_trans = np.transpose(reduced_scale, (1, 0)) + else: + scale_trans = np.transpose(reduced_scale, (1, 2, 0)) + newscale = np.broadcast_to(scale_trans, output.shape) # make sure broadcastable + can_propagate = True + else: + newscale = np.broadcast_to(scale1, output.shape) # make sure broadcastable + can_propagate = True + newbias = np.zeros(output.shape) + except ValueError: + can_propagate = False + + if not can_propagate: + return False + + # to remove warning, since these get set again + new_attrs = {k: v for k, v in in1.attributes.items() if k not in ('trace', 'precision')} + new_name = in1.name + model.remove_node(in1) + + elif (not aa0 and not aa1 and aa2): + # only bias has a scale + + can_propagate = False + if not scale2.shape and scale2 == 1: + # No scale, just additional bias + try: + newscale = np.ones(output.shape) + newbias = np.broadcast_to(bias2, output.shape) + can_propagate = True + except ValueError: + can_propagate = False - if not can_propagate: - return False + if not can_propagate: + return False - model.remove_node(apply_alpha) + # to remove warning, since these get set again + new_attrs = {k: v for k, v in in2.attributes.items() if k not in ('trace', 'precision')} + new_name = in2.name + model.remove_node(in2) new_attrs['scale_data'] = newscale new_attrs['bias_data'] = newbias - new_node = model.make_node('ApplyAlpha', apply_alpha.name, new_attrs, [x for x in node.outputs]) + new_node = model.make_node('ApplyAlpha', new_name, new_attrs, [x for x in node.outputs]) model.insert_node(new_node) return True @@ -332,3 +377,18 @@ def _remove_redundant_dims(X): if np.all(reduced == X): X = reduced return X + + +def _make_scalar(scale, bias): + """Make the scale and bias scalar if possible""" + scale1d = np.ravel(scale) + if (scale1d[0] == scale).all(): + # scalar scale + scale = np.array(scale1d[0]) + + bias1d = np.ravel(bias) + if (bias1d[0] == bias).all(): + # scalar bias + bias = np.array(bias1d[0]) + + return scale, bias From 80184d21514ab617bf4950c0476aac34964616ab Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 26 Sep 2024 12:09:47 -0500 Subject: [PATCH 152/272] snapshot working through scale downs --- example-models | 2 +- hls4ml/model/optimizer/passes/move_scales.py | 189 ++++++++++--------- 2 files changed, 103 insertions(+), 88 deletions(-) diff --git a/example-models b/example-models index a81e36eb16..ff74f73dbc 160000 --- a/example-models +++ b/example-models @@ -1 +1 @@ -Subproject commit a81e36eb16593450d7661e7b9686666ddb397208 +Subproject commit ff74f73dbc253d1aa7de1603ee10ede551919548 diff --git 
a/hls4ml/model/optimizer/passes/move_scales.py b/hls4ml/model/optimizer/passes/move_scales.py index ecf1099ee5..0ccdf07c61 100644 --- a/hls4ml/model/optimizer/passes/move_scales.py +++ b/hls4ml/model/optimizer/passes/move_scales.py @@ -149,11 +149,8 @@ def match(self, node): if is_match: in0 = node.get_input_node(node.inputs[0]) in1 = node.get_input_node(node.inputs[1]) - is_match = ( - (isinstance(in0, ApplyAlpha) - or isinstance(in1, ApplyAlpha)) - and not (isinstance(in0, ApplyAlpha) - and isinstance(in1, ApplyAlpha)) + is_match = (isinstance(in0, ApplyAlpha) or isinstance(in1, ApplyAlpha)) and not ( + isinstance(in0, ApplyAlpha) and isinstance(in1, ApplyAlpha) ) # only one ApplyAlpha return is_match @@ -175,70 +172,19 @@ def transform(self, model, node): return False -class ScaleDownConv(OptimizerPass): - '''Shift an ApplyAlpha on input below a Conv''' - - def match(self, node): - '''Shift an ApplyAlpha from the Weight''' - is_match = isinstance(node, Conv) and isinstance(node.get_input_node(node.inputs[0]), ApplyAlpha) - - return is_match - - def transform(self, model, node): - apply_alpha = node.get_input_node(node.inputs[0]) - - # Check if we can move - scale = apply_alpha.weights['scale'].data_unquantized - bias = apply_alpha.weights['bias'].data_unquantized - - scale1d = np.ravel(scale) - if (scale1d[0] == scale).all(): - # scalar scale - scale = np.array(scale1d[0]) - - bias1d = np.ravel(bias) - if (bias1d[0] == bias).all(): - # scalar bias - bias = np.array(bias1d[0]) - - output = node.get_output_variable() - # to remove warning, since these get set again - new_attrs = {k: v for k, v in apply_alpha.attributes.items() if k not in ('trace', 'precision')} - - can_propagate = False - if not bias.shape and bias == 0: - # zero bias, propagate through, if possible - # (always possible if scale is scalar) - try: - newscale = np.broadcast_to(scale, output.shape) # check broadcastable - newbias = np.zeros(output.shape) - can_propagate = True - except ValueError: - can_propagate = False - - if not can_propagate: - return False - - model.remove_node(apply_alpha) - - new_attrs['scale_data'] = newscale - new_attrs['bias_data'] = newbias - - new_node = model.make_node('ApplyAlpha', apply_alpha.name, new_attrs, [x for x in node.outputs]) - model.insert_node(new_node) - return True - - class ScaleDownConv(OptimizerPass): '''Shift an ApplyAlpha on a Conv with 2-3 inputs''' def match(self, node): '''Shift an ApplyAlpha from the Weight''' is_match = ( - isinstance(node, Conv) and len(node.inputs) > 1 and - (isinstance(node.get_input_node(node.inputs[0]), ApplyAlpha) - or isinstance(node.get_input_node(node.inputs[1]), ApplyAlpha) - or (len(node.inputs) == 3 and isinstance(node.get_input_node(node.inputs[2]), ApplyAlpha))) + isinstance(node, Conv) + and len(node.inputs) > 1 + and ( + isinstance(node.get_input_node(node.inputs[0]), ApplyAlpha) + or isinstance(node.get_input_node(node.inputs[1]), ApplyAlpha) + or (len(node.inputs) == 3 and isinstance(node.get_input_node(node.inputs[2]), ApplyAlpha)) + ) ) return is_match @@ -272,14 +218,14 @@ def transform(self, model, node): scale2, bias2 = _make_scalar(scale2, bias2) output = node.get_output_variable() - if (aa0 and not aa1 and not aa2): + if aa0 and not aa1 and not aa2: # only datapath has a scale bias = in2.attributes['value'] if len(node.inputs) == 3 else 0 conv_nobias = np.all(bias == 0) can_propagate = False if not bias0.shape and bias0 == 0: - # zero bias, propagate through, if possible + # No zero offset, propagate through, if possible # 
(always possible if scale is scalar) if conv_nobias: try: @@ -303,34 +249,58 @@ def transform(self, model, node): new_attrs = {k: v for k, v in in0.attributes.items() if k not in ('trace', 'precision')} new_name = in0.name model.remove_node(in0) - elif (not aa0 and aa1 and not aa2): + + elif not aa0 and aa1 and not aa2: # only weights have a scale bias = in2.attributes['value'] if len(node.inputs) == 3 else 0 conv_nobias = np.all(bias == 0) - + can_propagate = False if not bias1.shape and bias1 == 0: - # zero bias, propagate through, if possible + # No zero offset, propagate through, if possible # (always possible if scale is scalar) - try: - if scale1.ndim > 1: - # undo any broadcast_to - reduced_scale = _remove_redundant_dims(scale1) - if reduced_scale.shape[-1] == 1: - reduced_scale = reduced_scale[..., 0] - if node.attributes['n_dim'] == 1: - scale_trans = np.transpose(reduced_scale, (1, 0)) - else: - scale_trans = np.transpose(reduced_scale, (1, 2, 0)) - newscale = np.broadcast_to(scale_trans, output.shape) # make sure broadcastable + if conv_nobias: + try: + if scale1.ndim > 1: + # undo any broadcast_to + reduced_scale0 = _remove_redundant_dims(scale0) if scale0.ndim > 1 else scale0 + reduced_scale1 = _remove_redundant_dims(scale1) + reduced_scale = reduced_scale0 @ reduced_scale1 + if reduced_scale.shape[-1] == 1: + reduced_scale = reduced_scale[..., 0] + if node.attributes['n_dim'] == 1: + scale_trans = np.transpose(reduced_scale, (1, 0)) + else: + scale_trans = np.transpose(reduced_scale, (1, 2, 0)) + newscale = np.broadcast_to(scale_trans, output.shape) # make sure broadcastable + can_propagate = True + elif scale0.ndim > 1: + # scale1 is scalar + # undo any broadcast_to + reduced_scale0 = _remove_redundant_dims(scale0) + reduced_scale = scale1 * reduced_scale0 + if reduced_scale.shape[-1] == 1: + reduced_scale = reduced_scale[..., 0] + if node.attributes['n_dim'] == 1: + scale_trans = np.transpose(reduced_scale, (1, 0)) + else: + scale_trans = np.transpose(reduced_scale, (1, 2, 0)) + newscale = np.broadcast_to(scale_trans, output.shape) # make sure broadcastable + can_propagate = True + else: + newscale = np.broadcast_to(scale0 * scale1, output.shape) # make sure broadcastable can_propagate = True - else: - newscale = np.broadcast_to(scale1, output.shape) # make sure broadcastable + newbias = np.zeros(output.shape) + except ValueError: + can_propagate = False + elif not scale0.shape and not scale1.shape: + # scalar scale1 + try: + newscale = np.broadcast_to(scale0 * scale1, output.shape) # check broadcastable + newbias = np.broadcast_to(bias * (1 - scale0 * scale1), output.shape) can_propagate = True - newbias = np.zeros(output.shape) - except ValueError: - can_propagate = False - + except ValueError: + can_propagate = False if not can_propagate: return False @@ -339,9 +309,9 @@ def transform(self, model, node): new_name = in1.name model.remove_node(in1) - elif (not aa0 and not aa1 and aa2): + elif not aa0 and not aa1 and aa2: # only bias has a scale - + can_propagate = False if not scale2.shape and scale2 == 1: # No scale, just additional bias @@ -360,6 +330,51 @@ def transform(self, model, node): new_name = in2.name model.remove_node(in2) + elif aa0 and aa1 and not aa2: + # dataflow and weights have an ApplyAlpha + bias = in2.attributes['value'] if len(node.inputs) == 3 else 0 + conv_nobias = np.all(bias == 0) + + can_propagate = False + if not bias0.shape and bias0 == 0 and not bias1.shape and bias1 == 0: + # zero bias, propagate through, if possible + # (always possible if 
scale is scalar) + if conv_nobias: + try: + if scale1.ndim > 1: + # undo any broadcast_to + reduced_scale = _remove_redundant_dims(scale1) + if reduced_scale.shape[-1] == 1: + reduced_scale = reduced_scale[..., 0] + if node.attributes['n_dim'] == 1: + scale_trans = np.transpose(reduced_scale, (1, 0)) + else: + scale_trans = np.transpose(reduced_scale, (1, 2, 0)) + newscale = np.broadcast_to(scale_trans, output.shape) # make sure broadcastable + can_propagate = True + else: + newscale = np.broadcast_to(scale1, output.shape) # make sure broadcastable + can_propagate = True + newbias = np.zeros(output.shape) + except ValueError: + can_propagate = False + elif not scale1.shape: + # scalar scale1 + try: + newscale = np.broadcast_to(scale1, output.shape) # check broadcastable + newbias = np.broadcast_to(bias * (1 - scale1), output.shape) + can_propagate = True + except ValueError: + can_propagate = False + if not can_propagate: + return False + + # to remove warning, since these get set again + new_attrs = {k: v for k, v in in1.attributes.items() if k not in ('trace', 'precision')} + new_name = in1.name + model.remove_node(in1) + + # after the big if-else above new_attrs['scale_data'] = newscale new_attrs['bias_data'] = newbias From 6bb08172a7f9dfeccc0ba6d6e72df21fbc0059d1 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 26 Sep 2024 16:03:34 -0500 Subject: [PATCH 153/272] finish making the various cases --- hls4ml/model/optimizer/passes/move_scales.py | 131 +++++++++++++++---- 1 file changed, 104 insertions(+), 27 deletions(-) diff --git a/hls4ml/model/optimizer/passes/move_scales.py b/hls4ml/model/optimizer/passes/move_scales.py index 0ccdf07c61..b2470f2839 100644 --- a/hls4ml/model/optimizer/passes/move_scales.py +++ b/hls4ml/model/optimizer/passes/move_scales.py @@ -251,7 +251,7 @@ def transform(self, model, node): model.remove_node(in0) elif not aa0 and aa1 and not aa2: - # only weights have a scale + # only weights have an ApplyAlpha bias = in2.attributes['value'] if len(node.inputs) == 3 else 0 conv_nobias = np.all(bias == 0) @@ -263,22 +263,7 @@ def transform(self, model, node): try: if scale1.ndim > 1: # undo any broadcast_to - reduced_scale0 = _remove_redundant_dims(scale0) if scale0.ndim > 1 else scale0 - reduced_scale1 = _remove_redundant_dims(scale1) - reduced_scale = reduced_scale0 @ reduced_scale1 - if reduced_scale.shape[-1] == 1: - reduced_scale = reduced_scale[..., 0] - if node.attributes['n_dim'] == 1: - scale_trans = np.transpose(reduced_scale, (1, 0)) - else: - scale_trans = np.transpose(reduced_scale, (1, 2, 0)) - newscale = np.broadcast_to(scale_trans, output.shape) # make sure broadcastable - can_propagate = True - elif scale0.ndim > 1: - # scale1 is scalar - # undo any broadcast_to - reduced_scale0 = _remove_redundant_dims(scale0) - reduced_scale = scale1 * reduced_scale0 + reduced_scale = _remove_redundant_dims(scale1) if reduced_scale.shape[-1] == 1: reduced_scale = reduced_scale[..., 0] if node.attributes['n_dim'] == 1: @@ -288,16 +273,16 @@ def transform(self, model, node): newscale = np.broadcast_to(scale_trans, output.shape) # make sure broadcastable can_propagate = True else: - newscale = np.broadcast_to(scale0 * scale1, output.shape) # make sure broadcastable + newscale = np.broadcast_to(scale1, output.shape) # make sure broadcastable can_propagate = True newbias = np.zeros(output.shape) except ValueError: can_propagate = False - elif not scale0.shape and not scale1.shape: + elif not scale1.shape: # scalar scale1 try: - newscale = np.broadcast_to(scale0 * 
scale1, output.shape)  # check broadcastable
+                        newbias = np.broadcast_to(bias * (1 - scale0 * scale1), output.shape)
+                        can_propagate = True
+                    except ValueError:
+                        can_propagate = False
+            if not can_propagate:
+                return False
+
+            # to remove warning, since these get set again
+            new_attrs = {k: v for k, v in in0.attributes.items() if k not in ('trace', 'precision')}
+            new_name = in1.name
+            model.remove_node(in0)
+            model.remove_node(in1)
+
+        elif aa0 and not aa1 and aa2:
+            # datapath and bias have a scale
+
+            can_propagate = False
+            if not bias0.shape and bias0 == 0 and not scale2.shape and not scale0.shape and scale2 == scale0:
+                # scalar scale0 and scale2, zero bias0
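+                # Rationale: with bias0 == 0 and scale2 == scale0 == s (both scalar),
+                #   conv(s * x; W, s * b + bias2) == s * conv(x; W, b) + bias2,
+                # so a single ApplyAlpha(scale=s, bias=bias2) can be applied below the Conv.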
+ try: + newscale = np.broadcast_to(scale0, output.shape) # check broadcastable + newbias = np.broadcast_to(bias2, output.shape) + can_propagate = True + except ValueError: + can_propagate = False + if not can_propagate: + return False + + # to remove warning, since these get set again + new_attrs = {k: v for k, v in in0.attributes.items() if k not in ('trace', 'precision')} + new_name = in0.name + model.remove_node(in0) + model.remove_node(in2) + + elif not aa0 and aa1 and aa2: + # only weights and bias have an ApplyAlpha + + can_propagate = False + if not bias1.shape and bias1 == 0 and not scale2.shape and not scale1.shape and scale2 == scale1: + # No zero offset, propagate through, if possible + # (always possible if scale is scalar) + if not scale1.shape: # scalar scale1 try: newscale = np.broadcast_to(scale1, output.shape) # check broadcastable - newbias = np.broadcast_to(bias * (1 - scale1), output.shape) + newbias = np.broadcast_to(bias2, output.shape) + can_propagate = True + except ValueError: + can_propagate = False + if not can_propagate: + return False + + # to remove warning, since these get set again + new_attrs = {k: v for k, v in in1.attributes.items() if k not in ('trace', 'precision')} + new_name = in1.name + model.remove_node(in1) + + elif aa0 and aa1 and aa2: + # have all + + can_propagate = False + if ( + not bias0.shape + and bias0 == 0 + and not bias1.shape + and bias1 == 0 + and not scale2.shape + and not scale1.shape + and not scale0.shape + and scale2 == scale1 * scale0 + ): + # No zero offset, propagate through, if possible + # (always possible if scale is scalar) + if not scale1.shape: + # scalar scale1 + try: + newscale = np.broadcast_to(scale0 * scale1, output.shape) # check broadcastable + newbias = np.broadcast_to(bias2, output.shape) can_propagate = True except ValueError: can_propagate = False From 766a14cf0775bfa52eb5d10a1a3cc27a4ab42d37 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 26 Sep 2024 16:11:36 -0500 Subject: [PATCH 154/272] accidentally reverted the example models --- example-models | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example-models b/example-models index ff74f73dbc..a81e36eb16 160000 --- a/example-models +++ b/example-models @@ -1 +1 @@ -Subproject commit ff74f73dbc253d1aa7de1603ee10ede551919548 +Subproject commit a81e36eb16593450d7661e7b9686666ddb397208 From 5ff1373d3db86239b9912a96b1a040958643790f Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 26 Sep 2024 16:35:56 -0500 Subject: [PATCH 155/272] some bug fixes --- example-models | 2 +- hls4ml/model/optimizer/__init__.py | 2 -- hls4ml/model/optimizer/passes/move_scales.py | 7 +++++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/example-models b/example-models index a81e36eb16..3cfbcfd062 160000 --- a/example-models +++ b/example-models @@ -1 +1 @@ -Subproject commit a81e36eb16593450d7661e7b9686666ddb397208 +Subproject commit 3cfbcfd062f60492507d21ff0e91559b3bdd6550 diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index d82d45668d..0edd549b29 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -45,8 +45,6 @@ 'scale_down_add', 'bias_down_add', 'scale_down_mat_mul', - 'scale_down_weight_conv', - 'scale_down_bias_conv', 'scale_down_conv', 'merge_to_apply_alpha', 'merge_to_apply_alpha_div', diff --git a/hls4ml/model/optimizer/passes/move_scales.py b/hls4ml/model/optimizer/passes/move_scales.py index b2470f2839..43fcaa0da7 100644 --- 
a/hls4ml/model/optimizer/passes/move_scales.py +++ b/hls4ml/model/optimizer/passes/move_scales.py @@ -418,6 +418,7 @@ def transform(self, model, node): new_attrs = {k: v for k, v in in1.attributes.items() if k not in ('trace', 'precision')} new_name = in1.name model.remove_node(in1) + model.remove_node(in2) elif aa0 and aa1 and aa2: # have all @@ -447,9 +448,11 @@ def transform(self, model, node): return False # to remove warning, since these get set again - new_attrs = {k: v for k, v in in1.attributes.items() if k not in ('trace', 'precision')} - new_name = in1.name + new_attrs = {k: v for k, v in in0.attributes.items() if k not in ('trace', 'precision')} + new_name = in0.name + model.remove_node(in0) model.remove_node(in1) + model.remove_node(in2) # after the big if-else above new_attrs['scale_data'] = newscale From b56a0bd2ed86057d63b4c82784dba3c92f3c32db Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Sun, 29 Sep 2024 01:42:20 +0200 Subject: [PATCH 156/272] Halt on errors in build_lib.sh --- hls4ml/templates/vivado/build_lib.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/hls4ml/templates/vivado/build_lib.sh b/hls4ml/templates/vivado/build_lib.sh index 19f2d0a1c8..e321e94df3 100755 --- a/hls4ml/templates/vivado/build_lib.sh +++ b/hls4ml/templates/vivado/build_lib.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -e CC=g++ if [[ "$OSTYPE" == "linux-gnu" ]]; then From 7beb6954df24f22e32ec6b3699d1a3943ea01fd2 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Sun, 29 Sep 2024 01:43:47 +0200 Subject: [PATCH 157/272] make build_lib.sh executable by default and switch to using pathlib --- hls4ml/writer/vivado_writer.py | 55 +++++++++++++++++----------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 53ec98df1b..e4c0c24551 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -1,7 +1,9 @@ import glob import os +import stat import tarfile from collections import OrderedDict +from pathlib import Path from shutil import copyfile, copytree, rmtree import numpy as np @@ -692,45 +694,44 @@ def write_build_script(self, model): model (ModelGraph): the hls4ml model. 
""" - filedir = os.path.dirname(os.path.abspath(__file__)) + filedir = Path(__file__).parent # project.tcl - f = open(f'{model.config.get_output_dir()}/project.tcl', 'w') - f.write('variable project_name\n') - f.write(f'set project_name "{model.config.get_project_name()}"\n') - f.write('variable backend\n') - f.write('set backend "vivado"\n') - f.write('variable part\n') - f.write('set part "{}"\n'.format(model.config.get_config_value('Part'))) - f.write('variable clock_period\n') - f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) - f.write('variable clock_uncertainty\n') - f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) - f.write('variable version\n') - f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0'))) - f.close() + prj_tcl_dst = Path(f'{model.config.get_output_dir()}/project.tcl') + with open(prj_tcl_dst, 'w') as f: + f.write('variable project_name\n') + f.write(f'set project_name "{model.config.get_project_name()}"\n') + f.write('variable backend\n') + f.write('set backend "vivado"\n') + f.write('variable part\n') + f.write('set part "{}"\n'.format(model.config.get_config_value('Part'))) + f.write('variable clock_period\n') + f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) + f.write('variable clock_uncertainty\n') + f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) + f.write('variable version\n') + f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0'))) # build_prj.tcl - srcpath = os.path.join(filedir, '../templates/vivado/build_prj.tcl') + srcpath = (filedir / '../templates/vivado/build_prj.tcl').resolve() dstpath = f'{model.config.get_output_dir()}/build_prj.tcl' copyfile(srcpath, dstpath) # vivado_synth.tcl - srcpath = os.path.join(filedir, '../templates/vivado/vivado_synth.tcl') + srcpath = (filedir / '../templates/vivado/vivado_synth.tcl').resolve() dstpath = f'{model.config.get_output_dir()}/vivado_synth.tcl' copyfile(srcpath, dstpath) # build_lib.sh - f = open(os.path.join(filedir, '../templates/vivado/build_lib.sh')) - fout = open(f'{model.config.get_output_dir()}/build_lib.sh', 'w') - - for line in f.readlines(): - line = line.replace('myproject', model.config.get_project_name()) - line = line.replace('mystamp', model.config.get_config_value('Stamp')) - - fout.write(line) - f.close() - fout.close() + build_lib_src = (filedir / '../templates/vivado/build_lib.sh').resolve() + build_lib_dst = Path(f'{model.config.get_output_dir()}/build_lib.sh').resolve() + with open(build_lib_src) as src, open(build_lib_dst, 'w') as dst: + for line in src.readlines(): + line = line.replace('myproject', model.config.get_project_name()) + line = line.replace('mystamp', model.config.get_config_value('Stamp')) + + dst.write(line) + build_lib_dst.chmod(build_lib_dst.stat().st_mode | stat.S_IEXEC) def write_nnet_utils(self, model): """Copy the nnet_utils, AP types headers and any custom source to the project output directory From 8bf5bd56149b209d6a64d84a427aed9b6508520f Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Sun, 29 Sep 2024 01:44:19 +0200 Subject: [PATCH 158/272] Use subprocess instead of os.system --- hls4ml/backends/fpga/fpga_backend.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 479af8ebf3..f7f10a5613 
100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -1,6 +1,7 @@ import math import os import re +import subprocess from bisect import bisect_left from collections.abc import Iterable @@ -131,19 +132,22 @@ def compile(self, model): Returns: string: Returns the name of the compiled library. """ - curr_dir = os.getcwd() - os.chdir(model.config.get_output_dir()) lib_name = None - try: - ret_val = os.system('bash build_lib.sh') - if ret_val != 0: - raise Exception(f'Failed to compile project "{model.config.get_project_name()}"') - lib_name = '{}/firmware/{}-{}.so'.format( - model.config.get_output_dir(), model.config.get_project_name(), model.config.get_config_value('Stamp') - ) - finally: - os.chdir(curr_dir) + ret_val = subprocess.run( + ['./build_lib.sh'], + shell=True, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd=model.config.get_output_dir() + ) + if ret_val.returncode != 0: + print(ret_val.stdout) + raise Exception(f'Failed to compile project "{model.config.get_project_name()}"') + lib_name = '{}/firmware/{}-{}.so'.format( + model.config.get_output_dir(), model.config.get_project_name(), model.config.get_config_value('Stamp') + ) return lib_name From 21a7e7b3f092ee4e04db1c446e0597847289e63b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 Sep 2024 00:17:03 +0000 Subject: [PATCH 159/272] [pre-commit.ci] auto fixes from pre-commit hooks --- hls4ml/backends/fpga/fpga_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index f7f10a5613..d6f2959d94 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -140,7 +140,7 @@ def compile(self, model): text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - cwd=model.config.get_output_dir() + cwd=model.config.get_output_dir(), ) if ret_val.returncode != 0: print(ret_val.stdout) From 86abdd236f74ce39af96a6f0fc868bc7246f49f2 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Sun, 29 Sep 2024 15:01:20 -0500 Subject: [PATCH 160/272] update qonnx sepconv test --- test/pytest/test_qonnx.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/pytest/test_qonnx.py b/test/pytest/test_qonnx.py index 58d8b68fe2..75c6c95c3f 100644 --- a/test/pytest/test_qonnx.py +++ b/test/pytest/test_qonnx.py @@ -102,12 +102,12 @@ def test_sep_conv(sep_conv_model, backend): model = sep_conv_model ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) - # X = (np.round(X * 2**16) * 2**-16).astype(np.float32) + X = (np.round(X * 2**16) * 2**-16).astype(np.float32) idict = {model.graph.input[0].name: X} y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] config = hls4ml.utils.config.config_from_onnx_model( - model, granularity='name', backend=backend, default_precision='fixed<16,6>' + model, granularity='name', backend=backend, default_precision='fixed<32,16>' ) hls_model = hls4ml.converters.convert_from_onnx_model( From d0b1cfc7c7ee3e71c25bdd47851cd00676ae0a4e Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 30 Sep 2024 00:23:24 +0200 Subject: [PATCH 161/272] Follow up with Quartus and Catapult --- hls4ml/templates/quartus/build_lib.sh | 1 + hls4ml/templates/vivado/build_lib.sh | 5 +- hls4ml/writer/catapult_writer.py | 85 +++++++++++++-------------- 
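Two details of the subprocess change above are worth spelling out. First, passing a list together with shell=True is fragile on POSIX (only the first element reaches the shell as the command); it works here only because the command is a single token. Second, combined with the `set -e` added to build_lib.sh in PATCH 156, the script's exit status is now trustworthy, so the same error handling can be expressed with check=True. A sketch under those assumptions, with hypothetical paths:

    import subprocess

    try:
        result = subprocess.run(
            ['bash', 'build_lib.sh'],   # explicit interpreter, no shell=True
            check=True,                 # raises CalledProcessError on non-zero exit
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            cwd='my-hls-test',          # hypothetical project output directory
        )
    except subprocess.CalledProcessError as exc:
        print(exc.stdout)
        raise Exception('Failed to compile project') from exc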
hls4ml/writer/quartus_writer.py | 42 ++++++------- 4 files changed, 66 insertions(+), 67 deletions(-) diff --git a/hls4ml/templates/quartus/build_lib.sh b/hls4ml/templates/quartus/build_lib.sh index 02e92a1994..5514a9cc75 100755 --- a/hls4ml/templates/quartus/build_lib.sh +++ b/hls4ml/templates/quartus/build_lib.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -e CC=g++ if [[ "$OSTYPE" == "linux-gnu" ]]; then diff --git a/hls4ml/templates/vivado/build_lib.sh b/hls4ml/templates/vivado/build_lib.sh index e321e94df3..8b2daf185f 100755 --- a/hls4ml/templates/vivado/build_lib.sh +++ b/hls4ml/templates/vivado/build_lib.sh @@ -11,8 +11,9 @@ LDFLAGS= INCFLAGS="-Ifirmware/ap_types/" PROJECT=myproject LIB_STAMP=mystamp +WEIGHTS_DIR="\"weights\"" -${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}.cpp -o ${PROJECT}.o -${CC} ${CFLAGS} ${INCFLAGS} -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o +${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR=${WEIGHTS_DIR} -c firmware/${PROJECT}.cpp -o ${PROJECT}.o +${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR=${WEIGHTS_DIR} -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o ${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so rm -f *.o diff --git a/hls4ml/writer/catapult_writer.py b/hls4ml/writer/catapult_writer.py index af3f28a59e..396ecb968e 100755 --- a/hls4ml/writer/catapult_writer.py +++ b/hls4ml/writer/catapult_writer.py @@ -1,7 +1,9 @@ import glob import os +import stat import tarfile from collections import OrderedDict +from pathlib import Path from shutil import copyfile, copytree, rmtree import numpy as np @@ -749,55 +751,50 @@ def write_build_script(self, model): model (ModelGraph): the hls4ml model. """ - filedir = os.path.dirname(os.path.abspath(__file__)) + filedir = Path(__file__).parent # build_prj.tcl - srcpath = os.path.join(filedir, '../templates/catapult/build_prj.tcl') - dstpath = f'{model.config.get_output_dir()}/build_prj.tcl' - # copyfile(srcpath, dstpath) - f = open(srcpath) - fout = open(dstpath, 'w') - for line in f.readlines(): - indent = line[: len(line) - len(line.lstrip())] - line = line.replace('myproject', model.config.get_project_name()) - line = line.replace('CATAPULT_DIR', model.config.get_project_dir()) - if '#hls-fpga-machine-learning insert techlibs' in line: - if model.config.get_config_value('Technology') is None: - if model.config.get_config_value('Part') is not None: - line = indent + 'setup_xilinx_part {{{}}}\n'.format(model.config.get_config_value('Part')) - elif model.config.get_config_value('ASICLibs') is not None: - line = indent + 'setup_asic_libs {{{}}}\n'.format(model.config.get_config_value('ASICLibs')) - else: - if model.config.get_config_value('Technology') == 'asic': - line = indent + 'setup_asic_libs {{{}}}\n'.format(model.config.get_config_value('ASICLibs')) + srcpath = (filedir / '../templates/catapult/build_prj.tcl').resolve() + dstpath = Path(f'{model.config.get_output_dir()}/build_prj.tcl').resolve() + with open(srcpath) as src, open(dstpath, 'w') as dst: + for line in src.readlines(): + indent = line[: len(line) - len(line.lstrip())] + line = line.replace('myproject', model.config.get_project_name()) + line = line.replace('CATAPULT_DIR', model.config.get_project_dir()) + if '#hls-fpga-machine-learning insert techlibs' in line: + if model.config.get_config_value('Technology') is None: + if model.config.get_config_value('Part') is not None: + line = indent + 'setup_xilinx_part {{{}}}\n'.format(model.config.get_config_value('Part')) + elif 
model.config.get_config_value('ASICLibs') is not None: + line = indent + 'setup_asic_libs {{{}}}\n'.format(model.config.get_config_value('ASICLibs')) else: - line = indent + 'setup_xilinx_part {{{}}}\n'.format(model.config.get_config_value('Part')) - elif '#hls-fpga-machine-learning insert invoke_args' in line: - tb_in_file = model.config.get_config_value('InputData') - tb_out_file = model.config.get_config_value('OutputPredictions') - invoke_args = '$sfd/firmware/weights' - if tb_in_file is not None: - invoke_args = invoke_args + f' $sfd/tb_data/{tb_in_file}' - if tb_out_file is not None: - invoke_args = invoke_args + f' $sfd/tb_data/{tb_out_file}' - line = indent + f'flow package option set /SCVerify/INVOKE_ARGS "{invoke_args}"\n' - elif 'set hls_clock_period 5' in line: - line = indent + 'set hls_clock_period {}\n'.format(model.config.get_config_value('ClockPeriod')) - fout.write(line) - f.close() - fout.close() + if model.config.get_config_value('Technology') == 'asic': + line = indent + 'setup_asic_libs {{{}}}\n'.format(model.config.get_config_value('ASICLibs')) + else: + line = indent + 'setup_xilinx_part {{{}}}\n'.format(model.config.get_config_value('Part')) + elif '#hls-fpga-machine-learning insert invoke_args' in line: + tb_in_file = model.config.get_config_value('InputData') + tb_out_file = model.config.get_config_value('OutputPredictions') + invoke_args = '$sfd/firmware/weights' + if tb_in_file is not None: + invoke_args = invoke_args + f' $sfd/tb_data/{tb_in_file}' + if tb_out_file is not None: + invoke_args = invoke_args + f' $sfd/tb_data/{tb_out_file}' + line = indent + f'flow package option set /SCVerify/INVOKE_ARGS "{invoke_args}"\n' + elif 'set hls_clock_period 5' in line: + line = indent + 'set hls_clock_period {}\n'.format(model.config.get_config_value('ClockPeriod')) + dst.write(line) # build_lib.sh - f = open(os.path.join(filedir, '../templates/catapult/build_lib.sh')) - fout = open(f'{model.config.get_output_dir()}/build_lib.sh', 'w') - - for line in f.readlines(): - line = line.replace('myproject', model.config.get_project_name()) - line = line.replace('mystamp', model.config.get_config_value('Stamp')) - - fout.write(line) - f.close() - fout.close() + build_lib_src = (filedir / '../templates/catapult/build_lib.sh').resolve() + build_lib_dst = Path(f'{model.config.get_output_dir()}/build_lib.sh').resolve() + with open(build_lib_src) as src, open(build_lib_dst, 'w') as dst: + for line in src.readlines(): + line = line.replace('myproject', model.config.get_project_name()) + line = line.replace('mystamp', model.config.get_config_value('Stamp')) + + dst.write(line) + build_lib_dst.chmod(build_lib_dst.stat().st_mode | stat.S_IEXEC) def write_nnet_utils(self, model): """Copy the nnet_utils, AP types headers and any custom source to the project output directory diff --git a/hls4ml/writer/quartus_writer.py b/hls4ml/writer/quartus_writer.py index 8c0217f924..932a8b6a6d 100644 --- a/hls4ml/writer/quartus_writer.py +++ b/hls4ml/writer/quartus_writer.py @@ -1,7 +1,9 @@ import glob import os +import stat import tarfile from collections import OrderedDict +from pathlib import Path from shutil import copyfile, copytree, rmtree import numpy as np @@ -877,32 +879,30 @@ def write_build_script(self, model): model (ModelGraph): the hls4ml model. 
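The escaped quotes in WEIGHTS_DIR="\"weights\"" from PATCH 161 are deliberate: the macro must expand to a C string literal, assuming the generated firmware and bridge consult a WEIGHTS_DIR macro when loading the weight .txt files (which is what the -D define in build_lib.sh suggests). A sketch of an equivalent manual compile driven from Python, with hypothetical paths and flags, showing the same quoting without the shell escaping:

    import subprocess
    from pathlib import Path

    weights_dir = Path('my-hls-test/firmware/weights').resolve()  # hypothetical
    subprocess.run(
        [
            'g++', '-O3', '-fPIC', '-std=c++11', '-Ifirmware/ap_types/',
            f'-DWEIGHTS_DIR="{weights_dir}"',  # expands to a quoted C string literal
            '-c', 'firmware/myproject.cpp', '-o', 'myproject.o',
        ],
        check=True,
    )

Because the argument list bypasses the shell, the inner quotes need no backslash escaping here, unlike in build_lib.sh.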
""" - # Makefile - filedir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(filedir, '../templates/quartus/Makefile')) - fout = open(f'{model.config.get_output_dir()}/Makefile', 'w') + filedir = Path(__file__).parent - for line in f.readlines(): - line = line.replace('myproject', model.config.get_project_name()) + # Makefile + makefile_src = (filedir / '../templates/quartus/Makefile').resolve() + makefile_dst = Path(f'{model.config.get_output_dir()}/Makefile').resolve() + with open(makefile_src) as src, open(makefile_dst, 'w') as dst: + for line in src.readlines(): + line = line.replace('myproject', model.config.get_project_name()) - if 'DEVICE :=' in line: - line = 'DEVICE := {}\n'.format(model.config.get_config_value('Part')) + if 'DEVICE :=' in line: + line = 'DEVICE := {}\n'.format(model.config.get_config_value('Part')) - fout.write(line) - f.close() - fout.close() + dst.write(line) # build_lib.sh - f = open(os.path.join(filedir, '../templates/quartus/build_lib.sh')) - fout = open(f'{model.config.get_output_dir()}/build_lib.sh', 'w') - - for line in f.readlines(): - line = line.replace('myproject', model.config.get_project_name()) - line = line.replace('mystamp', model.config.get_config_value('Stamp')) - - fout.write(line) - f.close() - fout.close() + build_lib_src = (filedir / '../templates/quartus/build_lib.sh').resolve() + build_lib_dst = Path(f'{model.config.get_output_dir()}/build_lib.sh').resolve() + with open(build_lib_src) as src, open(build_lib_dst, 'w') as dst: + for line in src.readlines(): + line = line.replace('myproject', model.config.get_project_name()) + line = line.replace('mystamp', model.config.get_config_value('Stamp')) + + dst.write(line) + build_lib_dst.chmod(build_lib_dst.stat().st_mode | stat.S_IEXEC) def write_nnet_utils(self, model): """Copy the nnet_utils, AP types headers and any custom source to the project output directory From b7c767b706d96c39f3141ff1e7f24f5ecae73b55 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 30 Sep 2024 18:35:16 +0200 Subject: [PATCH 162/272] Fix unused import --- hls4ml/backends/fpga/fpga_backend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index d6f2959d94..7996adfd00 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -1,5 +1,4 @@ import math -import os import re import subprocess from bisect import bisect_left From 2f6443a7aac03da10b085ec0089cbd849bda6362 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 1 Oct 2024 17:00:39 +0200 Subject: [PATCH 163/272] Fix SR writer --- hls4ml/writer/symbolic_writer.py | 67 ++++++++++++++++---------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/hls4ml/writer/symbolic_writer.py b/hls4ml/writer/symbolic_writer.py index b442d3cd39..76d56b1533 100644 --- a/hls4ml/writer/symbolic_writer.py +++ b/hls4ml/writer/symbolic_writer.py @@ -1,5 +1,7 @@ import glob import os +import stat +from pathlib import Path from shutil import copyfile, copytree, rmtree from hls4ml.backends import get_backend @@ -56,49 +58,48 @@ def write_build_script(self, model): model (ModelGraph): the hls4ml model. 
""" - filedir = os.path.dirname(os.path.abspath(__file__)) - - # build_prj.tcl - f = open(f'{model.config.get_output_dir()}/project.tcl', 'w') - f.write('variable project_name\n') - f.write(f'set project_name "{model.config.get_project_name()}"\n') - f.write('variable backend\n') - f.write('set backend "vivado"\n') - f.write('variable part\n') - f.write('set part "{}"\n'.format(model.config.get_config_value('Part'))) - f.write('variable clock_period\n') - f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) - f.write('variable clock_uncertainty\n') - f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '0%'))) - f.write('variable version\n') - f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0'))) - f.close() + filedir = Path(__file__).parent + + # project.tcl + prj_tcl_dst = Path(f'{model.config.get_output_dir()}/project.tcl') + with open(prj_tcl_dst, 'w') as f: + f.write('variable project_name\n') + f.write(f'set project_name "{model.config.get_project_name()}"\n') + f.write('variable backend\n') + f.write('set backend "vivado"\n') + f.write('variable part\n') + f.write('set part "{}"\n'.format(model.config.get_config_value('Part'))) + f.write('variable clock_period\n') + f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) + f.write('variable clock_uncertainty\n') + f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '0%'))) + f.write('variable version\n') + f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0'))) # build_prj.tcl - srcpath = os.path.join(filedir, '../templates/vivado/build_prj.tcl') + srcpath = (filedir / '../templates/vivado/build_prj.tcl').resolve() dstpath = f'{model.config.get_output_dir()}/build_prj.tcl' copyfile(srcpath, dstpath) # vivado_synth.tcl - srcpath = os.path.join(filedir, '../templates/vivado/vivado_synth.tcl') + srcpath = (filedir / '../templates/vivado/vivado_synth.tcl').resolve() dstpath = f'{model.config.get_output_dir()}/vivado_synth.tcl' copyfile(srcpath, dstpath) # build_lib.sh - f = open(os.path.join(filedir, '../templates/symbolic/build_lib.sh')) - fout = open(f'{model.config.get_output_dir()}/build_lib.sh', 'w') - - for line in f.readlines(): - line = line.replace('myproject', model.config.get_project_name()) - line = line.replace('mystamp', model.config.get_config_value('Stamp')) - line = line.replace('mylibspath', model.config.get_config_value('HLSLibsPath')) - - if 'LDFLAGS=' in line and not os.path.exists(model.config.get_config_value('HLSLibsPath')): - line = 'LDFLAGS=\n' - - fout.write(line) - f.close() - fout.close() + build_lib_src = (filedir / '../templates/symbolic/build_lib.sh').resolve() + build_lib_dst = Path(f'{model.config.get_output_dir()}/build_lib.sh').resolve() + with open(build_lib_src) as src, open(build_lib_dst, 'w') as dst: + for line in src.readlines(): + line = line.replace('myproject', model.config.get_project_name()) + line = line.replace('mystamp', model.config.get_config_value('Stamp')) + line = line.replace('mylibspath', model.config.get_config_value('HLSLibsPath')) + + if 'LDFLAGS=' in line and not os.path.exists(model.config.get_config_value('HLSLibsPath')): + line = 'LDFLAGS=\n' + + dst.write(line) + build_lib_dst.chmod(build_lib_dst.stat().st_mode | stat.S_IEXEC) def write_hls(self, model): print('Writing HLS project') From 1654c1c35806a961ed96e4bbf4db85c2534a07ea Mon Sep 17 00:00:00 2001 From: Jovan 
Mitrevski Date: Tue, 1 Oct 2024 12:37:30 -0500 Subject: [PATCH 164/272] Expose alpha and theta type for parametrized activations (#1069) * update parametrized activations for Xilinx * update quartus and catapult * fix pre-commit * fix non-parametrized version of elu * update comment on parametriced activation precision --- example-models | 2 +- .../catapult/passes/core_templates.py | 29 +++++++++++++++++-- .../backends/quartus/passes/core_templates.py | 29 +++++++++++++++++-- .../backends/vivado/passes/core_templates.py | 29 +++++++++++++++++-- hls4ml/converters/keras/core.py | 2 +- hls4ml/converters/pytorch/core.py | 2 +- hls4ml/model/layers.py | 20 ++++++++++++- .../model/optimizer/passes/infer_precision.py | 16 ++++++++++ .../catapult/nnet_utils/nnet_activation.h | 20 ++++++------- .../nnet_utils/nnet_activation_stream.h | 20 ++++++------- .../firmware/nnet_utils/nnet_activation.h | 18 ++++++------ .../nnet_utils/nnet_activation_stream.h | 18 ++++++------ .../vivado/nnet_utils/nnet_activation.h | 18 ++++++------ .../nnet_utils/nnet_activation_stream.h | 18 ++++++------ test/pytest/test_activations.py | 2 +- 15 files changed, 173 insertions(+), 70 deletions(-) diff --git a/example-models b/example-models index ff74f73dbc..3cfbcfd062 160000 --- a/example-models +++ b/example-models @@ -1 +1 @@ -Subproject commit ff74f73dbc253d1aa7de1603ee10ede551919548 +Subproject commit 3cfbcfd062f60492507d21ff0e91559b3bdd6550 diff --git a/hls4ml/backends/catapult/passes/core_templates.py b/hls4ml/backends/catapult/passes/core_templates.py index 2088923428..77c3b85524 100755 --- a/hls4ml/backends/catapult/passes/core_templates.py +++ b/hls4ml/backends/catapult/passes/core_templates.py @@ -115,6 +115,15 @@ def format(self, node): typedef {table_t.name} table_t; }};\n""" +param_activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; + typedef {param_t.name} param_t; +}};\n""" + hard_activ_config_template = """struct {type}_config{index} {{ static const unsigned n_in = {n_in}; static const {slope_t.name} slope; @@ -140,14 +149,16 @@ def format(self, node): }};\n""" activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' -param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});' +param_activ_function_template = ( + 'nnet::{activation}<{input_t}, {param_t.name}, {output_t}, {config}>({input}, {param}, {output});' +) activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] class ActivationConfigTemplate(LayerConfigTemplate): def __init__(self): - super().__init__((Activation, ParametrizedActivation, PReLU)) + super().__init__(Activation) self.template = activ_config_template def format(self, node): @@ -157,6 +168,18 @@ def format(self, node): return self.template.format(**params) +class ParamActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((ParametrizedActivation, PReLU)) + self.template = param_activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + class HardActivationConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__(HardActivation) @@ -210,7 
+233,7 @@ def __init__(self): def format(self, node): params = self._default_function_params(node) params['activation'] = node.get_attr('activation').lower() - params['param'] = node.get_weights('alpha').name + params['param'] = node.get_weights('param').name params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) return self.template.format(**params) diff --git a/hls4ml/backends/quartus/passes/core_templates.py b/hls4ml/backends/quartus/passes/core_templates.py index d6998c9ab2..b474e14df5 100644 --- a/hls4ml/backends/quartus/passes/core_templates.py +++ b/hls4ml/backends/quartus/passes/core_templates.py @@ -125,6 +125,15 @@ def format(self, node): typedef {table_t.name} table_t; }};\n""" +param_activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; + typedef {param_t.name} param_t; +}};\n""" + hard_activ_config_template = """struct {type}_config{index} {{ static const unsigned n_in = {n_in}; static const {slope_t.name} slope; @@ -146,14 +155,16 @@ def format(self, node): }};\n""" activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' -param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});' +param_activ_function_template = ( + 'nnet::{activation}<{input_t}, {param_t.name}, {output_t}, {config}>({input}, {param}, {output});' +) activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] class ActivationConfigTemplate(LayerConfigTemplate): def __init__(self): - super().__init__((Activation, ParametrizedActivation, PReLU, UnaryLUT)) + super().__init__((Activation, UnaryLUT)) self.template = activ_config_template def format(self, node): @@ -163,6 +174,18 @@ def format(self, node): return self.template.format(**params) +class ParamActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((ParametrizedActivation, PReLU)) + self.template = param_activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + class HardActivationConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__(HardActivation) @@ -216,7 +239,7 @@ def __init__(self): def format(self, node): params = self._default_function_params(node) params['activation'] = node.get_attr('activation').lower() - params['param'] = node.get_weights('alpha').name + params['param'] = node.get_weights('param').name params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) return self.template.format(**params) diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index 268293dd1e..b20a89f9ad 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -116,6 +116,15 @@ def format(self, node): typedef {table_t.name} table_t; }};\n""" +param_activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; + typedef 
{param_t.name} param_t; +}};\n""" + hard_activ_config_template = """struct {type}_config{index} {{ static const unsigned n_in = {n_in}; static const {slope_t.name} slope; @@ -138,14 +147,16 @@ def format(self, node): }};\n""" activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' -param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});' +param_activ_function_template = ( + 'nnet::{activation}<{input_t}, {param_t.name}, {output_t}, {config}>({input}, {param}, {output});' +) activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] class ActivationConfigTemplate(LayerConfigTemplate): def __init__(self): - super().__init__((Activation, ParametrizedActivation, PReLU, UnaryLUT)) + super().__init__((Activation, UnaryLUT)) self.template = activ_config_template def format(self, node): @@ -155,6 +166,18 @@ def format(self, node): return self.template.format(**params) +class ParamActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((ParametrizedActivation, PReLU)) + self.template = param_activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + class HardActivationConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__(HardActivation) @@ -208,7 +231,7 @@ def __init__(self): def format(self, node): params = self._default_function_params(node) params['activation'] = node.get_attr('activation').lower() - params['param'] = node.get_weights('alpha').name + params['param'] = node.get_weights('param').name params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) return self.template.format(**params) diff --git a/hls4ml/converters/keras/core.py b/hls4ml/converters/keras/core.py index ca7d0b3541..aff15808ad 100644 --- a/hls4ml/converters/keras/core.py +++ b/hls4ml/converters/keras/core.py @@ -71,7 +71,7 @@ def parse_activation_layer(keras_layer, input_names, input_shapes, data_reader): elif layer['class_name'] == 'ReLU': layer['class_name'] = 'Activation' elif layer['class_name'] == 'PReLU': - layer['alpha_data'] = get_weights_data(data_reader, layer['name'], 'alpha') + layer['param_data'] = get_weights_data(data_reader, layer['name'], 'alpha') if layer['class_name'] == 'Activation' and layer['activation'] == 'softmax': layer['class_name'] = 'Softmax' diff --git a/hls4ml/converters/pytorch/core.py b/hls4ml/converters/pytorch/core.py index d3ba470bf5..c56857715a 100644 --- a/hls4ml/converters/pytorch/core.py +++ b/hls4ml/converters/pytorch/core.py @@ -55,7 +55,7 @@ def parse_activation_layer(operation, layer_name, input_names, input_shapes, nod if layer['class_name'] == 'ELU': layer['activ_param'] = class_object.alpha if layer['class_name'] == 'PReLU': - layer['alpha_data'] = class_object.weight.data.numpy() + layer['param_data'] = class_object.weight.data.numpy() if layer['class_name'] == 'Threshold': layer['activ_param'] = class_object.threshold layer['class_name'] = 'ThresholdedReLU' diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index d8d1fb9c8f..1ceb6456b8 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -845,6 +845,17 @@ def initialize(self): class ParametrizedActivation(Activation): + _expected_attributes = [ + Attribute('n_in'), + Attribute('activation', value_type=str), + TypeAttribute('param'), + ] + + def initialize(self): + 
super().initialize() + param_t = NamedType(*reversed(self.model.config.get_precision(self, 'param'))) + self.set_attr('param_t', param_t) + def _get_act_function_name(self): act = self.get_attr('activation').lower() if act == 'leakyrelu': @@ -882,9 +893,16 @@ def initialize(self): class PReLU(Activation): + _expected_attributes = [ + Attribute('n_in'), + Attribute('activation', value_type=str), + WeightAttribute('param'), + TypeAttribute('param'), + ] + def initialize(self): super().initialize() - self.add_weights_variable(name='alpha', var_name='a{index}') + self.add_weights_variable(name='param', var_name='a{index}') class Softmax(Activation): diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index bb24f2206e..bd439e4a0f 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -84,6 +84,9 @@ def _infer_precision(self, node, types_to_infer): if node_class in ['SimpleRNN', 'LSTM', 'GRU']: return self._infer_rnn_precision(node, types_to_infer) + if node_class in ['ParametrizedActivation']: + return self._infer_par_act_precision(node, types_to_infer) + # What about quantized activation layer? Setting it to 'auto' manually will break it here. We should prevent # this in config_from_* functions @@ -557,3 +560,16 @@ def _infer_rnn_precision(self, node, types_to_infer): inferred_types.append(f'{weightvar}_t') return inferred_types + + def _infer_par_act_precision(self, node, types_to_infer): + inferred_types = [] + + # For threshold relu, set the parameter precision to be the input precision by default; + # for other parametrized activations, just allow the default precision to be used. + # Can override these values in the configuration by explicitly setting them. 
+        if 'param_t' in types_to_infer and node.get_attr('activation').lower() == 'thresholdedrelu':
+            in_type = node.get_input_variable().type.precision
+            node.attributes['param_t'].type = in_type
+            inferred_types.append('param_t')
+
+        return inferred_types
diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_activation.h b/hls4ml/templates/catapult/nnet_utils/nnet_activation.h
index f08e75a0d6..fb72460b96 100644
--- a/hls4ml/templates/catapult/nnet_utils/nnet_activation.h
+++ b/hls4ml/templates/catapult/nnet_utils/nnet_activation.h
@@ -686,8 +686,8 @@ void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
 // *************************************************
 // Leaky RELU Activation
 // *************************************************
-template <class data_T, class res_T, typename CONFIG_T>
-void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) {
+template <class data_T, class param_T, class res_T, typename CONFIG_T>
+void leaky_relu(data_T data[CONFIG_T::n_in], param_T alpha, res_T res[CONFIG_T::n_in]) {
     //#pragma HLS PIPELINE

     data_T datareg;
@@ -703,8 +703,8 @@ void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n
 // *************************************************
 // Thresholded RELU Activation
 // *************************************************
-template <class data_T, class res_T, typename CONFIG_T>
-void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) {
+template <class data_T, class param_T, class res_T, typename CONFIG_T>
+void thresholded_relu(data_T data[CONFIG_T::n_in], param_T theta, res_T res[CONFIG_T::n_in]) {
     //#pragma HLS PIPELINE

     data_T datareg;
@@ -917,8 +917,8 @@ template <typename CONFIG_T, int N_TABLE> void init_elu_table(typename CONFIG_T:

 #ifndef USE_AC_MATH

-template <class data_T, class res_T, typename CONFIG_T>
-void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) {
+template <class data_T, class param_T, class res_T, typename CONFIG_T>
+void elu(data_T data[CONFIG_T::n_in], const param_T alpha, res_T res[CONFIG_T::n_in]) {
     // Initialize the lookup table
 #ifdef __HLS_SYN__
     bool initialized = false;
@@ -953,8 +953,8 @@ void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_i

 #else

-template <class data_T, class res_T, typename CONFIG_T>
-void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) {
+template <class data_T, class param_T, class res_T, typename CONFIG_T>
+void elu(data_T data[CONFIG_T::n_in], const param_T alpha, res_T res[CONFIG_T::n_in]) {
     for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
         ac_math::ac_elu_pwl(data[ii], res[ii], alpha);
     }
@@ -1045,8 +1045,8 @@ template <class data_T, class res_T, typename CONFIG_T> void selu(data_T data[CO
 // *************************************************
 // PReLU Activation
 // *************************************************
-template <class data_T, class res_T, typename CONFIG_T>
-void prelu(data_T data[CONFIG_T::n_in], data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+template <class data_T, class param_T, class res_T, typename CONFIG_T>
+void prelu(data_T data[CONFIG_T::n_in], param_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
     //#pragma HLS PIPELINE

     data_T datareg;
diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_activation_stream.h
index 509560bc2b..82570dbe51 100644
--- a/hls4ml/templates/catapult/nnet_utils/nnet_activation_stream.h
+++ b/hls4ml/templates/catapult/nnet_utils/nnet_activation_stream.h
@@ -545,8 +545,8 @@ template <class data_T, class res_T, typename CONFIG_T> void hard_tanh(ac_channe
 // *************************************************
 // Leaky RELU Activation
 // *************************************************
-template <class data_T, class res_T, typename CONFIG_T>
-void leaky_relu(ac_channel<data_T> &data, typename data_T::value_type alpha, ac_channel<res_T> &res) {
+template <class data_T, class param_T, class res_T, typename CONFIG_T>
+void leaky_relu(ac_channel<data_T> &data, param_T alpha, ac_channel<res_T> &res) {
 LeakyReLUActLoop:
     for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) {
         //#pragma HLS PIPELINE
@@ -571,8 +571,8 @@ void leaky_relu(ac_channel<data_T> &data, typename data_T::value_type alpha, ac_
 //
Thresholded RELU Activation // ************************************************* -template -void thresholded_relu(ac_channel &data, typename data_T::value_type theta, ac_channel &res) { +template +void thresholded_relu(ac_channel &data, param_T theta, ac_channel &res) { ThresholdedReLUActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { //#pragma HLS PIPELINE @@ -720,8 +720,8 @@ template void softsign(ac_channel #ifndef USE_AC_MATH -template -void elu(ac_channel &data, typename data_T::value_type alpha, ac_channel &res) { +template +void elu(ac_channel &data, param_T alpha, ac_channel &res) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -763,8 +763,8 @@ void elu(ac_channel &data, typename data_T::value_type alpha, ac_channel } #else -template -void elu(ac_channel &data, typename data_T::value_type alpha, ac_channel &res) { +template +void elu(ac_channel &data, param_T alpha, ac_channel &res) { EluActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { data_T in_data = data.read(); @@ -845,8 +845,8 @@ template void selu(ac_channel -void prelu(ac_channel &data, typename data_T::value_type alpha[CONFIG_T::n_in], ac_channel &res) { +template +void prelu(ac_channel &data, const param_T alpha[CONFIG_T::n_in], ac_channel &res) { PReLUActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { //#pragma HLS PIPELINE diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h index a70096e2f5..1dea511c10 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h @@ -333,8 +333,8 @@ void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // ************************************************* // Leaky RELU Activation // ************************************************* -template -void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) { +template +void leaky_relu(data_T data[CONFIG_T::n_in], param_T alpha, res_T res[CONFIG_T::n_in]) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -348,8 +348,8 @@ void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n // ************************************************* // Thresholded RELU Activation // ************************************************* -template -void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) { +template +void thresholded_relu(data_T data[CONFIG_T::n_in], param_T theta, res_T res[CONFIG_T::n_in]) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -414,8 +414,8 @@ void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // ************************************************* // ELU Activation // ************************************************* -template -void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { +template +void elu(data_T data[CONFIG_T::n_in], const param_T alpha, res_T res[CONFIG_T::n_in]) { // Initialize the lookup table #include "activation_tables/elu_table.tb" // Index into the lookup table based on data @@ -434,7 +434,7 @@ void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_i } template void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - elu(data, 1.0, res); + elu, res_T, CONFIG_T>(data, 1.0, res); } // 
************************************************* @@ -461,8 +461,8 @@ template void selu(data_T data[CO // ************************************************* // PReLU Activation // ************************************************* -template -void prelu(data_T data[CONFIG_T::n_in], const data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +template +void prelu(data_T data[CONFIG_T::n_in], const param_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h index f0562a9b22..e29592d1e1 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h @@ -52,8 +52,8 @@ template void relu(stream // ************************************************* // Leaky RELU Activation // ************************************************* -template -void leaky_relu(stream &data, const typename data_T::value_type alpha, stream &res) { +template +void leaky_relu(stream &data, param_T alpha, stream &res) { constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); constexpr unsigned pipeline = data_T::size / multiplier_limit; @@ -79,8 +79,8 @@ void leaky_relu(stream &data, const typename data_T::value_type alpha, s // ************************************************* // Thresholded RELU Activation // ************************************************* -template -void thresholded_relu(stream &data, const typename data_T::value_type theta, stream &res) { +template +void thresholded_relu(stream &data, param_T theta, stream &res) { ThresholdedReLUActLoop: #pragma ii 1 for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { @@ -103,8 +103,8 @@ void thresholded_relu(stream &data, const typename data_T::value_type th // ************************************************* // ELU Activation // ************************************************* -template -void elu(stream &data, const typename data_T::value_type alpha, stream &res) { +template +void elu(stream &data, param_T alpha, stream &res) { #include "activation_tables/elu_table.tb" constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); @@ -135,7 +135,7 @@ void elu(stream &data, const typename data_T::value_type alpha, stream void elu(stream &data, stream &res) { - elu(data, 1.0, res); + elu, res_T, CONFIG_T>(data, 1.0, res); } // ************************************************* @@ -171,8 +171,8 @@ template void selu(stream // ************************************************* // PReLU Activation // ************************************************* -template -void prelu(stream &data, const typename data_T::value_type alpha[CONFIG_T::n_in], stream &res) { +template +void prelu(stream &data, const param_T alpha[CONFIG_T::n_in], stream &res) { constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); constexpr unsigned pipeline = data_T::size / multiplier_limit; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h index da13998e38..4683239d85 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h @@ -499,8 +499,8 @@ void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // 
************************************************* // Leaky RELU Activation // ************************************************* -template -void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) { +template +void leaky_relu(data_T data[CONFIG_T::n_in], param_T alpha, res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE data_T datareg; @@ -516,8 +516,8 @@ void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n // ************************************************* // Thresholded RELU Activation // ************************************************* -template -void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) { +template +void thresholded_relu(data_T data[CONFIG_T::n_in], param_T theta, res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE data_T datareg; @@ -646,8 +646,8 @@ template void init_elu_table(typename CONFIG_T: } } -template -void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { +template +void elu(data_T data[CONFIG_T::n_in], const param_T alpha, res_T res[CONFIG_T::n_in]) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -680,7 +680,7 @@ void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_i } template void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - elu(data, 1.0, res); + elu, res_T, CONFIG_T>(data, 1.0, res); } // ************************************************* @@ -738,8 +738,8 @@ template void selu(data_T data[CO // ************************************************* // PReLU Activation // ************************************************* -template -void prelu(data_T data[CONFIG_T::n_in], data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +template +void prelu(data_T data[CONFIG_T::n_in], param_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #pragma HLS PIPELINE data_T datareg; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h index 4f12ee5cb4..ef687243bf 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h @@ -499,8 +499,8 @@ template void hard_tanh(hls::stre // Leaky RELU Activation // ************************************************* -template -void leaky_relu(hls::stream &data, typename data_T::value_type alpha, hls::stream &res) { +template +void leaky_relu(hls::stream &data, param_T alpha, hls::stream &res) { LeakyReLUActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE @@ -525,8 +525,8 @@ void leaky_relu(hls::stream &data, typename data_T::value_type alpha, hl // Thresholded RELU Activation // ************************************************* -template -void thresholded_relu(hls::stream &data, typename data_T::value_type theta, hls::stream &res) { +template +void thresholded_relu(hls::stream &data, param_T theta, hls::stream &res) { ThresholdedReLUActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE @@ -633,8 +633,8 @@ template void softsign(hls::strea // ************************************************* // ELU Activation // ************************************************* -template -void elu(hls::stream &data, typename data_T::value_type alpha, hls::stream &res) { +template +void elu(hls::stream &data, param_T alpha, hls::stream &res) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; @@ -675,7 +675,7 
@@ void elu(hls::stream &data, typename data_T::value_type alpha, hls::stre } template void elu(hls::stream &data, hls::stream &res) { - elu(data, 1.0, res); + elu, res_T, CONFIG_T>(data, 1.0, res); } // ************************************************* @@ -726,8 +726,8 @@ template void selu(hls::stream -void prelu(hls::stream &data, typename data_T::value_type alpha[CONFIG_T::n_in], hls::stream &res) { +template +void prelu(hls::stream &data, const param_T alpha[CONFIG_T::n_in], hls::stream &res) { PReLUActLoop: for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { #pragma HLS PIPELINE diff --git a/test/pytest/test_activations.py b/test/pytest/test_activations.py index 5ab9481e1a..f156b1cdc3 100644 --- a/test/pytest/test_activations.py +++ b/test/pytest/test_activations.py @@ -41,7 +41,7 @@ def test_activations(backend, activation, name, shape, io_type): activation = activation(input) keras_model = Model(inputs=input, outputs=activation) - hls_config = hls4ml.utils.config_from_keras_model(keras_model) + hls_config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name', backend=backend) output_dir = str(test_root_path / 'hls4mlprj_activations_{}_{}_{}_{}').format(backend, io_type, str(shape), name) hls_model = hls4ml.converters.convert_from_keras_model( From d30773f870f2bc244bde23fabe60c3b5ba7776aa Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 1 Oct 2024 13:41:31 -0500 Subject: [PATCH 165/272] update qkeras in Jenkinsfile --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index b943ce3480..5ca79a484c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -16,7 +16,7 @@ pipeline { sh '''#!/bin/bash --login conda activate hls4ml-py310 conda install -y jupyterhub pydot graphviz pytest pytest-cov - pip install pytest-randomly jupyter onnx>=1.4.0 matplotlib pandas seaborn pydigitalwavetools==1.1 pyyaml tensorflow==2.14 qonnx torch git+https://github.com/google/qkeras.git pyparsing + pip install pytest-randomly jupyter onnx>=1.4.0 matplotlib pandas seaborn pydigitalwavetools==1.1 pyyaml tensorflow==2.14 qonnx torch git+https://github.com/jmitrevs/qkeras.git@qrecurrent_unstack pyparsing pip install -U ../ --user ./convert-keras-models.sh -x -f keras-models.txt pip uninstall hls4ml -y''' From c4af46af9835b0d2f9c791f2c539e2a30e04f87c Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 7 Oct 2024 10:41:34 +0200 Subject: [PATCH 166/272] Rename "unrolled" -> "resource_unrolled" --- hls4ml/backends/vitis/passes/feature_check.py | 8 ++-- hls4ml/backends/vitis/vitis_backend.py | 2 +- .../vivado/passes/convolution_templates.py | 8 ++-- .../backends/vivado/passes/core_templates.py | 4 +- .../backends/vivado/passes/pipeline_style.py | 6 +-- .../vivado/passes/recurrent_templates.py | 8 ++-- .../vivado/passes/resource_strategy.py | 2 +- .../vivado/passes/unrolled_codegen.py | 12 ++--- hls4ml/backends/vivado/vivado_backend.py | 34 +++++++------- hls4ml/model/graph.py | 9 ++-- .../templates/vivado/nnet_utils/nnet_common.h | 2 +- .../vivado/nnet_utils/nnet_conv1d_stream.h | 2 +- .../vivado/nnet_utils/nnet_conv2d_stream.h | 2 +- hls4ml/utils/string_utils.py | 3 +- test/pytest/test_dense_unrolled.py | 46 +++++++++++++------ test/pytest/test_pipeline_style.py | 4 +- 16 files changed, 86 insertions(+), 66 deletions(-) mode change 100755 => 100644 test/pytest/test_pipeline_style.py diff --git a/hls4ml/backends/vitis/passes/feature_check.py b/hls4ml/backends/vitis/passes/feature_check.py index 7f0b832765..a38f6581f6 100644 --- 
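For users, the practical effect of the parametrized-activation rework in PATCH 164 is that the alpha/theta of LeakyReLU, ELU, ThresholdedReLU and PReLU now carries its own param_t instead of reusing the data type, so its precision can be tuned per layer. A sketch of the intended usage; the model and layer name are hypothetical, and the 'param' precision key is inferred from the new param_t TypeAttribute rather than shown in this series:

    import hls4ml

    config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name')
    config['LayerName']['leaky_re_lu']['Precision']['param'] = 'ap_fixed<8,2>'

As the infer_precision change notes, ThresholdedReLU defaults its theta precision to the input precision; the other parametrized activations fall back to the default precision unless overridden as above.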
a/hls4ml/backends/vitis/passes/feature_check.py
+++ b/hls4ml/backends/vitis/passes/feature_check.py
@@ -35,17 +35,17 @@ def transform(self, model, node):
         )


-class ValidateUnrolledStrategy(OptimizerPass):
+class ValidateResourceUnrolledStrategy(OptimizerPass):
     _unrolled_layer_cls = ['Conv1D', 'Conv2D', 'Dense', 'GRU', 'LSTM']

     def match(self, node):
         is_unrolled_layer = len([layer_cls for layer_cls in self._unrolled_layer_cls if layer_cls in node.class_name]) > 0
-        is_unrolled_strategy = node.get_attr('strategy', 'latency').lower() == 'unrolled'
+        is_unrolled_strategy = node.get_attr('strategy', 'latency').lower() == 'resource_unrolled'

         return is_unrolled_layer and is_unrolled_strategy

     def transform(self, model, node):
         print(
-            f'WARNING: "Unrolled" strategy in "{node.name}" ({node.class_name}) may have unexpected II in Vitis backend.\n'
-            'Verify that the final design satisfies the latency/II constraints.'
+            f'WARNING: "ResourceUnrolled" strategy in "{node.name}" ({node.class_name}) may have unexpected II in '
+            'Vitis backend.\nVerify that the final design satisfies the latency/II constraints.'
         )
diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py
index c9fd452619..0110f78313 100644
--- a/hls4ml/backends/vitis/vitis_backend.py
+++ b/hls4ml/backends/vitis/vitis_backend.py
@@ -16,7 +16,7 @@ def _register_flows(self):
         validation_passes = [
             'vitis:validate_conv_implementation',
             'vitis:validate_resource_strategy',
-            'vitis:validate_unrolled_strategy',
+            'vitis:validate_resource_unrolled_strategy',
         ]
         validation_flow = register_flow('validation', validation_passes, requires=['vivado:init_layers'], backend=self.name)
diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py
index 6b13319174..dd77bee85e 100644
--- a/hls4ml/backends/vivado/passes/convolution_templates.py
+++ b/hls4ml/backends/vivado/passes/convolution_templates.py
@@ -111,8 +111,8 @@ def format(self, node):
             else:
                 mult_params['dense_function'] = 'DenseResource_rf_gt_nin_rem0'
             # The 3rd case is never used
-        elif node.get_attr('strategy').lower() == 'unrolled':
-            mult_params['dense_function'] = f'dense_unrolled_{node.index}'
+        elif node.get_attr('strategy').lower() == 'resource_unrolled':
+            mult_params['dense_function'] = f'dense_resource_unrolled_{node.index}'

         mult_config = self.mult_template.format(**mult_params)

@@ -236,8 +236,8 @@ def format(self, node):
             else:
                 mult_params['dense_function'] = 'DenseResource_rf_gt_nin_rem0'
             # The 3rd case is never used
-        elif node.get_attr('strategy').lower() == 'unrolled':
-            mult_params['dense_function'] = f'dense_unrolled_{node.index}'
+        elif node.get_attr('strategy').lower() == 'resource_unrolled':
+            mult_params['dense_function'] = f'dense_resource_unrolled_{node.index}'

         mult_config = self.mult_template.format(**mult_params)

diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py
index 118ba41335..836da6e68a 100644
--- a/hls4ml/backends/vivado/passes/core_templates.py
+++ b/hls4ml/backends/vivado/passes/core_templates.py
@@ -51,8 +51,8 @@ def format(self, node):
             else:
                 params['dense_function'] = 'DenseResource_rf_gt_nin_rem0'
             # The 3rd case is never used
-        elif node.get_attr('strategy').lower() == 'unrolled':
-            params['dense_function'] = f'dense_unrolled_{node.index}'
+        elif node.get_attr('strategy').lower() == 'resource_unrolled':
+            params['dense_function'] = f'dense_resource_unrolled_{node.index}'

         return self.template.format(**params)
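For reference, selecting the renamed strategy from a user configuration would now look like the sketch below (hypothetical model; the camel-case value is converted to 'resource_unrolled' internally, per the string_utils change in this patch):

    config['Model']['Strategy'] = 'ResourceUnrolled'  # formerly 'Unrolled'
    config['Model']['ReuseFactor'] = 4                # must be > 1 for this strategy
    hls_model = hls4ml.converters.convert_from_keras_model(
        keras_model, hls_config=config, backend='Vitis', io_type='io_stream'
    )

The reuse-factor and io_type constraints enforced below in vivado_backend.py are unchanged by the rename: RF == 1, or io_parallel for convolutions, still falls back to plain 'resource'.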
diff --git a/hls4ml/backends/vivado/passes/pipeline_style.py b/hls4ml/backends/vivado/passes/pipeline_style.py index 326745e455..66c2bbe71e 100644 --- a/hls4ml/backends/vivado/passes/pipeline_style.py +++ b/hls4ml/backends/vivado/passes/pipeline_style.py @@ -25,7 +25,7 @@ def transform(self, model): if self._maybe_set_dataflow_resource_strategy(model): return True - if self._maybe_set_pipeline_unrolled_strategy(model): + if self._maybe_set_pipeline_resource_unrolled_strategy(model): return True if self._maybe_set_pipeline_io_parallel(model): @@ -65,10 +65,10 @@ def _maybe_set_dataflow_resource_strategy(self, model): return False - def _maybe_set_pipeline_unrolled_strategy(self, model): + def _maybe_set_pipeline_resource_unrolled_strategy(self, model): have_unrolled = False for layer in model.get_layers(): - if model.config.get_strategy(layer).lower() == 'unrolled': + if model.config.get_strategy(layer).lower() == 'resource_unrolled': self._set_pipeline_style(model, 'pipeline') have_unrolled = True break diff --git a/hls4ml/backends/vivado/passes/recurrent_templates.py b/hls4ml/backends/vivado/passes/recurrent_templates.py index 6c4ee51cdb..939713af22 100644 --- a/hls4ml/backends/vivado/passes/recurrent_templates.py +++ b/hls4ml/backends/vivado/passes/recurrent_templates.py @@ -149,8 +149,8 @@ def format(self, node): else: mult_params1['dense_function'] = 'DenseResource_rf_gt_nin_rem0' # The 3rd case is never used - elif node.get_attr('strategy').lower() == 'unrolled': - mult_params1['dense_function'] = f'dense_unrolled_{node.index}_1' + elif node.get_attr('strategy').lower() == 'resource_unrolled': + mult_params1['dense_function'] = f'dense_resource_unrolled_{node.index}_1' if node.get_attr('return_sequences'): mult_params2['n_in'] = node.get_output_variable().shape[1] @@ -174,8 +174,8 @@ def format(self, node): else: mult_params2['dense_function'] = 'DenseResource_rf_gt_nin_rem0' # The 3rd case is never used - elif node.get_attr('strategy').lower() == 'unrolled': - mult_params2['dense_function'] = f'dense_unrolled_{node.index}_2' + elif node.get_attr('strategy').lower() == 'resource_unrolled': + mult_params2['dense_function'] = f'dense_resource_unrolled_{node.index}_2' mult_config1 = self.mult1_template.format(**mult_params1) mult_config2 = self.mult2_template.format(**mult_params2) diff --git a/hls4ml/backends/vivado/passes/resource_strategy.py b/hls4ml/backends/vivado/passes/resource_strategy.py index d65b0dc48e..0c06190f30 100644 --- a/hls4ml/backends/vivado/passes/resource_strategy.py +++ b/hls4ml/backends/vivado/passes/resource_strategy.py @@ -9,7 +9,7 @@ class ApplyResourceStrategy(OptimizerPass): def match(self, node): node_matches = isinstance(node, (Dense, Conv1D, SeparableConv1D, Conv2D, SeparableConv2D, LSTM, GRU)) - is_resource_strategy = node.get_attr('strategy', '').lower() in ['resource', 'unrolled'] + is_resource_strategy = node.get_attr('strategy', '').lower() in ['resource', 'resource_unrolled'] already_transformed = node.get_attr('_weights_transposed', False) is True return node_matches and is_resource_strategy and not already_transformed diff --git a/hls4ml/backends/vivado/passes/unrolled_codegen.py b/hls4ml/backends/vivado/passes/unrolled_codegen.py index 6fd6c584af..d901c77008 100644 --- a/hls4ml/backends/vivado/passes/unrolled_codegen.py +++ b/hls4ml/backends/vivado/passes/unrolled_codegen.py @@ -15,14 +15,14 @@ def match(self, node): # TODO - Extend (& test) for Separable Conv / Depthwise Conv / Recurrent layers layers_with_dense = (Dense, Conv1D, Conv2D, LSTM, 
GRU)

-        # Unrolled Dense mimicks the hardware implementation of Resource strategy -> apply after Resource optimizer
+        # Unrolled Dense mimics the hardware implementation of Resource strategy -> apply after Resource optimizer
         weights_transposed = node.get_attr('_weights_transposed', False)

         # RF = 1 will optimize DSPs anyway, so no need to unroll code
         rf_gt_one = node.get_attr('reuse_factor', 1) > 1

         # User requested unrolled implementation of Dense
-        is_unrolled = node.get_attr('strategy', 'latency') == 'unrolled'
+        is_unrolled = node.get_attr('strategy', 'latency') == 'resource_unrolled'

         return isinstance(node, layers_with_dense) and weights_transposed and rf_gt_one and is_unrolled

@@ -34,7 +34,7 @@ def transform(self, model, node):
             weights = node.weights['weight']
             code_str = self._generate_unrolled_function(n_in, n_out, reuse_factor, weights, str(node.index) + '_1')
             code_str = self._add_backend_specific_pragmas_to_generated_code(code_str, model.config.backend)
-            node.set_attr('unrolled_dense_resource_codegen_1', Source(code_str))
+            node.set_attr('resource_unrolled_dense_codegen_1', Source(code_str))

             recr_reuse_factor = node.get_attr('recurrent_reuse_factor')
             recr_weights = node.weights['recurrent_weight']
@@ -42,7 +42,7 @@ def transform(self, model, node):
                 n_in_recr, n_out_recr, recr_reuse_factor, recr_weights, str(node.index) + '_2'
             )
             code_str = self._add_backend_specific_pragmas_to_generated_code(code_str, model.config.backend)
-            node.set_attr('unrolled_dense_resource_codegen_2', Source(code_str))
+            node.set_attr('resource_unrolled_dense_codegen_2', Source(code_str))
         else:
             n_in, n_out = node.model.config.backend.get_layer_mult_size(node)
@@ -51,7 +51,7 @@ def transform(self, model, node):
             code_str = self._generate_unrolled_function(n_in, n_out, reuse_factor, weights, node.index)
             code_str = self._add_backend_specific_pragmas_to_generated_code(code_str, model.config.backend)
-            node.set_attr('unrolled_dense_resource_codegen', Source(code_str))
+            node.set_attr('resource_unrolled_dense_codegen', Source(code_str))

     def _generate_unrolled_function(self, n_in, n_out, reuse_factor, weights, function_suffix):
         """
@@ -72,7 +72,7 @@ def _generate_unrolled_function(self, n_in, n_out, reuse_factor, weights, function_suffix):
         # Variable instantiation and function pragmas
         generated_code = (
             'template<class data_T, class res_T, typename CONFIG_T>\n'
-            'class dense_unrolled_{suffix} : public DenseKernel<data_T, res_T, CONFIG_T> {{{{\n'
+            'class dense_resource_unrolled_{suffix} : public DenseKernel<data_T, res_T, CONFIG_T> {{{{\n'
             '  public:\n'
             '    static void dense(\n'
             '        data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],\n'
diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py
index 8fdc862287..9f8a5171d3 100644
--- a/hls4ml/backends/vivado/vivado_backend.py
+++ b/hls4ml/backends/vivado/vivado_backend.py
@@ -266,11 +266,11 @@ def init_dense(self, layer):
                 index_t = layer.get_weights('weight').type.index_precision
         else:
             layer.set_attr('strategy', 'resource')
-        elif layer.model.config.get_strategy(layer).lower() == 'unrolled':
+        elif layer.model.config.get_strategy(layer).lower() == 'resource_unrolled':
            use_resource_instead = False
            if layer.get_attr('reuse_factor', 1) == 1:
                print(
-                    f'Unrolled strategy cannot be combined with reuse factor 1 in layer "{layer.name}". '
+                    f'Unrolled resource strategy cannot be combined with reuse factor 1 in layer "{layer.name}". '
                    'Using "resource" strategy instead.'
                )
                use_resource_instead = True
@@ -281,7 +281,7 @@ def init_dense(self, layer):
                 layer.set_attr('strategy', 'resource')
             else:
                 self.set_closest_reuse_factor(layer, n_in, n_out, include_max_rf=False)
-                layer.set_attr('strategy', 'unrolled')
+                layer.set_attr('strategy', 'resource_unrolled')
         else:
             layer.set_attr('strategy', 'latency')

         layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', index_t))
@@ -297,17 +297,17 @@ def init_conv1d(self, layer):
             n_in, n_out = self.get_layer_mult_size(layer)
             self.set_target_reuse_factor(layer)
             self.set_closest_reuse_factor(layer, n_in, n_out)
-        elif layer.model.config.get_strategy(layer).lower() == 'unrolled':
+        elif layer.model.config.get_strategy(layer).lower() == 'resource_unrolled':
             use_resource_instead = False
             if layer.get_attr('reuse_factor', 1) == 1:
                 print(
-                    f'Unrolled strategy cannot be combined with reuse factor 1 in layer "{layer.name}".'
+                    f'Unrolled resource strategy cannot be combined with reuse factor 1 in layer "{layer.name}". '
                     'Using "resource" strategy instead.'
                 )
                 use_resource_instead = True
             elif layer.model.config.get_config_value('IOType') == 'io_parallel':
                 print(
-                    f'Unrolled strategy cannot be combined with io_parallel in layer "{layer.name}". '
+                    f'Unrolled resource strategy cannot be combined with io_parallel in layer "{layer.name}". '
                     'Using "resource" strategy instead.'
                 )
                 use_resource_instead = True
@@ -318,7 +318,7 @@ def init_conv1d(self, layer):
                 layer.set_attr('strategy', 'resource')
             else:
                 self.set_closest_reuse_factor(layer, n_in, n_out, include_max_rf=False)
-                layer.set_attr('strategy', 'unrolled')
+                layer.set_attr('strategy', 'resource_unrolled')
         else:
             layer.set_attr('strategy', 'latency')

@@ -418,17 +418,17 @@ def init_conv2d(self, layer):
             self.set_target_reuse_factor(layer)
             n_in, n_out = self.get_layer_mult_size(layer)
             self.set_closest_reuse_factor(layer, n_in, n_out)
-        elif layer.model.config.get_strategy(layer).lower() == 'unrolled':
+        elif layer.model.config.get_strategy(layer).lower() == 'resource_unrolled':
             use_resource_instead = False
             if layer.get_attr('reuse_factor', 1) == 1:
                 print(
-                    f'Unrolled strategy cannot be combined with reuse factor 1 in layer "{layer.name}". '
+                    f'Unrolled resource strategy cannot be combined with reuse factor 1 in layer "{layer.name}". '
                     'Using "resource" strategy instead.'
                 )
                 use_resource_instead = True
             elif layer.model.config.get_config_value('IOType') == 'io_parallel':
                 print(
-                    f'Unrolled strategy cannot be combined with io_parallel in layer "{layer.name}". '
+                    f'Unrolled resource strategy cannot be combined with io_parallel in layer "{layer.name}". '
                     'Using "resource" strategy instead.'
                 )
                 use_resource_instead = True
@@ -439,7 +439,7 @@ def init_conv2d(self, layer):
                 layer.set_attr('strategy', 'resource')
             else:
                 self.set_closest_reuse_factor(layer, n_in, n_out, include_max_rf=False)
-                layer.set_attr('strategy', 'unrolled')
+                layer.set_attr('strategy', 'resource_unrolled')
         else:
             layer.set_attr('strategy', 'latency')

@@ -563,11 +563,11 @@ def init_lstm(self, layer):
             self.set_closest_reuse_factor(layer, n_in, n_out)
             self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor')
             layer.set_attr('strategy', 'resource')
-        elif layer.model.config.get_strategy(layer).lower() == 'unrolled':
+        elif layer.model.config.get_strategy(layer).lower() == 'resource_unrolled':
             use_resource_instead = False
             if layer.get_attr('reuse_factor', 1) == 1:
                 print(
-                    f'Unrolled strategy cannot be combined with reuse factor 1 in layer "{layer.name}". '
+                    f'Unrolled resource strategy cannot be combined with reuse factor 1 in layer "{layer.name}". '
                     'Using "resource" strategy instead.'
                 )
                 use_resource_instead = True
@@ -581,7 +581,7 @@ def init_lstm(self, layer):
             self.set_closest_reuse_factor(
                 layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor', include_max_rf=False
             )
-            layer.set_attr('strategy', 'unrolled')
+            layer.set_attr('strategy', 'resource_unrolled')
         else:
             layer.set_attr('strategy', 'latency')

@@ -597,11 +597,11 @@ def init_gru(self, layer):
             self.set_closest_reuse_factor(layer, n_in, n_out)
             self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor')
             layer.set_attr('strategy', 'resource')
-        elif layer.model.config.get_strategy(layer).lower() == 'unrolled':
+        elif layer.model.config.get_strategy(layer).lower() == 'resource_unrolled':
             use_resource_instead = False
             if layer.get_attr('reuse_factor', 1) == 1:
                 print(
-                    f'Unrolled strategy cannot be combined with reuse factor 1 in layer "{layer.name}". '
+                    f'Unrolled resource strategy cannot be combined with reuse factor 1 in layer "{layer.name}". '
                     'Using "resource" strategy instead.'
                 )
                 use_resource_instead = True
@@ -615,7 +615,7 @@ def init_gru(self, layer):
             self.set_closest_reuse_factor(
                 layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor', include_max_rf=False
             )
-            layer.set_attr('strategy', 'unrolled')
+            layer.set_attr('strategy', 'resource_unrolled')
         else:
             layer.set_attr('strategy', 'latency')

diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py
index 609417f94a..678e6d49af 100644
--- a/hls4ml/model/graph.py
+++ b/hls4ml/model/graph.py
@@ -10,6 +10,7 @@
 from hls4ml.model.flow import get_flow
 from hls4ml.model.layers import layer_map
 from hls4ml.model.optimizer import get_available_passes, optimize_model
+from hls4ml.utils.string_utils import convert_to_snake_case


 class HLSConfig:
@@ -35,7 +36,7 @@ def __init__(self, config):
         self.layer_type_targ_cycles = {}
         self.layer_name_targ_cycles = {}

-        self.model_strategy = 'Latency'
+        self.model_strategy = convert_to_snake_case('Latency')
         self.layer_type_strategy = {}
         self.layer_name_strategy = {}

@@ -217,7 +218,7 @@ def parse_name_config(self, layer_name, layer_cfg):

         strategy = layer_cfg.get('Strategy')
         if strategy is not None:
-            self.layer_name_strategy[layer_name.lower()] = strategy
+            self.layer_name_strategy[layer_name.lower()] = convert_to_snake_case(strategy)

         conv_implementation = layer_cfg.get('ConvImplementation')
         if conv_implementation is not None:
@@ -265,7 +266,7 @@ def _parse_hls_config(self):
             self.model_rf = model_cfg.get('ReuseFactor')
             self.model_targ_cycles = model_cfg.get('TargetCycles')
             self.model_conv_implementation = model_cfg.get('ConvImplementation', 'LineBuffer')
-            self.model_strategy = model_cfg.get('Strategy', 'Latency')
+            self.model_strategy = convert_to_snake_case(model_cfg.get('Strategy', 'Latency'))
             self.model_compression = bool(model_cfg.get('Compression', 0))
             self.pipeline_style = model_cfg.get('PipelineStyle', 'auto')
             self.pipeline_ii = model_cfg.get('PipelineInterval', None)
@@ -290,7 +291,7 @@ def _parse_hls_config(self):

                 strategy = layer_cfg.get('Strategy')
                 if strategy is not None:
-                    self.layer_type_strategy[layer_type.lower()] = strategy
+                    self.layer_type_strategy[layer_type.lower()] = convert_to_snake_case(strategy)

                 conv_implementation = layer_cfg.get('ConvImplementation')
                 if conv_implementation is not None:
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h
index fee8b7b935..a14517df5b 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h
@@ -23,7 +23,7 @@ namespace nnet {

 // Common type definitions
 enum io_type { io_parallel = 0, io_stream };
-enum strategy { latency, resource, unrolled };
+enum strategy { latency, resource, resource_unrolled };

 /* ---
  * Balanced tree reduce implementation.
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h
index 4a55700d8d..2b481930b7 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h
@@ -60,7 +60,7 @@ void conv_1d_buffer_cl(hls::stream<data_T> &data, hls::stream<res_T> &res,
                        typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
     assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);

-    if (CONFIG_T::strategy == nnet::unrolled && CONFIG_T::reuse_factor > 1) {
+    if (CONFIG_T::strategy == nnet::resource_unrolled && CONFIG_T::reuse_factor > 1) {
         #pragma HLS allocation instances=compute_output_buffer_1d limit=1 function
     }
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h
index d5583f2669..1408b0db13 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h
@@ -75,7 +75,7 @@ void conv_2d_buffer_cl(
         [CONFIG_T::n_chan];
     #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2

-    if (CONFIG_T::strategy == nnet::unrolled && CONFIG_T::reuse_factor > 1) {
+    if (CONFIG_T::strategy == nnet::resource_unrolled && CONFIG_T::reuse_factor > 1) {
         #pragma HLS allocation instances=compute_output_buffer_1d limit=1 function
         #pragma HLS allocation instances=compute_output_buffer_2d limit=1 function
     }
diff --git a/hls4ml/utils/string_utils.py b/hls4ml/utils/string_utils.py
index fa341cd8af..a08c4c52a7 100644
--- a/hls4ml/utils/string_utils.py
+++ b/hls4ml/utils/string_utils.py
@@ -10,7 +10,8 @@ def convert_to_snake_case(pascal_case):
     Returns:
         str: converted string
     """
-    return re.sub(r'(?<!^)(?=[A-Z])', '_', pascal_case).lower()
[...]
+    config = config_from_keras_model(model, default_precision='ac_fixed<32, 16>', backend='Vitis', default_reuse_factor=8)
+    config['Model']['Strategy'] = strategy
+
+    output_dir = str(test_root_path / f'hls4mlprj_resource_unrolled_parsing_{strategy}')
+    hls_model = convert_from_keras_model(model, hls_config=config, output_dir=output_dir, backend='Vitis')
+
+    # Check if strategy was not overridden
+    assert list(hls_model.get_layers())[1].get_attr('strategy') == 'resource_unrolled'
+
+
+# Tests a wide range of RF to ensure the unrolled resource kernel is correct
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 @pytest.mark.parametrize('reuse_factor', [1, 2, 4, 8, 16, 32, 48, 64, 96, 192])
 @pytest.mark.parametrize('backend', ['Vitis', 'Vivado'])
-def test_dense_unrolled(io_type, reuse_factor, backend):
+def test_resource_unrolled_dense(io_type, reuse_factor, backend):
     input_shape = (16,)
     X = np.random.rand(100, *input_shape)

@@ -31,13 +49,13 @@ def test_dense_unrolled(io_type, reuse_factor, backend):
     config = config_from_keras_model(
         model, default_precision='ac_fixed<32, 16>', backend=backend, default_reuse_factor=reuse_factor
     )
-    config['Model']['Strategy'] = 'Unrolled'
+    config['Model']['Strategy'] = 'ResourceUnrolled'

-    output_dir = str(test_root_path / f'hls4mlprj_dense_unrolled_{io_type}_{reuse_factor}_{backend}')
+    output_dir = str(test_root_path / f'hls4mlprj_resource_unrolled_dense_{io_type}_{reuse_factor}_{backend}')
     hls_model = convert_from_keras_model(model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type)

     # Check if strategy was not overridden
-    assert list(hls_model.get_layers())[1].get_attr('strategy') == 'unrolled' if reuse_factor > 1 else 'latency'
+    assert list(hls_model.get_layers())[1].get_attr('strategy') == ('resource_unrolled' if reuse_factor > 1 else 'latency')

     hls_model.compile()

@@ -45,11 +63,11 @@ def test_dense_unrolled(io_type, reuse_factor, backend):
     np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=1e-2)


-# Tests a wide range RF on streaming Conv1D/2D to ensure the unrolled Dense is correct
+# Tests a wide range RF on streaming Conv1D/2D to ensure the unrolled resource kernel is correct
 @pytest.mark.parametrize('dim', [1, 2])
 @pytest.mark.parametrize('io_type', ['io_stream'])
 @pytest.mark.parametrize('reuse_factor', [1, 3, 9, 27, 54, 108])
-def test_dense_unrolled_streaming_conv(dim, io_type, reuse_factor):
+def test_resource_unrolled_streaming_conv(dim, io_type, reuse_factor):
     input_shape = (8,) * dim + (3,)
     X = np.random.rand(100, *input_shape)
     conv_class = Conv1D if dim == 1 else Conv2D
@@ -66,13 +84,13 @@ def test_dense_unrolled_streaming_conv(dim, io_type, reuse_factor):
     keras_prediction = model.predict(X)

     config = config_from_keras_model(model, default_precision='ac_fixed<32, 16>', default_reuse_factor=reuse_factor)
-    config['Model']['Strategy'] = 'Unrolled'
+    config['Model']['Strategy'] = 'ResourceUnrolled'

-    output_dir = str(test_root_path / f'hls4mlprj_dense_unrolled_conv{dim}d_{io_type}_{reuse_factor}')
+    output_dir = str(test_root_path / f'hls4mlprj_resource_unrolled_conv{dim}d_{io_type}_{reuse_factor}')
     hls_model = convert_from_keras_model(model, hls_config=config, output_dir=output_dir, backend='Vivado', io_type=io_type)

     # Check if strategy was not overridden
-    assert list(hls_model.get_layers())[1].get_attr('strategy') == 'unrolled' if reuse_factor > 1 else 'latency'
+    assert list(hls_model.get_layers())[1].get_attr('strategy') == ('resource_unrolled' if reuse_factor > 1 else 'latency')

     hls_model.compile()

@@ -85,7 +103,7 @@ def test_dense_unrolled_streaming_conv(dim, io_type, reuse_factor):
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 @pytest.mark.parametrize('static', [True, False])
 @pytest.mark.parametrize('reuse_factor', [1, 4, 32, 128])  # RF=128 also tests if setting closest RF works well
-def test_rnn_unrolled(rnn_layer, backend, io_type, static, reuse_factor):
+def test_resource_unrolled_rnn(rnn_layer, backend, io_type, static, reuse_factor):
     # Subtract 0.5 to include negative values
     input_shape = (12, 8)
     X = np.random.rand(50, *input_shape) - 0.5
@@ -110,9 +128,9 @@ def test_rnn_unrolled(rnn_layer, backend, io_type, static, reuse_factor):
         keras_model, granularity='name', default_precision=default_precision, backend=backend
     )
     hls_config['LayerName'][layer_name]['static'] = static
-    hls_config['LayerName'][layer_name]['Strategy'] = 'Unrolled'
+    hls_config['LayerName'][layer_name]['Strategy'] = 'ResourceUnrolled'
     hls_config['LayerName'][layer_name]['ReuseFactor'] = reuse_factor
-    prj_name = f'hls4mlprj_rnn_unrolled_{layer_name}_static_{int(static)}_{io_type}_{reuse_factor}_{backend}'
+    prj_name = f'hls4mlprj_resource_unrolled_rnn_{layer_name}_static_{int(static)}_{io_type}_{reuse_factor}_{backend}'
     output_dir = str(test_root_path / prj_name)

     hls_model = convert_from_keras_model(
     )

     # Check if strategy was not overridden
-    assert list(hls_model.get_layers())[1].get_attr('strategy') == 'unrolled' if reuse_factor > 1 else 'latency'
+    assert list(hls_model.get_layers())[1].get_attr('strategy') == ('resource_unrolled' if reuse_factor > 1 else 'latency')

     hls_model.compile()

diff --git a/test/pytest/test_pipeline_style.py b/test/pytest/test_pipeline_style.py
old mode 100755
new mode 100644
index f8706fa52c..17d180d487
--- a/test/pytest/test_pipeline_style.py
+++ b/test/pytest/test_pipeline_style.py
@@ -16,10 +16,10 @@
     [
         (1, 'auto', 'io_stream', 'resource', None),  # io_stream should result in DATAFLOW pragma regardless of other params
         (2, 'auto', 'io_stream', 'latency', None),
-        (3, None, 'io_stream', 'unrolled', None),  # None should be interpreted as 'auto'
+        (3, None, 'io_stream', 'resource_unrolled', None),  # None should be interpreted as 'auto'
         (4, 'auto', 'io_parallel', 'resource', None),  # Should end up with DATAFLOW pragma
         (5, 'auto', 'io_parallel', 'latency', None),  # Should end up with PIPELINE pragma
-        (6, 'auto', 'io_parallel', 'unrolled', None),  # Should end up with PIPELINE pragma and II
+        (6, 'auto', 'io_parallel', 'resource_unrolled', None),  # Should end up with PIPELINE pragma and II
         (7, 'pipeline', 'io_stream', 'resource', None),  # Should result in a warning
         (8, 'pipeline', 'io_parallel', 'resource', None),  # Should result in a warning
         (9, 'pipeline', 'io_parallel', 'latency', None),  # No warning

From 97c5347eb2e8cb7cf68a243810e40bce2f0ce24a Mon Sep 17 00:00:00 2001
From: Vladimir Loncar
Date: Mon, 7 Oct 2024 18:33:36 +0200
Subject: [PATCH 167/272] Move optimization API to "dsp_aware_pruning" module
 (new optimization tools coming)

---
 docs/advanced/model_optimization.rst          |  14 +--
 hls4ml/optimization/__init__.py               | 111 +-----------------
 .../dsp_aware_pruning/__init__.py             | 108 +++++++++++++++++
 .../{ => dsp_aware_pruning}/attributes.py     |   4 +-
 .../{ => dsp_aware_pruning}/config.py         |   0
 .../{ => dsp_aware_pruning}/keras/__init__.py |  14 +--
 .../{ => dsp_aware_pruning}/keras/builder.py  |   4 +-
 .../{ => dsp_aware_pruning}/keras/config.py   |   0
 .../{ => dsp_aware_pruning}/keras/masking.py  |   6 +-
 .../keras/reduction.py                        |   2 +-
 .../keras/regularizers.py                     |   2 +-
 .../{ => dsp_aware_pruning}/keras/utils.py    |   0
 .../{ => dsp_aware_pruning}/knapsack.py       |   0
 .../objectives/__init__.py                    |   4 +-
 .../objectives/gpu_objectives.py              |   6 +-
 .../objectives/vivado_objectives.py           |   6 +-
 .../{ => dsp_aware_pruning}/scheduler.py      |   0
 .../test_optimization/test_attributes.py      |   2 +-
 .../test_keras/test_masking.py                |   8 +-
 .../test_keras/test_reduction.py              |   4 +-
 .../test_keras/test_regularizers.py           |   6 +-
 .../test_keras/test_weight_sharing.py         |   8 +-
 .../pytest/test_optimization/test_knapsack.py |   2 +-
 .../test_optimization/test_objectives.py      |   4 +-
 .../test_optimization/test_scheduler.py       |   2 +-
 test/pytest/test_pipeline_style.py            |   0
 26 files changed, 160 insertions(+), 157 deletions(-)
 create mode 100644 hls4ml/optimization/dsp_aware_pruning/__init__.py
 rename hls4ml/optimization/{ => dsp_aware_pruning}/attributes.py (98%)
 rename hls4ml/optimization/{ => dsp_aware_pruning}/config.py (100%)
 rename hls4ml/optimization/{ => dsp_aware_pruning}/keras/__init__.py (96%)
 rename hls4ml/optimization/{ => dsp_aware_pruning}/keras/builder.py (98%)
 rename hls4ml/optimization/{ => dsp_aware_pruning}/keras/config.py (100%)
 rename hls4ml/optimization/{ => dsp_aware_pruning}/keras/masking.py (99%)
 rename hls4ml/optimization/{ => dsp_aware_pruning}/keras/reduction.py (96%)
 rename hls4ml/optimization/{ => dsp_aware_pruning}/keras/regularizers.py (99%)
rename hls4ml/optimization/{ => dsp_aware_pruning}/keras/utils.py (100%) rename hls4ml/optimization/{ => dsp_aware_pruning}/knapsack.py (100%) rename hls4ml/optimization/{ => dsp_aware_pruning}/objectives/__init__.py (97%) rename hls4ml/optimization/{ => dsp_aware_pruning}/objectives/gpu_objectives.py (92%) rename hls4ml/optimization/{ => dsp_aware_pruning}/objectives/vivado_objectives.py (98%) rename hls4ml/optimization/{ => dsp_aware_pruning}/scheduler.py (100%) mode change 100644 => 100755 test/pytest/test_pipeline_style.py diff --git a/docs/advanced/model_optimization.rst b/docs/advanced/model_optimization.rst index c1396b3d20..41132ab619 100644 --- a/docs/advanced/model_optimization.rst +++ b/docs/advanced/model_optimization.rst @@ -13,11 +13,11 @@ The code block below showcases three use cases of the hls4ml Optimization API - from tensorflow.keras.optimizers import Adam from tensorflow.keras.metrics import CategoricalAccuracy from tensorflow.keras.losses import CategoricalCrossentropy - from hls4ml.optimization.keras import optimize_model - from hls4ml.optimization.keras.utils import get_model_sparsity - from hls4ml.optimization.attributes import get_attributes_from_keras_model - from hls4ml.optimization.objectives import ParameterEstimator - from hls4ml.optimization.scheduler import PolynomialScheduler + from hls4ml.optimization.dsp_aware_pruning.keras import optimize_model + from hls4ml.optimization.dsp_aware_pruning.keras.utils import get_model_sparsity + from hls4ml.optimization.dsp_aware_pruning.attributes import get_attributes_from_keras_model + from hls4ml.optimization.dsp_aware_pruning.objectives import ParameterEstimator + from hls4ml.optimization.dsp_aware_pruning.scheduler import PolynomialScheduler # Define baseline model and load data # X_train, y_train = ... # X_val, y_val = ... @@ -75,7 +75,7 @@ To optimize GPU FLOPs, the code is similar to above: .. code-block:: Python - from hls4ml.optimization.objectives.gpu_objectives import GPUFLOPEstimator + from hls4ml.optimization.dsp_aware_pruning.objectives.gpu_objectives import GPUFLOPEstimator # Optimize model # Note the change from ParameterEstimator to GPUFLOPEstimator @@ -98,7 +98,7 @@ Finally, optimizing Vivado DSPs is possible, given a hls4ml config: .. 
code-block:: Python from hls4ml.utils.config import config_from_keras_model - from hls4ml.optimization.objectives.vivado_objectives import VivadoDSPEstimator + from hls4ml.optimization.dsp_aware_pruning.objectives.vivado_objectives import VivadoDSPEstimator # Note the change from optimize_model to optimize_keras_model_for_hls4ml # The function optimize_keras_model_for_hls4ml acts as a wrapper for the function, parsing hls4ml config to model attributes diff --git a/hls4ml/optimization/__init__.py b/hls4ml/optimization/__init__.py index ab51ce1eb3..c626b70c2b 100644 --- a/hls4ml/optimization/__init__.py +++ b/hls4ml/optimization/__init__.py @@ -1,108 +1,3 @@ -import numpy as np - -from hls4ml.optimization.attributes import get_attributes_from_keras_model_and_hls4ml_config -from hls4ml.optimization.keras import optimize_model - -default_regularization_range = np.logspace(-6, -2, num=16).tolist() - - -def optimize_keras_model_for_hls4ml( - keras_model, - hls_config, - objective, - scheduler, - X_train, - y_train, - X_val, - y_val, - batch_size, - epochs, - optimizer, - loss_fn, - validation_metric, - increasing, - rtol, - callbacks=None, - ranking_metric='l1', - local=False, - verbose=False, - rewinding_epochs=1, - cutoff_bad_trials=3, - directory='hls4ml-optimization', - tuner='Bayesian', - knapsack_solver='CBC_MIP', - regularization_range=default_regularization_range, -): - ''' - Top-level function for optimizing a Keras model, given hls4ml config and a hardware objective(s) - - Args: - keras_model (keras.Model): Model to be optimized - hls_config (dict): hls4ml configuration, obtained from hls4ml.utils.config.config_from_keras_model(...) - objective (hls4ml.optimization.objectives.ObjectiveEstimator): - Parameter, hardware or user-defined objective of optimization - scheduler (hls4ml.optimization.scheduler.OptimizationScheduler): - Sparsity scheduler, choose between constant, polynomial and binary - X_train (np.array): Training inputs - y_train (np.array): Training labels - X_val (np.array): Validation inputs - y_val (np.array): Validation labels - batch_size (int): Batch size during training - epochs (int): Maximum number of epochs to fine-tune model, in one iteration of pruning - optimizer (keras.optimizers.Optimizer or equivalent-string description): Optimizer used during training - loss_fn (keras.losses.Loss or equivalent loss description): Loss function used during training - validation_metric (keras.metrics.Metric or equivalent loss description): Validation metric, used as a baseline - increasing (boolean): If the metric improves with increased values; - e.g. 
accuracy -> increasing = True, MSE -> increasing = False - rtol (float): Relative tolerance; - pruning stops when pruned_validation_metric < (or >) rtol * baseline_validation_metric - callbacks (list of keras.callbacks.Callback) Currently not supported, developed in future versions - ranking_metric (string): Metric used for ranking weights and structures; - currently supported l1, l2, saliency and Oracle - local (boolean): Layer-wise or global pruning - verbose (boolean): Display debug logs during model optimization - rewinding_epochs (int): Number of epochs to retrain model without weight freezing, - allows regrowth of previously pruned weights - cutoff_bad_trials (int): After how many bad trials (performance below threshold), - should model pruning / weight sharing stop - directory (string): Directory to store temporary results - tuner (str): Tuning algorithm, choose between Bayesian, Hyperband and None - knapsack_solver (str): Algorithm to solve Knapsack problem when optimizing; - default usually works well; for very large networks, greedy algorithm might be more suitable - regularization_range (list): List of suitable hyperparameters for weight decay - - Returns: - keras.Model: Optimized model - ''' - - # Extract model attributes - model_attributes = get_attributes_from_keras_model_and_hls4ml_config(keras_model, hls_config) - - # Optimize model - return optimize_model( - keras_model, - model_attributes, - objective, - scheduler, - X_train, - y_train, - X_val, - y_val, - batch_size, - epochs, - optimizer, - loss_fn, - validation_metric, - increasing, - rtol, - callbacks=callbacks, - ranking_metric=ranking_metric, - local=local, - verbose=verbose, - rewinding_epochs=rewinding_epochs, - cutoff_bad_trials=cutoff_bad_trials, - directory=directory, - tuner=tuner, - knapsack_solver=knapsack_solver, - regularization_range=regularization_range, - ) +from .dsp_aware_pruning import optimize_keras_model_for_hls4ml # noqa: F401 +from .dsp_aware_pruning.attributes import get_attributes_from_keras_model_and_hls4ml_config # noqa: F401 +from .dsp_aware_pruning.keras import optimize_model # noqa: F401 diff --git a/hls4ml/optimization/dsp_aware_pruning/__init__.py b/hls4ml/optimization/dsp_aware_pruning/__init__.py new file mode 100644 index 0000000000..69e2029e0e --- /dev/null +++ b/hls4ml/optimization/dsp_aware_pruning/__init__.py @@ -0,0 +1,108 @@ +import numpy as np + +from hls4ml.optimization.dsp_aware_pruning.attributes import get_attributes_from_keras_model_and_hls4ml_config +from hls4ml.optimization.dsp_aware_pruning.keras import optimize_model + +default_regularization_range = np.logspace(-6, -2, num=16).tolist() + + +def optimize_keras_model_for_hls4ml( + keras_model, + hls_config, + objective, + scheduler, + X_train, + y_train, + X_val, + y_val, + batch_size, + epochs, + optimizer, + loss_fn, + validation_metric, + increasing, + rtol, + callbacks=None, + ranking_metric='l1', + local=False, + verbose=False, + rewinding_epochs=1, + cutoff_bad_trials=3, + directory='hls4ml-optimization', + tuner='Bayesian', + knapsack_solver='CBC_MIP', + regularization_range=default_regularization_range, +): + ''' + Top-level function for optimizing a Keras model, given hls4ml config and a hardware objective(s) + + Args: + keras_model (keras.Model): Model to be optimized + hls_config (dict): hls4ml configuration, obtained from hls4ml.utils.config.config_from_keras_model(...) 
+ objective (hls4ml.optimization.objectives.ObjectiveEstimator): + Parameter, hardware or user-defined objective of optimization + scheduler (hls4ml.optimization.scheduler.OptimizationScheduler): + Sparsity scheduler, choose between constant, polynomial and binary + X_train (np.array): Training inputs + y_train (np.array): Training labels + X_val (np.array): Validation inputs + y_val (np.array): Validation labels + batch_size (int): Batch size during training + epochs (int): Maximum number of epochs to fine-tune model, in one iteration of pruning + optimizer (keras.optimizers.Optimizer or equivalent-string description): Optimizer used during training + loss_fn (keras.losses.Loss or equivalent loss description): Loss function used during training + validation_metric (keras.metrics.Metric or equivalent loss description): Validation metric, used as a baseline + increasing (boolean): If the metric improves with increased values; + e.g. accuracy -> increasing = True, MSE -> increasing = False + rtol (float): Relative tolerance; + pruning stops when pruned_validation_metric < (or >) rtol * baseline_validation_metric + callbacks (list of keras.callbacks.Callback) Currently not supported, developed in future versions + ranking_metric (string): Metric used for ranking weights and structures; + currently supported l1, l2, saliency and Oracle + local (boolean): Layer-wise or global pruning + verbose (boolean): Display debug logs during model optimization + rewinding_epochs (int): Number of epochs to retrain model without weight freezing, + allows regrowth of previously pruned weights + cutoff_bad_trials (int): After how many bad trials (performance below threshold), + should model pruning / weight sharing stop + directory (string): Directory to store temporary results + tuner (str): Tuning algorithm, choose between Bayesian, Hyperband and None + knapsack_solver (str): Algorithm to solve Knapsack problem when optimizing; + default usually works well; for very large networks, greedy algorithm might be more suitable + regularization_range (list): List of suitable hyperparameters for weight decay + + Returns: + keras.Model: Optimized model + ''' + + # Extract model attributes + model_attributes = get_attributes_from_keras_model_and_hls4ml_config(keras_model, hls_config) + + # Optimize model + return optimize_model( + keras_model, + model_attributes, + objective, + scheduler, + X_train, + y_train, + X_val, + y_val, + batch_size, + epochs, + optimizer, + loss_fn, + validation_metric, + increasing, + rtol, + callbacks=callbacks, + ranking_metric=ranking_metric, + local=local, + verbose=verbose, + rewinding_epochs=rewinding_epochs, + cutoff_bad_trials=cutoff_bad_trials, + directory=directory, + tuner=tuner, + knapsack_solver=knapsack_solver, + regularization_range=regularization_range, + ) diff --git a/hls4ml/optimization/attributes.py b/hls4ml/optimization/dsp_aware_pruning/attributes.py similarity index 98% rename from hls4ml/optimization/attributes.py rename to hls4ml/optimization/dsp_aware_pruning/attributes.py index a7b6d74135..f652f27d50 100644 --- a/hls4ml/optimization/attributes.py +++ b/hls4ml/optimization/dsp_aware_pruning/attributes.py @@ -2,8 +2,8 @@ import hls4ml from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType -from hls4ml.optimization.config import SUPPORTED_STRUCTURES -from hls4ml.optimization.keras.config import SUPPORTED_LAYERS +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.keras.config import 
SUPPORTED_LAYERS class hls4mlAttributes: diff --git a/hls4ml/optimization/config.py b/hls4ml/optimization/dsp_aware_pruning/config.py similarity index 100% rename from hls4ml/optimization/config.py rename to hls4ml/optimization/dsp_aware_pruning/config.py diff --git a/hls4ml/optimization/keras/__init__.py b/hls4ml/optimization/dsp_aware_pruning/keras/__init__.py similarity index 96% rename from hls4ml/optimization/keras/__init__.py rename to hls4ml/optimization/dsp_aware_pruning/keras/__init__.py index d67ddd5d26..29012bd39e 100644 --- a/hls4ml/optimization/keras/__init__.py +++ b/hls4ml/optimization/dsp_aware_pruning/keras/__init__.py @@ -7,13 +7,13 @@ # Enables printing of loss tensors during custom training loop from tensorflow.python.ops.numpy_ops import np_config -import hls4ml.optimization.keras.utils as utils -from hls4ml.optimization.config import SUPPORTED_STRUCTURES -from hls4ml.optimization.keras.builder import build_optimizable_model, remove_custom_regularizers -from hls4ml.optimization.keras.config import SUPPORTED_LAYERS, SUPPORTED_METRICS, TMP_DIRECTORY -from hls4ml.optimization.keras.masking import get_model_masks -from hls4ml.optimization.keras.reduction import reduce_model -from hls4ml.optimization.scheduler import OptimizationScheduler +import hls4ml.optimization.dsp_aware_pruning.keras.utils as utils +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.keras.builder import build_optimizable_model, remove_custom_regularizers +from hls4ml.optimization.dsp_aware_pruning.keras.config import SUPPORTED_LAYERS, SUPPORTED_METRICS, TMP_DIRECTORY +from hls4ml.optimization.dsp_aware_pruning.keras.masking import get_model_masks +from hls4ml.optimization.dsp_aware_pruning.keras.reduction import reduce_model +from hls4ml.optimization.dsp_aware_pruning.scheduler import OptimizationScheduler np_config.enable_numpy_behavior() default_regularization_range = np.logspace(-6, -2, num=16).tolist() diff --git a/hls4ml/optimization/keras/builder.py b/hls4ml/optimization/dsp_aware_pruning/keras/builder.py similarity index 98% rename from hls4ml/optimization/keras/builder.py rename to hls4ml/optimization/dsp_aware_pruning/keras/builder.py index f265ccdf48..4ba39e4f7b 100644 --- a/hls4ml/optimization/keras/builder.py +++ b/hls4ml/optimization/dsp_aware_pruning/keras/builder.py @@ -8,8 +8,8 @@ from tensorflow.keras.callbacks import EarlyStopping from tensorflow.keras.layers import Conv2D, Dense -from hls4ml.optimization.keras.config import SUPPORTED_LAYERS, TMP_DIRECTORY -from hls4ml.optimization.keras.regularizers import Conv2DRegularizer, DenseRegularizer +from hls4ml.optimization.dsp_aware_pruning.keras.config import SUPPORTED_LAYERS, TMP_DIRECTORY +from hls4ml.optimization.dsp_aware_pruning.keras.regularizers import Conv2DRegularizer, DenseRegularizer co = {} _add_supported_quantized_objects(co) diff --git a/hls4ml/optimization/keras/config.py b/hls4ml/optimization/dsp_aware_pruning/keras/config.py similarity index 100% rename from hls4ml/optimization/keras/config.py rename to hls4ml/optimization/dsp_aware_pruning/keras/config.py diff --git a/hls4ml/optimization/keras/masking.py b/hls4ml/optimization/dsp_aware_pruning/keras/masking.py similarity index 99% rename from hls4ml/optimization/keras/masking.py rename to hls4ml/optimization/dsp_aware_pruning/keras/masking.py index 0e74997be8..dddeddf6f7 100644 --- a/hls4ml/optimization/keras/masking.py +++ b/hls4ml/optimization/dsp_aware_pruning/keras/masking.py @@ -6,9 +6,9 @@ from 
qkeras import QConv2D, QDense from tensorflow.keras.layers import Conv2D, Dense -from hls4ml.optimization.config import SUPPORTED_STRUCTURES -from hls4ml.optimization.keras.config import SUPPORTED_LAYERS, SUPPORTED_METRICS -from hls4ml.optimization.knapsack import solve_knapsack +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.keras.config import SUPPORTED_LAYERS, SUPPORTED_METRICS +from hls4ml.optimization.dsp_aware_pruning.knapsack import solve_knapsack def get_model_masks( diff --git a/hls4ml/optimization/keras/reduction.py b/hls4ml/optimization/dsp_aware_pruning/keras/reduction.py similarity index 96% rename from hls4ml/optimization/keras/reduction.py rename to hls4ml/optimization/dsp_aware_pruning/keras/reduction.py index 4ea8855aa8..12fb534799 100644 --- a/hls4ml/optimization/keras/reduction.py +++ b/hls4ml/optimization/dsp_aware_pruning/keras/reduction.py @@ -2,7 +2,7 @@ from tensorflow.keras.layers import Conv2D, Dense from tensorflow.keras.models import Sequential -from hls4ml.optimization.keras.utils import get_last_layer_with_weights +from hls4ml.optimization.dsp_aware_pruning.keras.utils import get_last_layer_with_weights def reduce_model(model): diff --git a/hls4ml/optimization/keras/regularizers.py b/hls4ml/optimization/dsp_aware_pruning/keras/regularizers.py similarity index 99% rename from hls4ml/optimization/keras/regularizers.py rename to hls4ml/optimization/dsp_aware_pruning/keras/regularizers.py index 1e885963c2..b42eb3f056 100644 --- a/hls4ml/optimization/keras/regularizers.py +++ b/hls4ml/optimization/dsp_aware_pruning/keras/regularizers.py @@ -1,7 +1,7 @@ import numpy as np import tensorflow as tf -from hls4ml.optimization.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES @tf.keras.utils.register_keras_serializable(name='DenseRegularizer') diff --git a/hls4ml/optimization/keras/utils.py b/hls4ml/optimization/dsp_aware_pruning/keras/utils.py similarity index 100% rename from hls4ml/optimization/keras/utils.py rename to hls4ml/optimization/dsp_aware_pruning/keras/utils.py diff --git a/hls4ml/optimization/knapsack.py b/hls4ml/optimization/dsp_aware_pruning/knapsack.py similarity index 100% rename from hls4ml/optimization/knapsack.py rename to hls4ml/optimization/dsp_aware_pruning/knapsack.py diff --git a/hls4ml/optimization/objectives/__init__.py b/hls4ml/optimization/dsp_aware_pruning/objectives/__init__.py similarity index 97% rename from hls4ml/optimization/objectives/__init__.py rename to hls4ml/optimization/dsp_aware_pruning/objectives/__init__.py index fcbef305b6..45204aaf73 100644 --- a/hls4ml/optimization/objectives/__init__.py +++ b/hls4ml/optimization/dsp_aware_pruning/objectives/__init__.py @@ -3,8 +3,8 @@ import numpy as np -from hls4ml.optimization.attributes import OptimizationAttributes -from hls4ml.optimization.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.attributes import OptimizationAttributes +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES ''' Pruning & weight sharing are formulated as an optimization problem, with the aim of minimizing some metric diff --git a/hls4ml/optimization/objectives/gpu_objectives.py b/hls4ml/optimization/dsp_aware_pruning/objectives/gpu_objectives.py similarity index 92% rename from hls4ml/optimization/objectives/gpu_objectives.py rename to hls4ml/optimization/dsp_aware_pruning/objectives/gpu_objectives.py index 
8528a31839..bb3afc6397 100644 --- a/hls4ml/optimization/objectives/gpu_objectives.py +++ b/hls4ml/optimization/dsp_aware_pruning/objectives/gpu_objectives.py @@ -2,9 +2,9 @@ import numpy as np -from hls4ml.optimization.attributes import OptimizationAttributes -from hls4ml.optimization.config import SUPPORTED_STRUCTURES -from hls4ml.optimization.objectives import ObjectiveEstimator +from hls4ml.optimization.dsp_aware_pruning.attributes import OptimizationAttributes +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.objectives import ObjectiveEstimator class GPUFLOPEstimator(ObjectiveEstimator): diff --git a/hls4ml/optimization/objectives/vivado_objectives.py b/hls4ml/optimization/dsp_aware_pruning/objectives/vivado_objectives.py similarity index 98% rename from hls4ml/optimization/objectives/vivado_objectives.py rename to hls4ml/optimization/dsp_aware_pruning/objectives/vivado_objectives.py index c0c0c33e09..798542cfc0 100644 --- a/hls4ml/optimization/objectives/vivado_objectives.py +++ b/hls4ml/optimization/dsp_aware_pruning/objectives/vivado_objectives.py @@ -3,9 +3,9 @@ import numpy as np -from hls4ml.optimization.attributes import OptimizationAttributes -from hls4ml.optimization.config import SUPPORTED_STRUCTURES -from hls4ml.optimization.objectives import ObjectiveEstimator +from hls4ml.optimization.dsp_aware_pruning.attributes import OptimizationAttributes +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.objectives import ObjectiveEstimator # Optimizes DSP utilisation for Vivado backend diff --git a/hls4ml/optimization/scheduler.py b/hls4ml/optimization/dsp_aware_pruning/scheduler.py similarity index 100% rename from hls4ml/optimization/scheduler.py rename to hls4ml/optimization/dsp_aware_pruning/scheduler.py diff --git a/test/pytest/test_optimization/test_attributes.py b/test/pytest/test_optimization/test_attributes.py index 3ba8d08d14..a42d3a6751 100644 --- a/test/pytest/test_optimization/test_attributes.py +++ b/test/pytest/test_optimization/test_attributes.py @@ -1,7 +1,7 @@ from tensorflow.keras.layers import Conv2D, Dense, Flatten, ReLU from tensorflow.keras.models import Sequential -from hls4ml.optimization.attributes import get_attributes_from_keras_model_and_hls4ml_config +from hls4ml.optimization import get_attributes_from_keras_model_and_hls4ml_config from hls4ml.utils.config import config_from_keras_model diff --git a/test/pytest/test_optimization/test_keras/test_masking.py b/test/pytest/test_optimization/test_keras/test_masking.py index 5c5e60aca7..8b465d8d7e 100644 --- a/test/pytest/test_optimization/test_keras/test_masking.py +++ b/test/pytest/test_optimization/test_keras/test_masking.py @@ -4,10 +4,10 @@ from tensorflow.keras.layers import Conv2D, Dense, Flatten from tensorflow.keras.models import Sequential -from hls4ml.optimization.attributes import get_attributes_from_keras_model -from hls4ml.optimization.config import SUPPORTED_STRUCTURES -from hls4ml.optimization.keras.masking import get_model_masks -from hls4ml.optimization.objectives import ParameterEstimator +from hls4ml.optimization.dsp_aware_pruning.attributes import get_attributes_from_keras_model +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.keras.masking import get_model_masks +from hls4ml.optimization.dsp_aware_pruning.objectives import ParameterEstimator ''' In all the tests, an 
artifical network with one Dense/Conv2D layer and pre-determined weights is created diff --git a/test/pytest/test_optimization/test_keras/test_reduction.py b/test/pytest/test_optimization/test_keras/test_reduction.py index 7243a9123f..4bf93f7301 100644 --- a/test/pytest/test_optimization/test_keras/test_reduction.py +++ b/test/pytest/test_optimization/test_keras/test_reduction.py @@ -6,8 +6,8 @@ from tensorflow.keras.layers import AveragePooling2D, BatchNormalization, Conv2D, Dense, Flatten, MaxPooling2D, ReLU, Softmax from tensorflow.keras.models import Sequential -from hls4ml.optimization.keras.reduction import reduce_model -from hls4ml.optimization.keras.utils import get_model_sparsity +from hls4ml.optimization.dsp_aware_pruning.keras.reduction import reduce_model +from hls4ml.optimization.dsp_aware_pruning.keras.utils import get_model_sparsity pytest.skip(allow_module_level=True) diff --git a/test/pytest/test_optimization/test_keras/test_regularizers.py b/test/pytest/test_optimization/test_keras/test_regularizers.py index 9fe518caae..f643f3a79a 100644 --- a/test/pytest/test_optimization/test_keras/test_regularizers.py +++ b/test/pytest/test_optimization/test_keras/test_regularizers.py @@ -6,9 +6,9 @@ from tensorflow.keras.models import Sequential from tensorflow.keras.optimizers import Adam -from hls4ml.optimization.config import SUPPORTED_STRUCTURES -from hls4ml.optimization.keras.builder import remove_custom_regularizers -from hls4ml.optimization.keras.regularizers import Conv2DRegularizer, DenseRegularizer +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.keras.builder import remove_custom_regularizers +from hls4ml.optimization.dsp_aware_pruning.keras.regularizers import Conv2DRegularizer, DenseRegularizer # Constants pattern_offset = 4 diff --git a/test/pytest/test_optimization/test_keras/test_weight_sharing.py b/test/pytest/test_optimization/test_keras/test_weight_sharing.py index c274a84da8..be1d3a957f 100644 --- a/test/pytest/test_optimization/test_keras/test_weight_sharing.py +++ b/test/pytest/test_optimization/test_keras/test_weight_sharing.py @@ -4,10 +4,10 @@ from tensorflow.keras.layers import Dense from tensorflow.keras.models import Sequential -from hls4ml.optimization.attributes import get_attributes_from_keras_model -from hls4ml.optimization.config import SUPPORTED_STRUCTURES -from hls4ml.optimization.keras.masking import get_model_masks -from hls4ml.optimization.objectives import ObjectiveEstimator +from hls4ml.optimization.dsp_aware_pruning.attributes import get_attributes_from_keras_model +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.keras.masking import get_model_masks +from hls4ml.optimization.dsp_aware_pruning.objectives import ObjectiveEstimator # Similar tests in test_masking.py, weight sharing instead of pruning sparsity = 0.33 diff --git a/test/pytest/test_optimization/test_knapsack.py b/test/pytest/test_optimization/test_knapsack.py index a4145c00d0..804081c8e8 100644 --- a/test/pytest/test_optimization/test_knapsack.py +++ b/test/pytest/test_optimization/test_knapsack.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from hls4ml.optimization.knapsack import solve_knapsack +from hls4ml.optimization.dsp_aware_pruning.knapsack import solve_knapsack # In the simple case below, both implementations give the optimal answer diff --git a/test/pytest/test_optimization/test_objectives.py 
b/test/pytest/test_optimization/test_objectives.py index a7d81befe6..2f8a6414da 100644 --- a/test/pytest/test_optimization/test_objectives.py +++ b/test/pytest/test_optimization/test_objectives.py @@ -2,8 +2,8 @@ from tensorflow.keras.layers import Conv2D, Dense, Flatten from tensorflow.keras.models import Sequential -from hls4ml.optimization.attributes import get_attributes_from_keras_model -from hls4ml.optimization.objectives import ParameterEstimator +from hls4ml.optimization.dsp_aware_pruning.attributes import get_attributes_from_keras_model +from hls4ml.optimization.dsp_aware_pruning.objectives import ParameterEstimator # Test attempts to verify one of the estimators (parameter) is correctly declared, the functions are static etc. diff --git a/test/pytest/test_optimization/test_scheduler.py b/test/pytest/test_optimization/test_scheduler.py index 2dc7642bf6..2182d1cb46 100644 --- a/test/pytest/test_optimization/test_scheduler.py +++ b/test/pytest/test_optimization/test_scheduler.py @@ -1,6 +1,6 @@ import numpy as np # Use np.testing.assert_allclose due to floating point rounding errors -from hls4ml.optimization.scheduler import BinaryScheduler, ConstantScheduler, PolynomialScheduler +from hls4ml.optimization.dsp_aware_pruning.scheduler import BinaryScheduler, ConstantScheduler, PolynomialScheduler def test_constant_scheduler(): diff --git a/test/pytest/test_pipeline_style.py b/test/pytest/test_pipeline_style.py old mode 100644 new mode 100755 From 5fbdae817faab6dbb90316b0c7a834e63f37a9c3 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 8 Oct 2024 21:24:36 +0200 Subject: [PATCH 168/272] Hardcode weights loading (ensures weights loading works from any dir) --- hls4ml/model/graph.py | 36 ++++++++----------- .../templates/catapult/myproject_bridge.cpp | 2 +- hls4ml/writer/catapult_writer.py | 3 ++ hls4ml/writer/vivado_writer.py | 3 ++ 4 files changed, 21 insertions(+), 23 deletions(-) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index d0a1fdf7fc..cfbfb183d6 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -805,32 +805,24 @@ def predict(self, x): n_inputs = len(self.get_input_variables()) n_outputs = len(self.get_output_variables()) - curr_dir = os.getcwd() - os.chdir(self.config.get_output_dir() + '/firmware') - output = [] if n_samples == 1 and n_inputs == 1: x = [x] - try: - for i in range(n_samples): - predictions = [np.zeros(yj.size(), dtype=ctype) for yj in self.get_output_variables()] - if n_inputs == 1: - inp = [np.asarray(x[i])] - else: - inp = [np.asarray(xj[i]) for xj in x] - argtuple = inp - argtuple += predictions - argtuple = tuple(argtuple) - top_function(*argtuple) - output.append(predictions) - - # Convert to list of numpy arrays (one for each output) - output = [ - np.asarray([output[i_sample][i_output] for i_sample in range(n_samples)]) for i_output in range(n_outputs) - ] - finally: - os.chdir(curr_dir) + for i in range(n_samples): + predictions = [np.zeros(yj.size(), dtype=ctype) for yj in self.get_output_variables()] + if n_inputs == 1: + inp = [np.asarray(x[i])] + else: + inp = [np.asarray(xj[i]) for xj in x] + argtuple = inp + argtuple += predictions + argtuple = tuple(argtuple) + top_function(*argtuple) + output.append(predictions) + + # Convert to list of numpy arrays (one for each output) + output = [np.asarray([output[i_sample][i_output] for i_sample in range(n_samples)]) for i_output in range(n_outputs)] if n_samples == 1 and n_outputs == 1: return output[0][0] diff --git a/hls4ml/templates/catapult/myproject_bridge.cpp 
b/hls4ml/templates/catapult/myproject_bridge.cpp index f1326a1faf..9937adcf89 100755 --- a/hls4ml/templates/catapult/myproject_bridge.cpp +++ b/hls4ml/templates/catapult/myproject_bridge.cpp @@ -6,7 +6,7 @@ #include #include -static std::string s_weights_dir = "weights"; +// hls-fpga-machine-learning insert weights dir const char *get_weights_dir() { return s_weights_dir.c_str(); } diff --git a/hls4ml/writer/catapult_writer.py b/hls4ml/writer/catapult_writer.py index 396ecb968e..7db1063206 100755 --- a/hls4ml/writer/catapult_writer.py +++ b/hls4ml/writer/catapult_writer.py @@ -676,6 +676,9 @@ def write_bridge(self, model): newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) elif 'myproject' in line: newline = line.replace('myproject', format(model.config.get_project_name())) + elif '// hls-fpga-machine-learning insert weights dir' in line: + weights_dir = (Path(fout.name).parent / 'firmware/weights').resolve() + newline = f'static std::string s_weights_dir = "{weights_dir}";\n' elif '// hls-fpga-machine-learning insert bram' in line: newline = line for bram in model_brams: diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index e4c0c24551..1d88c13de5 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -725,10 +725,13 @@ def write_build_script(self, model): # build_lib.sh build_lib_src = (filedir / '../templates/vivado/build_lib.sh').resolve() build_lib_dst = Path(f'{model.config.get_output_dir()}/build_lib.sh').resolve() + weights_dir = (build_lib_dst.parent / 'firmware/weights').resolve() with open(build_lib_src) as src, open(build_lib_dst, 'w') as dst: for line in src.readlines(): line = line.replace('myproject', model.config.get_project_name()) line = line.replace('mystamp', model.config.get_config_value('Stamp')) + if line.startswith('WEIGHTS_DIR='): + line = f'WEIGHTS_DIR=\\""{weights_dir}\\""\n' dst.write(line) build_lib_dst.chmod(build_lib_dst.stat().st_mode | stat.S_IEXEC) From a6a5c7f9a44848de88c59271f9d3298608c5bc4c Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 9 Oct 2024 18:52:55 -0700 Subject: [PATCH 169/272] add flow --- hls4ml/backends/vivado/vivado_backend.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 982fa2ce87..694cb503fe 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -115,6 +115,7 @@ def _register_flows(self): 'vivado:generate_conv_streaming_instructions', 'vivado:apply_resource_strategy', 'vivado:generate_conv_im2col', + 'vivado:generate_pointwise_conv1_d', ] vivado_types_flow = register_flow('specific_types', vivado_types, requires=[init_flow], backend=self.name) From 170999fae963dba6bf4091a8af60f16b17dfb96a Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 10 Oct 2024 07:14:46 -0700 Subject: [PATCH 170/272] div roundup --- example-models | 2 +- hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h | 4 +--- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 4 +--- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/example-models b/example-models index 3cfbcfd062..ff74f73dbc 160000 --- a/example-models +++ b/example-models @@ -1 +1 @@ -Subproject commit 3cfbcfd062f60492507d21ff0e91559b3bdd6550 +Subproject commit ff74f73dbc253d1aa7de1603ee10ede551919548 diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h index 
3fd6160f4f..bfe675ce12 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h @@ -107,9 +107,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - int multiplier_limit = - ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / - float(CONFIG_T::reuse_factor)); + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor, CONFIG_T::reuse_factor); #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit // Convolve, saving all multiplication results to accumulate later diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 8fb9f769f4..6f23976799 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -106,9 +106,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - int multiplier_limit = - ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / - float(CONFIG_T::reuse_factor)); + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor, CONFIG_T::reuse_factor); #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit // Convolve, saving all multiplication results to accumulate later From 308af4ed6f992617594528b16664306cc928714b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 14 Oct 2024 20:40:33 +0000 Subject: [PATCH 171/272] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 24.8.0 → 24.10.0](https://github.com/psf/black/compare/24.8.0...24.10.0) - [github.com/pre-commit/pre-commit-hooks: v4.6.0 → v5.0.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.6.0...v5.0.0) - [github.com/asottile/pyupgrade: v3.17.0 → v3.18.0](https://github.com/asottile/pyupgrade/compare/v3.17.0...v3.18.0) - [github.com/asottile/setup-cfg-fmt: v2.5.0 → v2.7.0](https://github.com/asottile/setup-cfg-fmt/compare/v2.5.0...v2.7.0) - [github.com/mgedmin/check-manifest: 0.49 → 0.50](https://github.com/mgedmin/check-manifest/compare/0.49...0.50) --- .pre-commit-config.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1b3d872190..8ef3dd41d5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ exclude: (^hls4ml\/templates\/(vivado|quartus)\/(ap_types|ac_types)\/|^test/pyte repos: - repo: https://github.com/psf/black - rev: 24.8.0 + rev: 24.10.0 hooks: - id: black language_version: python3 @@ -10,7 +10,7 @@ repos: '--skip-string-normalization'] - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: check-added-large-files - id: check-case-conflict @@ -30,13 +30,13 @@ repos: args: ["--profile", "black", --line-length=125] - repo: https://github.com/asottile/pyupgrade - rev: v3.17.0 + rev: v3.18.0 hooks: 
- id: pyupgrade args: ["--py36-plus"] - repo: https://github.com/asottile/setup-cfg-fmt - rev: v2.5.0 + rev: v2.7.0 hooks: - id: setup-cfg-fmt @@ -50,7 +50,7 @@ repos: '--extend-ignore=E203,T201'] # E203 is not PEP8 compliant - repo: https://github.com/mgedmin/check-manifest - rev: "0.49" + rev: "0.50" hooks: - id: check-manifest stages: [manual] From 4ec63876dbbd8643f195164f375882e329f27859 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 17 Oct 2024 13:50:27 -0700 Subject: [PATCH 172/272] update --- hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h | 7 +------ hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h | 7 +------ 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h index 1b66c646af..1c268ed588 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h @@ -58,12 +58,7 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], if (CONFIG_T::strategy == nnet::latency) { if (CONFIG_T::implementation == conv_implementation::pointwise) { // Use pointwise unrolled implementation - if (CONFIG_T::reuse_factor > 1) { - CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); - } else { - assert(CONFIG_T::reuse_factor == 1); - pointwise_conv_1d_latency_cl(data, res, weights, biases); - } + CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); } else { // Use standard unrolled implementation conv_1d_latency_cl(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index 7cceabfe1b..95d5d7fcce 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -56,12 +56,7 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], if (CONFIG_T::strategy == nnet::latency) { if (CONFIG_T::implementation == conv_implementation::pointwise) { // Use pointwise unrolled implementation - if (CONFIG_T::reuse_factor > 1) { - CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); - } else { - assert(CONFIG_T::reuse_factor == 1); - pointwise_conv_1d_latency_cl(data, res, weights, biases); - } + CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); } else { // Use standard unrolled implementation conv_1d_latency_cl(data, res, weights, biases); From aaab34a72c13ae3974fcd80664d0494fe2adfcbb Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Tue, 22 Oct 2024 13:21:25 -0400 Subject: [PATCH 173/272] fix softmax parsing in pytorch and add test --- hls4ml/converters/pytorch/core.py | 19 ++++++++++++++----- test/pytest/test_pytorch_api.py | 9 +++++++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/hls4ml/converters/pytorch/core.py b/hls4ml/converters/pytorch/core.py index c56857715a..d6b46d93a3 100644 --- a/hls4ml/converters/pytorch/core.py +++ b/hls4ml/converters/pytorch/core.py @@ -61,17 +61,21 @@ def parse_activation_layer(operation, layer_name, input_names, input_shapes, nod layer['class_name'] = 'ThresholdedReLU' layer['activation'] = 'ThresholdedReLU' if layer['activ_param'] < 0: - raise Exception('negative threshold values not supported') - - if hasattr(node, 'dim'): + raise Exception('negative threshold values not supported') + if hasattr(class_object, 'dim'): layer['axis'] = class_object.dim + if layer['class_name'] == 'Softmax' and 
layer['axis'] is None: + layer['axis'] = -1 + if 'IOType' in config: + if layer['class_name'] == 'Softmax' and config['IOType'] == 'io_stream' and layer['axis'] != -1: + raise Exception('dim needs to be -1 for io_stream') else: if layer['class_name'] in ['ReLU', 'Sigmoid', 'Tanh']: layer['class_name'] = 'Activation' if layer['class_name'] == 'LeakyReLU': layer['activ_param'] = node.kwargs['negative_slope'] if layer['class_name'] == 'ELU': - layer['activ_param'] = node.kwargs['alpha'] + layer['activ_param'] = node.kwargs['alpha'] if layer['class_name'] == 'Threshold': layer['activ_param'] = node.args[1] if layer['activ_param'] < 0: @@ -80,7 +84,12 @@ def parse_activation_layer(operation, layer_name, input_names, input_shapes, nod layer['activation'] = 'ThresholdedReLU' if 'dim' in node.kwargs: layer['axis'] = node.kwargs['dim'] - + if layer['class_name'] == 'Softmax' and layer['axis'] is None: + layer['axis'] = -1 + if 'IOType' in config: + if layer['class_name'] == 'Softmax' and config['IOType'] == 'io_stream' and layer['axis'] != -1: + raise Exception('dim needs to be -1 for io_stream') + output_shape = input_shapes[0] return layer, output_shape diff --git a/test/pytest/test_pytorch_api.py b/test/pytest/test_pytorch_api.py index fee7b9a3aa..6d558d4b35 100644 --- a/test/pytest/test_pytorch_api.py +++ b/test/pytest/test_pytorch_api.py @@ -63,6 +63,7 @@ def test_linear(backend, io_type): @pytest.mark.parametrize( "activation_function", [ + nn.Softmax(dim=-1), nn.ReLU(), nn.Tanh(), nn.LeakyReLU(negative_slope=1.0), @@ -74,6 +75,7 @@ def test_linear(backend, io_type): ) @pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) + def test_activations(activation_function, backend, io_type): model = torch.nn.Sequential(nn.Linear(1, 1), activation_function).to() model.eval() @@ -118,6 +120,12 @@ def __init__(self): def forward(self, x): return nn.functional.relu(x) +class SoftmaxModel(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return nn.functional.softmax(x,dim=-1) class TanHModel(nn.Module): def __init__(self): @@ -162,6 +170,7 @@ def forward(self, x): @pytest.mark.parametrize( "activation_function", [ + SoftmaxModel(), ReLuModel(), TanHModel(), LeakyReLuModel(), From 655aef6a22253b9c5ba35fb01bed5216e6e4fc10 Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Tue, 22 Oct 2024 14:17:52 -0400 Subject: [PATCH 174/272] precommit --- hls4ml/converters/pytorch/core.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/hls4ml/converters/pytorch/core.py b/hls4ml/converters/pytorch/core.py index d6b46d93a3..2c05b7501f 100644 --- a/hls4ml/converters/pytorch/core.py +++ b/hls4ml/converters/pytorch/core.py @@ -61,21 +61,21 @@ def parse_activation_layer(operation, layer_name, input_names, input_shapes, nod layer['class_name'] = 'ThresholdedReLU' layer['activation'] = 'ThresholdedReLU' if layer['activ_param'] < 0: - raise Exception('negative threshold values not supported') + raise Exception('negative threshold values not supported') if hasattr(class_object, 'dim'): layer['axis'] = class_object.dim if layer['class_name'] == 'Softmax' and layer['axis'] is None: - layer['axis'] = -1 - if 'IOType' in config: - if layer['class_name'] == 'Softmax' and config['IOType'] == 'io_stream' and layer['axis'] != -1: - raise Exception('dim needs to be -1 for io_stream') + layer['axis'] = -1 + if 'IOType' in config: + if layer['class_name'] == 'Softmax' and 
config['IOType'] == 'io_stream' and layer['axis'] != -1: + raise Exception('dim needs to be -1 for io_stream') else: if layer['class_name'] in ['ReLU', 'Sigmoid', 'Tanh']: layer['class_name'] = 'Activation' if layer['class_name'] == 'LeakyReLU': layer['activ_param'] = node.kwargs['negative_slope'] if layer['class_name'] == 'ELU': - layer['activ_param'] = node.kwargs['alpha'] + layer['activ_param'] = node.kwargs['alpha'] if layer['class_name'] == 'Threshold': layer['activ_param'] = node.args[1] if layer['activ_param'] < 0: @@ -85,11 +85,11 @@ def parse_activation_layer(operation, layer_name, input_names, input_shapes, nod if 'dim' in node.kwargs: layer['axis'] = node.kwargs['dim'] if layer['class_name'] == 'Softmax' and layer['axis'] is None: - layer['axis'] = -1 - if 'IOType' in config: - if layer['class_name'] == 'Softmax' and config['IOType'] == 'io_stream' and layer['axis'] != -1: - raise Exception('dim needs to be -1 for io_stream') - + layer['axis'] = -1 + if 'IOType' in config: + if layer['class_name'] == 'Softmax' and config['IOType'] == 'io_stream' and layer['axis'] != -1: + raise Exception('dim needs to be -1 for io_stream') + output_shape = input_shapes[0] return layer, output_shape From 61695b67a01c036bb7c10bf90b9328e6a296a9fc Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Tue, 22 Oct 2024 14:21:25 -0400 Subject: [PATCH 175/272] precommit v2 --- test/pytest/test_pytorch_api.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/pytest/test_pytorch_api.py b/test/pytest/test_pytorch_api.py index 6d558d4b35..c2add87d6e 100644 --- a/test/pytest/test_pytorch_api.py +++ b/test/pytest/test_pytorch_api.py @@ -75,7 +75,6 @@ def test_linear(backend, io_type): ) @pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) - def test_activations(activation_function, backend, io_type): model = torch.nn.Sequential(nn.Linear(1, 1), activation_function).to() model.eval() @@ -120,12 +119,14 @@ def __init__(self): def forward(self, x): return nn.functional.relu(x) + class SoftmaxModel(nn.Module): def __init__(self): super().__init__() def forward(self, x): - return nn.functional.softmax(x,dim=-1) + return nn.functional.softmax(x, dim=-1) + class TanHModel(nn.Module): def __init__(self): From a306e3f30a2316fc8e94a049f13c36417b6b422d Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Tue, 22 Oct 2024 16:09:40 -0400 Subject: [PATCH 176/272] add small tweak to fix issue 1054 --- hls4ml/model/optimizer/passes/convert_to_channels_last.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hls4ml/model/optimizer/passes/convert_to_channels_last.py b/hls4ml/model/optimizer/passes/convert_to_channels_last.py index a3b861ddfe..0b5f12c008 100644 --- a/hls4ml/model/optimizer/passes/convert_to_channels_last.py +++ b/hls4ml/model/optimizer/passes/convert_to_channels_last.py @@ -94,7 +94,11 @@ def transform(self, model, node): node.add_output_variable(shape, dims) # Have to transpose back before flattening to get correct order of elements in the flattened tensor - if isinstance(node, Reshape) and len(node.attributes['target_shape']) == 1: + if ( + isinstance(node, Reshape) + and len(node.attributes['target_shape']) == 1 + and not model.config.config['HLSConfig']['Model']['ChannelsLastConversion'] == "internal" + ): previous_node = node.get_input_node(node.inputs[0]) input = previous_node.name outshape = previous_node.get_output_variable().shape From 
583a8c2be59bec87a3377a2567c4e8d7a75754fe Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 23 Oct 2024 00:47:44 -0500 Subject: [PATCH 177/272] In softmax, make axis -1 if it's a positive index that's identical --- hls4ml/converters/onnx/core.py | 3 + test/pytest/test_qonnx.py | 114 +++++++++++++++++++++++---------- 2 files changed, 83 insertions(+), 34 deletions(-) diff --git a/hls4ml/converters/onnx/core.py b/hls4ml/converters/onnx/core.py index d84ba98a95..8ad851426d 100644 --- a/hls4ml/converters/onnx/core.py +++ b/hls4ml/converters/onnx/core.py @@ -62,6 +62,9 @@ def parse_activation_layer(node, input_names, input_shapes, graph): if layer['class_name'] == 'Softmax': layer['activation'] = 'softmax' layer['axis'] = get_onnx_attribute(node, 'axis', -1) + # because -1 is better supported than an explicit index, check if it's the same + if layer['axis'] == len(input_shapes[0]) - 1: + layer['axis'] = -1 elif layer['class_name'] in ['ELU', 'LeakyReLU', 'ThresholdedReLU']: layer['activation'] = layer['class_name'] diff --git a/test/pytest/test_qonnx.py b/test/pytest/test_qonnx.py index 75c6c95c3f..c35b19f723 100644 --- a/test/pytest/test_qonnx.py +++ b/test/pytest/test_qonnx.py @@ -16,18 +16,7 @@ test_root_path = Path(__file__).parent example_model_path = (test_root_path / '../../example-models').resolve() - -@pytest.fixture(scope='module') -def sep_conv_model(): - """ - Load separabale conv model, already channels-last and cleaned - """ - dl_file = str(example_model_path / "onnx/separable_conv_model_ch_last.onnx") - assert os.path.isfile(dl_file) - - model = ModelWrapper(dl_file) - - return model +# The models @pytest.fixture(scope='module') @@ -97,31 +86,33 @@ def jettagging_model(): return model -@pytest.mark.parametrize('backend', ['Vitis']) -def test_sep_conv(sep_conv_model, backend): - model = sep_conv_model - ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) - X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) - X = (np.round(X * 2**16) * 2**-16).astype(np.float32) - idict = {model.graph.input[0].name: X} - y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] +@pytest.fixture(scope='module') +def sep_conv_model(): + """ + Load separabale conv model, already channels-last and cleaned + """ + dl_file = str(example_model_path / "onnx/separable_conv_model_ch_last.onnx") + assert os.path.isfile(dl_file) - config = hls4ml.utils.config.config_from_onnx_model( - model, granularity='name', backend=backend, default_precision='fixed<32,16>' - ) + model = ModelWrapper(dl_file) - hls_model = hls4ml.converters.convert_from_onnx_model( - model, - output_dir=str(test_root_path / f'hls4mlprj_qonnx_sep_conv_{backend}'), - io_type='io_stream', - backend=backend, - hls_config=config, - ) - hls_model.compile() - y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) + return model + + +@pytest.fixture(scope='module') +def three_layer_keras_model(): + """ + Load a simple, originally keras unquantized model + """ + dl_file = str(example_model_path / "onnx/three_layer_keras.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + model = qonnx.util.cleanup.cleanup_model(model) + return model - np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) - print('test') + +# The actual tests @pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) @@ -197,3 +188,58 @@ def test_jet_tagging(jettagging_model, backend): y_hls4ml = hls_model.predict(X) np.testing.assert_allclose(y_qonnx.ravel(), 
y_hls4ml.ravel(), atol=1e-2, rtol=1) + + +@pytest.mark.parametrize('backend', ['Vitis']) +def test_sep_conv(sep_conv_model, backend): + model = sep_conv_model + ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) + X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) + X = (np.round(X * 2**16) * 2**-16).astype(np.float32) + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + + config = hls4ml.utils.config.config_from_onnx_model( + model, granularity='name', backend=backend, default_precision='fixed<32,16>' + ) + + hls_model = hls4ml.converters.convert_from_onnx_model( + model, + output_dir=str(test_root_path / f'hls4mlprj_qonnx_sep_conv_{backend}'), + io_type='io_stream', + backend=backend, + hls_config=config, + ) + hls_model.compile() + y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) + + np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) + + +@pytest.mark.parametrize('backend', ['Vitis']) +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +def test_three_layer_keras(three_layer_keras_model, io_type, backend): + model = three_layer_keras_model + ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) + X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) + X = (np.round(X * 2**16) * 2**-16).astype(np.float32) + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + + config = hls4ml.utils.config.config_from_onnx_model( + model, granularity='name', backend=backend, default_precision='fixed<32,16>' + ) + + config['LayerName']['Softmax_0']['Implementation'] = 'legacy' + + hls_model = hls4ml.converters.convert_from_onnx_model( + model, + output_dir=str(test_root_path / f'hls4mlprj_onnx_three_layer_keras_{io_type}_{backend}'), + io_type=io_type, + backend=backend, + hls_config=config, + ) + hls_model.compile() + y_hls4ml = hls_model.predict(X) + + np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) From 9cbf0f1c9f81417fe6be4fec7add6bf64690a010 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 23 Oct 2024 15:50:37 -0500 Subject: [PATCH 178/272] add more onnx tests, optimize the handling of some attributes, update example model version --- example-models | 2 +- .../model/optimizer/passes/batchnorm_opt.py | 2 +- .../model/optimizer/passes/conv_to_convxd.py | 2 +- .../passes/conv_to_depthwiseconvxd.py | 2 +- test/pytest/test_qonnx.py | 125 +++++++++++++++++- 5 files changed, 122 insertions(+), 11 deletions(-) diff --git a/example-models b/example-models index 3cfbcfd062..d40894b03f 160000 --- a/example-models +++ b/example-models @@ -1 +1 @@ -Subproject commit 3cfbcfd062f60492507d21ff0e91559b3bdd6550 +Subproject commit d40894b03f840a32da43a5adea0531ffc1db216e diff --git a/hls4ml/model/optimizer/passes/batchnorm_opt.py b/hls4ml/model/optimizer/passes/batchnorm_opt.py index b6c21c7267..e18d79ff4a 100644 --- a/hls4ml/model/optimizer/passes/batchnorm_opt.py +++ b/hls4ml/model/optimizer/passes/batchnorm_opt.py @@ -28,7 +28,7 @@ def transform(self, model, node): if not (len(node.inputs) == 5 and all(node.inputs)): raise ValueError('All 5 BatchNormOnnnx inputs need to be defined') - attributes = {k: node.attributes.get(k, None) for k in _base_attributes} + attributes = {k: node.attributes[k] for k in _base_attributes if k in node.attributes} gamma_node = node.get_input_node(node.inputs[1]) if not isinstance(gamma_node, Constant): diff 
--git a/hls4ml/model/optimizer/passes/conv_to_convxd.py b/hls4ml/model/optimizer/passes/conv_to_convxd.py index 25ac50ba40..3e870e43a6 100644 --- a/hls4ml/model/optimizer/passes/conv_to_convxd.py +++ b/hls4ml/model/optimizer/passes/conv_to_convxd.py @@ -54,7 +54,7 @@ def transform(self, model, node): bias_node = node.get_input_node(node.inputs[2]) # creating the attributes - attributes = {k: node.attributes.get(k, None) for k in _base_attributes} + attributes = {k: node.attributes[k] for k in _base_attributes if k in node.attributes} # The ConvxD nodes expect the weight data to be in a different format, not (M, k1.., C) if node.attributes['n_dim'] == 1: diff --git a/hls4ml/model/optimizer/passes/conv_to_depthwiseconvxd.py b/hls4ml/model/optimizer/passes/conv_to_depthwiseconvxd.py index 26603c6a64..b1271b5784 100644 --- a/hls4ml/model/optimizer/passes/conv_to_depthwiseconvxd.py +++ b/hls4ml/model/optimizer/passes/conv_to_depthwiseconvxd.py @@ -55,7 +55,7 @@ def transform(self, model, node): bias_node = node.get_input_node(node.inputs[2]) # creating the attributes - attributes = {k: node.attributes.get(k, None) for k in _base_attributes} + attributes = {k: node.attributes[k] for k in _base_attributes if k in node.attributes} # The ConvxD nodes expect the weight data to be in a different format, not (M, k1.., C) if node.attributes['n_dim'] == 1: diff --git a/test/pytest/test_qonnx.py b/test/pytest/test_qonnx.py index c35b19f723..f822c591a7 100644 --- a/test/pytest/test_qonnx.py +++ b/test/pytest/test_qonnx.py @@ -10,6 +10,8 @@ # To conveniently run QONNX inference from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.channels_last import ConvertToChannelsLastAndClean +from qonnx.transformation.gemm_to_matmul import GemmToMatMul import hls4ml @@ -99,10 +101,23 @@ def sep_conv_model(): return model +@pytest.fixture(scope='module') +def two_layer_keras_model(): + """ + Load a simple, two-layer, originally keras, unquantized model + """ + dl_file = str(example_model_path / "onnx/two_layer_keras.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + model = qonnx.util.cleanup.cleanup_model(model) + return model + + @pytest.fixture(scope='module') def three_layer_keras_model(): """ - Load a simple, originally keras unquantized model + Load a simple, three-layer, originally keras, unquantized model """ dl_file = str(example_model_path / "onnx/three_layer_keras.onnx") assert os.path.isfile(dl_file) @@ -112,6 +127,84 @@ def three_layer_keras_model(): return model +@pytest.fixture(scope='module') +def two_layer_pytorch_model(): + """ + Load a simple, two-layer, originally pytorch, unquantized model + """ + dl_file = str(example_model_path / "onnx/two_layer_keras.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + model = qonnx.util.cleanup.cleanup_model(model) + model = model.transform(GemmToMatMul()) + model = qonnx.util.cleanup.cleanup_model(model) + return model + + +@pytest.fixture(scope='module') +def three_layer_pytorch_model(): + """ + Load a simple, three-layer, originally pytorch, unquantized model + """ + dl_file = str(example_model_path / "onnx/three_layer_pytorch.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + model = qonnx.util.cleanup.cleanup_model(model) + model = model.transform(GemmToMatMul()) + model = qonnx.util.cleanup.cleanup_model(model) + return model + + +@pytest.fixture(scope='module') +def conv1d_small_keras_model(): + """ + Load a simple conv1d, originally keras, unquantized model + 
""" + dl_file = str(example_model_path / "onnx/conv1d_small_keras.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + model = qonnx.util.cleanup.cleanup_model(model) + model = model.transform(ConvertToChannelsLastAndClean()) + model = model.transform(GemmToMatMul()) + model = qonnx.util.cleanup.cleanup_model(model) + return model + + +@pytest.fixture(scope='module') +def conv2d_small_keras_model(): + """ + Load a simple conv2d, originally keras, unquantized model + """ + dl_file = str(example_model_path / "onnx/conv2d_small_keras.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + model = qonnx.util.cleanup.cleanup_model(model) + model = model.transform(ConvertToChannelsLastAndClean()) + model = model.transform(GemmToMatMul()) + model = qonnx.util.cleanup.cleanup_model(model) + return model + + +@pytest.fixture(scope='module') +def conv2d_small_mp_keras_model(): + """ + Load a conv2d model with max pooling, originally keras, unquantized model + """ + dl_file = str(example_model_path / "onnx/conv2d_small_mp_keras.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + model = qonnx.util.cleanup.cleanup_model(model) + model = model.transform(ConvertToChannelsLastAndClean()) + model = model.transform(GemmToMatMul()) + model = qonnx.util.cleanup.cleanup_model(model) + return model + + # The actual tests @@ -216,25 +309,43 @@ def test_sep_conv(sep_conv_model, backend): np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) +@pytest.mark.parametrize( + 'model_name', + [ + 'two_layer_keras_model', + 'three_layer_keras_model', + 'two_layer_pytorch_model', + 'three_layer_pytorch_model', + 'conv1d_small_keras_model', + 'conv2d_small_keras_model', + 'conv2d_small_mp_keras_model', + ], +) @pytest.mark.parametrize('backend', ['Vitis']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -def test_three_layer_keras(three_layer_keras_model, io_type, backend): - model = three_layer_keras_model +def test_simple_model(model_name, io_type, backend, request): + if model_name == 'conv2d_small_mp_keras_model' and io_type == 'io_stream': + # Not yet supported due to an issue with channels last conversion + # There is a qonnx PR. 
+ pytest.skip() + model = request.getfixturevalue(model_name) ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) - X = (np.round(X * 2**16) * 2**-16).astype(np.float32) + X = (np.round(X * 2**10) * 2**-10).astype(np.float32) idict = {model.graph.input[0].name: X} y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] config = hls4ml.utils.config.config_from_onnx_model( - model, granularity='name', backend=backend, default_precision='fixed<32,16>' + model, granularity='name', backend=backend, default_precision='fixed<16,6>' ) - config['LayerName']['Softmax_0']['Implementation'] = 'legacy' + for layer in config['LayerName']: + if layer.startswith('Softmax'): + config['LayerName'][layer]['Implementation'] = 'legacy' hls_model = hls4ml.converters.convert_from_onnx_model( model, - output_dir=str(test_root_path / f'hls4mlprj_onnx_three_layer_keras_{io_type}_{backend}'), + output_dir=str(test_root_path / f'hls4mlprj_onnx_{model_name}_{io_type}_{backend}'), io_type=io_type, backend=backend, hls_config=config, From 6ca1055728fc82001f911bc9fc4e1a3aa231fea2 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 23 Oct 2024 18:12:24 -0400 Subject: [PATCH 179/272] Figure out the weights dir automatically from the location of build_lib.sh --- hls4ml/templates/vivado/build_lib.sh | 3 ++- hls4ml/writer/vivado_writer.py | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/hls4ml/templates/vivado/build_lib.sh b/hls4ml/templates/vivado/build_lib.sh index 8b2daf185f..f5f2431ee4 100755 --- a/hls4ml/templates/vivado/build_lib.sh +++ b/hls4ml/templates/vivado/build_lib.sh @@ -11,7 +11,8 @@ LDFLAGS= INCFLAGS="-Ifirmware/ap_types/" PROJECT=myproject LIB_STAMP=mystamp -WEIGHTS_DIR="\"weights\"" +BASEDIR="$(cd "$(dirname "$0")" && pwd)" +WEIGHTS_DIR="\"${BASEDIR}/firmware/weights\"" ${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR=${WEIGHTS_DIR} -c firmware/${PROJECT}.cpp -o ${PROJECT}.o ${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR=${WEIGHTS_DIR} -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 1d88c13de5..e4c0c24551 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -725,13 +725,10 @@ def write_build_script(self, model): # build_lib.sh build_lib_src = (filedir / '../templates/vivado/build_lib.sh').resolve() build_lib_dst = Path(f'{model.config.get_output_dir()}/build_lib.sh').resolve() - weights_dir = (build_lib_dst.parent / 'firmware/weights').resolve() with open(build_lib_src) as src, open(build_lib_dst, 'w') as dst: for line in src.readlines(): line = line.replace('myproject', model.config.get_project_name()) line = line.replace('mystamp', model.config.get_config_value('Stamp')) - if line.startswith('WEIGHTS_DIR='): - line = f'WEIGHTS_DIR=\\""{weights_dir}\\""\n' dst.write(line) build_lib_dst.chmod(build_lib_dst.stat().st_mode | stat.S_IEXEC) From 3ec6c5a4dfb40dc57d25d4c34e96668f5a0b36b7 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 23 Oct 2024 19:51:31 -0500 Subject: [PATCH 180/272] update qonnx documentation --- docs/advanced/qonnx.rst | 56 +++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 1 + 2 files changed, 57 insertions(+) create mode 100644 docs/advanced/qonnx.rst diff --git a/docs/advanced/qonnx.rst b/docs/advanced/qonnx.rst new file mode 100644 index 0000000000..09b0074a0b --- /dev/null +++ b/docs/advanced/qonnx.rst @@ -0,0 +1,56 @@ +============== +ONNX 
and QONNX +============== + +Parsing of ONNX and QONNX models is made in conjunction with the `qonnx `_ package, even if it no quantization is used. This is a common initial parser shared with the AMD/Xilinx FINN project. The first step is to do constant folding, shape inference, etc., on the ONNX graph, commonly known as `cleaning`. If a model has convolution layers, the model also needs to be converted to a channels-last format, since that is what hls4ml mainly supports. The ``qonnx`` package also provides a number of additional transforms that may need to be used. For example, ``Gemm`` nodes need to converted to ``MatMul`` and ``Add`` nodes. + +There are command-line based versions of cleaning and channels-last conversion: + +.. code-block:: bash + + $ qonnx_clean filename.onnx + $ qonnx_to_channels_last filename_clean.onnx + $ qonnx_clean filename_clean_channels_last.onnx # good to do a clean again as a last step + +Things can similarly be done in python. This method is usually easier if you additionally need to call other transforms. An example is given below which also calls the ``GemmToMatMul`` converter: + +.. code-block:: python + + model = ModelWrapper('filename.onnx') + model = qonnx.util.cleanup.cleanup_model(model) + model = model.transform(ConvertToChannelsLastAndClean()) + model = model.transform(GemmToMatMul()) + model = qonnx.util.cleanup.cleanup_model(model) + +``ModelWrapper`` is defined in ``qonnx.core.modelwrapper``. More information on the ``qonnx`` package can be found at the `QONNX documentation page `_. + + +The next steps are very similar to if you are using a Keras model: + +.. code-block:: python + + config = hls4ml.utils.config.config_from_onnx_model( + model, granularity='name', backend='Vitis', default_precision='fixed<16,6>' + ) + # modify the config as desired + hls_model = hls4ml.converters.convert_from_onnx_model( + model, + output_dir='my-hls-test', + io_type='io_stream', + backend='Vitis', + hls_config=config, + ) + hls_model.compile() + +Note, unlike the Keras version, "name" granularity is the default for ``config_from_onnx_model``, and it must be used for QONNX models. Unquantized ONNX models can use "model" if so desired, but generally there is no benefit. + +One can subsequently call the ``predict`` function to check the performance or build the project. + +Note that ``execute_onnx`` in ``qonnx.core.onnx_exec`` can be use to run the QONNX graphs directly, and it also provides the values at intermediate layers for validating the model (tracing). + +Quant nodes +=========== + +Documentation for quant nodes is provided in the `qonnx package `_. Note that currently hls4ml only supports the `Quant operator `_. Also, not all legal ``Quant`` configurations are parsable by hls4ml or synthesizable. The ``scale``, ``zeropt``, and ``bitwidth`` values must be constant (though not necessarily scalar for the ``scale`` and ``zeropt``). + +Generally if the ``zeropt`` is 0 and the ``scale`` is a scalar power of 2, hls4ml uses ``ap_fixed`` or ``ac_fixed`` types (depending on the backend) to represent the quantizations. In other cases, the ``scale`` and ``zeropt`` need to be explicitly handled by hls4ml, and there is more of a chance of hls4ml not being able to process the input. (Please report any issues that you find.) 
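As a concrete illustration of the power-of-two rule described above: a ``Quant`` node with ``zeropt == 0`` and a scalar ``scale == 2**-F`` quantizes values in steps of ``2**-F``, which is exactly the resolution of a fixed-point type with ``F`` fractional bits. The sketch below is illustrative only; ``quant_to_fixed`` is a hypothetical helper written for this note, not part of hls4ml or qonnx.

    import math

    def quant_to_fixed(bitwidth, scale, zeropt, signed=True):
        # Hypothetical helper for illustration: derive the ap_fixed-style type
        # implied by a Quant node with zero offset and a power-of-two scale.
        if zeropt != 0:
            raise ValueError('nonzero zeropt cannot be represented as a plain ap_fixed')
        frac_bits = -math.log2(scale)
        if frac_bits != int(frac_bits):
            raise ValueError('non power-of-two scale cannot be represented as a plain ap_fixed')
        # ap_fixed<W, I>: W total bits, I integer bits; the LSB weighs 2**-(W - I) == scale
        integer_bits = bitwidth - int(frac_bits)
        return f"{'ap_fixed' if signed else 'ap_ufixed'}<{bitwidth},{integer_bits}>"

    print(quant_to_fixed(8, 2**-4, 0))  # -> ap_fixed<8,4>, i.e. 8 bits with an LSB of 2**-4

Non-scalar or non power-of-two scales, and nonzero ``zeropt`` values, instead take the explicitly handled path described in the paragraph above.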
diff --git a/docs/index.rst b/docs/index.rst index c21b90aebc..b2f7e2501b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -22,6 +22,7 @@ :hidden: :caption: Advanced Features + advanced/qonnx advanced/fifo_depth advanced/extension advanced/accelerator From 210f8c25cca4eff5dc12af2d4e62e88e136c6e99 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 24 Oct 2024 16:40:33 -0500 Subject: [PATCH 181/272] quote the to handle special characters --- hls4ml/templates/vivado/build_lib.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hls4ml/templates/vivado/build_lib.sh b/hls4ml/templates/vivado/build_lib.sh index f5f2431ee4..df719e2305 100755 --- a/hls4ml/templates/vivado/build_lib.sh +++ b/hls4ml/templates/vivado/build_lib.sh @@ -14,7 +14,7 @@ LIB_STAMP=mystamp BASEDIR="$(cd "$(dirname "$0")" && pwd)" WEIGHTS_DIR="\"${BASEDIR}/firmware/weights\"" -${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR=${WEIGHTS_DIR} -c firmware/${PROJECT}.cpp -o ${PROJECT}.o -${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR=${WEIGHTS_DIR} -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o +${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c firmware/${PROJECT}.cpp -o ${PROJECT}.o +${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o ${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so rm -f *.o From 45185379af2a2aa80883ec99375b6760f7813478 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 25 Oct 2024 02:43:54 -0400 Subject: [PATCH 182/272] Beginnings of the oneAPI backend (#955) * snapshot adding oneapi * fix reduce constexpr * further updates * update the bridge and testbench * fix issues discovered when compiling * update bridge writing files * build library (but not tested) * fix a bug in testbench * snapshot after some debugging * remove forgotten debug printing * add build * pre-commit fixes * fix more pre-commit * fix more pre-commit errors * snapshot of work before reworking types * Use using to decide array type, some preliminary updates * snapshot unifying types * fix the testbench and bridge * snapshot updating nnet_utils (not finished) * define array in nnet_types for oneAPI * fix parallel conv2d * add back the streaming versions of algs, most unconverted * tentatively complete streaming for dense but not functional * first version that compiles streaming * change how the pipe value type is extracted * fix pre-commit error * always treat elu as ELU class * fix batchnorm * snapshot towards fixing conv * snapshot fixing test for streaming * fix conv1d * fix conv2d * fix reshape and flatten for oneAPI * initial oneAPI tests * remove nnet_dense_compressed from oneAPI * add merge functionality (untested) * fix merge for oneAPI * fix merge for oneAPI (missing commit) * add zeropadding * standardize paralellization spelling * fix pointwise for oneAPI * remove references to quartus * more replace quartus with oneapi * snapshot on the way towards implementing pooling * fix io_stream pooling for oneAPI * add fix for Conv2DBatchnorm * accidentally committed CMakeLists.txt in my debug setup * reshaping, not fully tested * fix cloning of streams * fix pytest library loading * remove unused template * fix some activation bugs * fix the overwriting of directories in the pytest * update version of test repository * try to fix docker issue * bump hls4ml-testing tag to 0.5.2 * try not restricting tensorflow-model-optimizatoin * Update to 0.5.3 for testing * bump to docker image 0.5.4, 
suggested by Ben * fix pre-commit warning * dial down N_TESTS_PER_YAML to 4 * revert tensorflow-model-optimization change * fix issue of saving in "obsolete" h5 format * fix embedding for oneAPI * First attempt at adding RNNs to oneAPI * fix bug in array size * fix order or indices * make queues static in bridge * fix logic error in repack stream * changing the style, but functionally identical * update pointwise optimizer for oneAPI * add oneAPI to test_multi_dense.py * fix updating weight types * initial changes of templates, for testing * fix weight naming, product selection * make im2col the default; fix winograd size * fix up streaming dense and convolution * fix prelu, some batchnorm * fix weight array of exponential types * move ACExponentialPrecisionDefinition to oneapi_types * attempt to fix batchnorm and recurrent * fixed BatchNormalizationQuantizedTanhConfigTemplate template selection * fix embedding_stream * fix lstm and simple rnn * fix GRU * fix winograd, and also disable it by default * fix threshold name * split bn_quant to be backend-specific * add type inference to oneAPI * add oneAPI to pytorch tests * fix pooling with padding for oneAPI and Quartus * Compilation for larger models enabled by increasing -fconstexpr-steps * add oneapi clone tests; remove reduntand multi_clone test * remove some attributes to avoid overwrite warnings * make extra handling for oneAPI like others (as in PR #1067) * remove warnings for extra optimizers that are not scheduled on purpose * update parametrized activations * fix reference to alpha that had not been switched to param * add oneapi documentation * add parallelization factor to the attributes for oneAPI --------- Co-authored-by: Lauri Laatu Co-authored-by: Jan-Frederik Schulte --- docs/advanced/oneapi.rst | 35 + docs/index.rst | 1 + hls4ml/backends/__init__.py | 2 + .../{fpga => catapult}/passes/bn_quant.py | 0 hls4ml/backends/oneapi/__init__.py | 0 hls4ml/backends/oneapi/oneapi_backend.py | 376 +++++++ hls4ml/backends/oneapi/oneapi_template.py | 61 ++ hls4ml/backends/oneapi/oneapi_types.py | 267 +++++ hls4ml/backends/oneapi/passes/__init__.py | 0 hls4ml/backends/oneapi/passes/bn_quant.py | 222 ++++ .../backends/oneapi/passes/clone_templates.py | 32 + .../oneapi/passes/convolution_templates.py | 235 +++++ .../oneapi/passes/convolution_winograd.py | 179 ++++ .../backends/oneapi/passes/core_templates.py | 351 +++++++ .../oneapi/passes/embedding_templates.py | 32 + .../backends/oneapi/passes/merge_templates.py | 137 +++ hls4ml/backends/oneapi/passes/pointwise.py | 156 +++ .../oneapi/passes/pooling_templates.py | 153 +++ .../oneapi/passes/quantization_templates.py | 63 ++ .../oneapi/passes/recurrent_templates.py | 369 +++++++ .../oneapi/passes/reshaping_templates.py | 244 +++++ .../oneapi/passes/resource_strategy.py | 77 ++ .../backends/oneapi/passes/transform_types.py | 60 ++ hls4ml/backends/quartus/passes/bn_quant.py | 169 +++ .../quartus/passes/convolution_templates.py | 4 +- hls4ml/backends/template.py | 21 + hls4ml/backends/vivado/passes/bn_quant.py | 169 +++ hls4ml/converters/keras/core.py | 4 + hls4ml/model/layers.py | 2 +- hls4ml/model/optimizer/passes/stamp.py | 8 +- .../objectives/vivado_objectives.py | 4 +- hls4ml/templates/oneapi/CMakeLists.txt | 338 ++++++ hls4ml/templates/oneapi/exception_handler.hpp | 21 + hls4ml/templates/oneapi/firmware/defines.h | 20 + .../templates/oneapi/firmware/myproject.cpp | 24 + hls4ml/templates/oneapi/firmware/myproject.h | 29 + .../firmware/nnet_utils/nnet_activation.h | 499 +++++++++ 
.../nnet_utils/nnet_activation_stream.h | 712 +++++++++++++ .../firmware/nnet_utils/nnet_batchnorm.h | 104 ++ .../nnet_utils/nnet_batchnorm_stream.h | 107 ++ .../oneapi/firmware/nnet_utils/nnet_common.h | 76 ++ .../oneapi/firmware/nnet_utils/nnet_conv1d.h | 61 ++ .../nnet_utils/nnet_conv1d_resource.h | 237 +++++ .../firmware/nnet_utils/nnet_conv1d_stream.h | 177 ++++ .../oneapi/firmware/nnet_utils/nnet_conv2d.h | 67 ++ .../nnet_utils/nnet_conv2d_resource.h | 297 ++++++ .../firmware/nnet_utils/nnet_conv2d_stream.h | 241 +++++ .../oneapi/firmware/nnet_utils/nnet_dense.h | 164 +++ .../firmware/nnet_utils/nnet_dense_stream.h | 23 + .../oneapi/firmware/nnet_utils/nnet_embed.h | 43 + .../firmware/nnet_utils/nnet_embed_stream.h | 31 + .../oneapi/firmware/nnet_utils/nnet_helpers.h | 118 +++ .../oneapi/firmware/nnet_utils/nnet_merge.h | 232 +++++ .../firmware/nnet_utils/nnet_merge_stream.h | 359 +++++++ .../oneapi/firmware/nnet_utils/nnet_mult.h | 113 ++ .../oneapi/firmware/nnet_utils/nnet_padding.h | 104 ++ .../firmware/nnet_utils/nnet_padding_stream.h | 81 ++ .../oneapi/firmware/nnet_utils/nnet_pooling.h | 257 +++++ .../firmware/nnet_utils/nnet_pooling_stream.h | 322 ++++++ .../oneapi/firmware/nnet_utils/nnet_printf.h | 18 + .../firmware/nnet_utils/nnet_recurrent.h | 566 ++++++++++ .../nnet_utils/nnet_recurrent_activation.h | 47 + .../nnet_utils/nnet_recurrent_stream.h | 68 ++ .../oneapi/firmware/nnet_utils/nnet_resize.h | 36 + .../firmware/nnet_utils/nnet_resize_stream.h | 58 ++ .../oneapi/firmware/nnet_utils/nnet_stream.h | 126 +++ .../firmware/nnet_utils/nnet_transpose.h | 48 + .../nnet_utils/nnet_transpose_stream.h | 39 + .../oneapi/firmware/nnet_utils/nnet_types.h | 71 ++ hls4ml/templates/oneapi/firmware/parameters.h | 11 + hls4ml/templates/oneapi/myproject_bridge.cpp | 77 ++ hls4ml/templates/oneapi/myproject_test.cpp | 133 +++ .../quartus/firmware/nnet_utils/nnet_conv1d.h | 2 +- .../nnet_utils/nnet_conv1d_resource.h | 12 +- .../quartus/firmware/nnet_utils/nnet_conv2d.h | 2 +- .../nnet_utils/nnet_conv2d_resource.h | 18 +- .../firmware/nnet_utils/nnet_pooling.h | 34 +- .../quartus/firmware/nnet_utils/nnet_stream.h | 1 + hls4ml/utils/fixed_point_utils.py | 11 +- hls4ml/writer/__init__.py | 2 + hls4ml/writer/oneapi_writer.py | 969 ++++++++++++++++++ test/pytest/test_activations.py | 2 +- test/pytest/test_batchnorm.py | 4 +- test/pytest/test_conv1d.py | 4 + test/pytest/test_embed.py | 4 +- test/pytest/test_globalpooling.py | 4 +- test/pytest/test_keras_api.py | 12 +- test/pytest/test_merge.py | 10 +- test/pytest/test_multi_dense.py | 1 + test/pytest/test_pointwiseconv.py | 8 +- test/pytest/test_pooling.py | 70 +- test/pytest/test_pytorch_api.py | 27 +- test/pytest/test_qkeras.py | 22 +- test/pytest/test_repack_stream.py | 23 +- test/pytest/test_reshape.py | 4 +- test/pytest/test_rnn.py | 39 +- test/pytest/test_stream_clone.py | 4 +- test/pytest/test_stream_multi_clone.py | 48 - test/pytest/test_transpose_concat.py | 4 +- test/pytest/test_upsampling.py | 2 +- test/pytest/test_zeropadding.py | 2 +- 101 files changed, 10764 insertions(+), 169 deletions(-) create mode 100644 docs/advanced/oneapi.rst rename hls4ml/backends/{fpga => catapult}/passes/bn_quant.py (100%) create mode 100644 hls4ml/backends/oneapi/__init__.py create mode 100644 hls4ml/backends/oneapi/oneapi_backend.py create mode 100644 hls4ml/backends/oneapi/oneapi_template.py create mode 100644 hls4ml/backends/oneapi/oneapi_types.py create mode 100644 hls4ml/backends/oneapi/passes/__init__.py create mode 100644 
hls4ml/backends/oneapi/passes/bn_quant.py create mode 100644 hls4ml/backends/oneapi/passes/clone_templates.py create mode 100644 hls4ml/backends/oneapi/passes/convolution_templates.py create mode 100644 hls4ml/backends/oneapi/passes/convolution_winograd.py create mode 100644 hls4ml/backends/oneapi/passes/core_templates.py create mode 100644 hls4ml/backends/oneapi/passes/embedding_templates.py create mode 100644 hls4ml/backends/oneapi/passes/merge_templates.py create mode 100644 hls4ml/backends/oneapi/passes/pointwise.py create mode 100644 hls4ml/backends/oneapi/passes/pooling_templates.py create mode 100644 hls4ml/backends/oneapi/passes/quantization_templates.py create mode 100644 hls4ml/backends/oneapi/passes/recurrent_templates.py create mode 100644 hls4ml/backends/oneapi/passes/reshaping_templates.py create mode 100644 hls4ml/backends/oneapi/passes/resource_strategy.py create mode 100644 hls4ml/backends/oneapi/passes/transform_types.py create mode 100644 hls4ml/backends/quartus/passes/bn_quant.py create mode 100644 hls4ml/backends/vivado/passes/bn_quant.py create mode 100644 hls4ml/templates/oneapi/CMakeLists.txt create mode 100644 hls4ml/templates/oneapi/exception_handler.hpp create mode 100644 hls4ml/templates/oneapi/firmware/defines.h create mode 100644 hls4ml/templates/oneapi/firmware/myproject.cpp create mode 100644 hls4ml/templates/oneapi/firmware/myproject.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h create mode 100644 
hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h create mode 100644 hls4ml/templates/oneapi/firmware/parameters.h create mode 100644 hls4ml/templates/oneapi/myproject_bridge.cpp create mode 100644 hls4ml/templates/oneapi/myproject_test.cpp create mode 100644 hls4ml/writer/oneapi_writer.py delete mode 100644 test/pytest/test_stream_multi_clone.py diff --git a/docs/advanced/oneapi.rst b/docs/advanced/oneapi.rst new file mode 100644 index 0000000000..ae0e0bc56b --- /dev/null +++ b/docs/advanced/oneapi.rst @@ -0,0 +1,35 @@ +============== +oneAPI Backend +============== + +The ``oneAPI`` backend of hls4ml is designed for deploying NNs on Intel/Altera FPGAs. It will eventually +replace the ``Quartus`` backend, which should really have been called the Intel HLS backend. (The actual Quartus +program continues to be used with IP produced by the ``oneAPI`` backend.) +This section discusses details of the ``oneAPI`` backend. + +The ``oneAPI`` code uses SYCL kernels to implement the logic that is deployed on FPGAs. It naturally leads to the +accelerator style of programming. In the IP Component flow, which is currently the only flow supported, the +kernel becomes the IP, and the "host code" becomes the testbench. An accelerator flow, with easier deployment on +PCIe accelerator boards, is planned to be added in the future. + +The produced work areas use cmake to build the projects in a style based +`oneAPI-samples `_. +The standard ``fpga_emu``, ``report``, ``fpga_sim``, and ``fpga`` are supported. Additionally, ``make lib`` +produces the library used for calling the ``predict`` function from hls4ml. The ``compile`` and ``build`` commands +in hls4ml interact with the cmake system, so one does not need to manually use the build system, but it there +if desired. + +The ``oneAPI`` backend, like the ``Quartus`` backend, only implements the ``Resource`` strategy for the layers. There +is no ``Latency`` implementation of any of the layers. + +Note: currently tracing and external weights (i.e. setting BramFactor) are not supported. + +io_parallel and io_stream +========================= + +As mentioned in the :ref:`I/O Types` section, ``io_parallel`` is for small models, while ``io_stream`` is for +larger models. In ``oneAPI``, there is an additional difference: ``io_stream`` implements each layer on its +own ``task_sequence``. Thus, the layers run in parallel, with pipes connecting the inputs and outputs. This +is similar in style to the `dataflow` implementation on Vitis, but more explicit. On the other hand, ``io_parallel`` +always uses a single task, relying on pipelining within the task for good performance. In contrast, the Vitis +backend sometimes uses dataflow with ``io_parallel``. 
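For orientation, a minimal driver script for the new backend might look like the following. This is a sketch only: ``model`` and ``X`` stand in for a real Keras model and input array, and the output directory is a placeholder; the backend name, the ``compile``/``build`` behaviour, and the ``build_type`` values follow the backend code added in this patch.

    import hls4ml

    config = hls4ml.utils.config_from_keras_model(model, granularity='name')

    hls_model = hls4ml.converters.convert_from_keras_model(
        model,
        hls_config=config,
        output_dir='my-oneapi-prj',  # work area containing the generated CMake project
        io_type='io_stream',         # each layer runs in its own task_sequence, connected by pipes
        backend='oneAPI',
    )

    hls_model.compile()       # cmake + 'make lib': builds the shared library used by predict()
    y = hls_model.predict(X)

    # Other cmake targets are reached through build(); run=True executes the result
    # for the emulation/simulation/hardware targets.
    hls_model.build(build_type='fpga_emu', run=True)  # also: 'report', 'fpga_sim', 'fpga'

Unless overridden in the configuration, ``create_initial_config`` targets an ``Arria10`` part with a clock period of 5.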
diff --git a/docs/index.rst b/docs/index.rst index c21b90aebc..07fcd217db 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -24,6 +24,7 @@ advanced/fifo_depth advanced/extension + advanced/oneapi advanced/accelerator advanced/model_optimization diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index 8b3117af7a..4a48f072cd 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -1,5 +1,6 @@ from hls4ml.backends.backend import Backend, get_available_backends, get_backend, register_backend # noqa: F401 from hls4ml.backends.fpga.fpga_backend import FPGABackend # noqa: F401 +from hls4ml.backends.oneapi.oneapi_backend import OneAPIBackend from hls4ml.backends.quartus.quartus_backend import QuartusBackend from hls4ml.backends.symbolic.symbolic_backend import SymbolicExpressionBackend from hls4ml.backends.vivado.vivado_backend import VivadoBackend @@ -16,3 +17,4 @@ register_backend('Quartus', QuartusBackend) register_backend('Catapult', CatapultBackend) register_backend('SymbolicExpression', SymbolicExpressionBackend) +register_backend('oneAPI', OneAPIBackend) diff --git a/hls4ml/backends/fpga/passes/bn_quant.py b/hls4ml/backends/catapult/passes/bn_quant.py similarity index 100% rename from hls4ml/backends/fpga/passes/bn_quant.py rename to hls4ml/backends/catapult/passes/bn_quant.py diff --git a/hls4ml/backends/oneapi/__init__.py b/hls4ml/backends/oneapi/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py new file mode 100644 index 0000000000..c85a8c0e94 --- /dev/null +++ b/hls4ml/backends/oneapi/oneapi_backend.py @@ -0,0 +1,376 @@ +import subprocess +from pathlib import Path +from warnings import warn + +import numpy as np + +from hls4ml.backends import FPGABackend +from hls4ml.model.attributes import ConfigurableAttribute, TypeAttribute +from hls4ml.model.flow import register_flow +from hls4ml.model.layers import GRU, LSTM, Activation, Conv1D, Conv2D, Dense, Embedding, Layer, SimpleRNN, Softmax +from hls4ml.model.optimizer import get_backend_passes, layer_optimizer +from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType + +# from hls4ml.report import parse_oneapi_report + + +class OneAPIBackend(FPGABackend): + def __init__(self): + super().__init__('oneAPI') + self._register_layer_attributes() + self._register_flows() + + def _register_layer_attributes(self): + # Add RNN-specific recurrent_reuse_factor attribute + rnn_layers = [ + SimpleRNN, + LSTM, + GRU, + ] + + for layer in rnn_layers: + attrs = self.attribute_map.get(layer, []) + attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1)) + attrs.append(ConfigurableAttribute('table_size', default=1024)) + attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) + self.attribute_map[layer] = attrs + + # Add ParallelizationFactor to Conv1D/2D + pf_layers = [ + Conv1D, + Conv2D, + ] + + for layer in pf_layers: + attrs = self.attribute_map.get(layer, []) + attrs.append(ConfigurableAttribute('parallelization_factor', default=1)) + self.attribute_map[layer] = attrs + + def _register_flows(self): + initializers = self._get_layer_initializers() + init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name) + + streaming_passes = ['oneapi:clone_output'] + streaming_flow = register_flow('streaming', streaming_passes, requires=[init_flow], backend=self.name) + + oneapi_types = [ + 'oneapi:transform_types', + 
'oneapi:register_bram_weights', + 'oneapi:apply_resource_strategy', + 'oneapi:apply_winograd_kernel_transformation', + ] + oneapi_types_flow = register_flow('specific_types', oneapi_types, requires=[init_flow], backend=self.name) + + quantization_passes = [ + 'oneapi:merge_batch_norm_quantized_tanh', + 'oneapi:quantize_dense_output', + 'fuse_consecutive_batch_normalization', + 'oneapi:xnor_pooling', + 'oneapi:generate_conv_im2col', + ] + quantization_flow = register_flow('quantization', quantization_passes, requires=[init_flow], backend=self.name) + + optimization_passes = [ + 'oneapi:remove_final_reshape', + 'oneapi:optimize_pointwise_conv', + 'oneapi:inplace_parallel_reshape', + 'oneapi:skip_softmax', + 'oneapi:fix_softmax_table_size', + 'infer_precision_types', + ] + optimization_flow = register_flow('optimize', optimization_passes, requires=[init_flow], backend=self.name) + + templates = self._get_layer_templates() + template_flow = register_flow('apply_templates', self._get_layer_templates, requires=[init_flow], backend=self.name) + + writer_passes = ['make_stamp', 'oneapi:write_hls'] + + self._writer_flow = register_flow('write', writer_passes, requires=['oneapi:ip'], backend=self.name) + + all_passes = get_backend_passes(self.name) + + extras = [ + # Ideally this should be empty + opt_pass + for opt_pass in all_passes + if opt_pass + not in initializers + + streaming_passes + + oneapi_types + + quantization_passes + + templates + + optimization_passes + + writer_passes + + ['oneapi:inplace_stream_flatten', 'oneapi:reshape_stream'] # not needed + + ['oneapi:process_fixed_point_quantizer_layer'] # not yet supported + ] + + if len(extras) > 0: + for opt in extras: + warn(f'WARNING: Optimizer "{opt}" is not part of any flow and will not be executed.') + + ip_flow_requirements = [ + 'optimize', + init_flow, + streaming_flow, + quantization_flow, + optimization_flow, + oneapi_types_flow, + template_flow, + ] + ip_flow_requirements = list(filter(None, ip_flow_requirements)) + + self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) + + def get_default_flow(self): + return self._default_flow + + def get_writer_flow(self): + return self._writer_flow + + def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_parallel'): + config = {} + + config['Part'] = part if part is not None else 'Arria10' + config['ClockPeriod'] = clock_period + config['IOType'] = io_type + config['HLSConfig'] = {} + + return config + + def compile(self, model): + """Compile the generated project that can be linked into Python runtime. + + Args: + model (ModelGraph): Model to compile. + + Raises: + Exception: If the project failed to compile + + Returns: + string: Returns the name of the compiled library. + """ + outdir = Path(Path.cwd(), model.config.get_output_dir()) + builddir = outdir / 'build' + builddir.mkdir(exist_ok=True) + try: + subprocess.run('which icpx', shell=True, cwd=builddir, check=True) + except subprocess.CalledProcessError: + raise RuntimeError('Could not find icpx. Please configure oneAPI appropriately') + subprocess.run('cmake ..', shell=True, cwd=builddir, check=True) + subprocess.run('make lib', shell=True, cwd=builddir, check=True) + + lib_name = builddir / f'lib{model.config.get_project_name()}-{model.config.get_config_value("Stamp")}.so' + return lib_name + + def build(self, model, build_type='fpga_emu', run=False): + """ + Builds the project using Intel DPC++ (oneAPI) compiler. 
+ + Args: + model (ModelGraph): The model to build + build_type, optional: What to build (e.g. fpga_emu, fpga_sim, fpga, report) + run, optional: Whether to run the testbench + Errors raise exceptions + """ + + # Check software needed is present + outdir = Path(Path.cwd(), model.config.get_output_dir()) + builddir = outdir / 'build' + builddir.mkdir(exist_ok=True) + try: + subprocess.run('which icpx', shell=True, cwd=builddir, check=True) + except subprocess.CalledProcessError: + raise RuntimeError('Could not find icpx. Please configure oneAPI appropriately') + subprocess.run('cmake ..', shell=True, cwd=builddir, check=True) + subprocess.run(f'make {build_type}', shell=True, cwd=builddir, check=True) + + if run and build_type in ('fpga_emu', 'fpga_sim', 'fpga'): + executable = builddir / f'{model.config.get_project_name()}.{build_type}' + subprocess.run(f'{str(executable)}', shell=True, cwd=builddir, check=True) + + @layer_optimizer(Layer) + def init_base_layer(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('reuse_factor', reuse_factor) + + target_cycles = layer.model.config.get_target_cycles(layer) + layer.set_attr('target_cycles', target_cycles) + + @layer_optimizer(Dense) + def init_dense(self, layer): + index_t = IntegerPrecisionType(width=1, signed=False) + + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + if layer.model.config.get_compression(layer): + layer.set_attr('strategy', 'compressed') + else: + n_in, n_out = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('strategy', 'resource') + + if layer.model.config.is_resource_strategy(layer): + if layer.model.config.get_compression(layer): + index_t = layer.get_weights('weight').type.index_precision + + layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', index_t)) + + @layer_optimizer(Activation) + def init_activation(self, layer): + if layer.get_attr('activation') == 'tanh': + layer.set_attr('activation', 'dense_tanh') + if layer.get_attr('recurrent_activation') == 'tanh': + layer.set_attr('recurrent_activation', 'dense_tanh') + + @layer_optimizer(Softmax) + def init_softmax(self, layer): + if layer.model.config.get_config_value('IOType') == 'io_parallel': + assert ( + len(layer.get_input_variable().shape) == 1 + ), 'Softmax with io_parallel strategy cannot be used on multidimensional tensors.' + + @layer_optimizer(Embedding) + def init_embed(self, layer): + if layer.attributes['n_in'] is None: + raise Exception('Input length of Embedding layer must be specified.') + + @layer_optimizer(GRU) + def init_gru(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + # Dense multiplication properties + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + index_t = IntegerPrecisionType(width=1, signed=False) + layer.set_attr('index_t', index_t) + + if 'table_t' not in layer.attributes: + layer.set_attr( + 'table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=18, integer=8)) + ) + if 'table_size' not in layer.attributes: + layer.set_attr('table_size', 1024) + if True: # layer.model.config.is_resource_strategy(layer): ... 
oneAPI only supports Dense resource multiplication + n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') + layer.set_attr('strategy', 'resource') + + layer.set_attr('index_t', index_t) + + @layer_optimizer(Conv1D) + def init_conv1d(self, layer): + # This can happen if we assign weights of Dense layer to 1x1 Conv1D + if len(layer.weights['weight'].data.shape) == 2: + layer.weights['weight'].data = np.expand_dims(layer.weights['weight'].data, axis=(0, 1)) + + # Dense matrix multiply properties + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + # Reuse and parallelization factors + layer.set_attr('strategy', 'resource') + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('parallelization', layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)) + + # impl_filt_width determines the filter size post-Winograd transformation + layer.set_attr('impl_filt_width', layer.get_attr('filt_width')) + + # Implementation: + # - combination - at compile-time, the decision between Winograd and im2col is made + # - im2col - specifically use im2col + # - Winograd - use Winograd, if possible + layer.set_attr('implementation', layer.model.config.get_layer_config_value(layer, 'Implementation', 'im2col')) + + layer.set_attr( + 'n_partitions', 1 + ) # TODO Not used yet as there is no codegen implementation of CNNs for oneAPI backend + + @layer_optimizer(Conv2D) + def init_conv2d(self, layer): + # This can happen if we assign weights of Dense layer to 1x1 Conv2D + if len(layer.weights['weight'].data.shape) == 2: + layer.weights['weight'].data = np.expand_dims(layer.weights['weight'].data, axis=(0, 1)) + + # Dense matrix multiply properties + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + # Reuse and parallelization factors + layer.set_attr('strategy', 'resource') + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('parallelization', layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)) + + # impl_filt_width & impl_filt_height determine the filter size post-Winograd transformation + layer.set_attr('impl_filt_height', layer.get_attr('filt_height')) + layer.set_attr('impl_filt_width', layer.get_attr('filt_width')) + + # Implementation: + # - combination - at compile-time, the decision between Winograd and im2col is made + # - im2col - specifically use im2col + # - Winograd - use Winograd, if possible + layer.set_attr('implementation', layer.model.config.get_layer_config_value(layer, 'Implementation', 'im2col')) + + layer.set_attr( + 'n_partitions', 1 + ) # TODO Not used yet as there is no codegen implementation of CNNs for oneAPI backend + + @layer_optimizer(LSTM) + def init_lstm(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + # We don't use RF yet + if True: # layer.model.config.is_resource_strategy(layer): ... 
oneAPI only supports Dense resource multiplication + n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') + layer.set_attr('strategy', 'resource') + + # Split weights for easier storage in on-chip memory and implementation in HLS + weights_data = layer.weights['weight'].data + rec_weights_data = layer.weights['recurrent_weight'].data + bias_data = layer.weights['bias'].data + + weight_types = ['i', 'f', 'c', 'o'] + for i in range(0, 4): + layer.add_weights_variable( + name=f'weight_{weight_types[i]}', + var_name=f'kernel_{weight_types[i]}_{{index}}', + data=weights_data[ + 0 : layer.get_attr('n_in'), i * layer.get_attr('n_out') : (i + 1) * layer.get_attr('n_out') + ], + quantizer=layer.get_attr('weight_quantizer'), + compression=None, + ) + layer.add_weights_variable( + name=f'recurrent_weight_{weight_types[i]}', + var_name=f'recurrent_kernel_{weight_types[i]}_{{index}}', + data=rec_weights_data[ + 0 : layer.get_attr('n_out'), i * layer.get_attr('n_out') : (i + 1) * layer.get_attr('n_out') + ], + quantizer=layer.get_attr('weight_quantizer'), + compression=None, + ) + layer.add_weights_variable( + name=f'bias_{weight_types[i]}', + var_name=f'bias_{weight_types[i]}_{{index}}', + data=bias_data[i * layer.get_attr('n_out') : (i + 1) * (layer.get_attr('n_out'))], + quantizer=layer.get_attr('weight_quantizer'), + compression=None, + ) + + @layer_optimizer(SimpleRNN) + def init_simple_rnn(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + # TODO - Consider setting and using RF diff --git a/hls4ml/backends/oneapi/oneapi_template.py b/hls4ml/backends/oneapi/oneapi_template.py new file mode 100644 index 0000000000..c86b8f7ea3 --- /dev/null +++ b/hls4ml/backends/oneapi/oneapi_template.py @@ -0,0 +1,61 @@ +''' +This package includes oneAPI-specific templates +''' + +from hls4ml.backends.template import Template + + +class StreamFunctionCallTemplate(Template): + """Base class for the streaming function call templates in oneAPI: provides the 'stream_function_cpp' attribute. + This generally provides the async call to the task sequence that executes the streaming function. + + Note: the include header files are specified in the regular FunctionCallTemplate, not here. + + Args: + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. + """ + + def __init__(self, layer_class): + if isinstance(layer_class, (list, tuple, set)): + name = '_'.join([cls.__name__.lower() for cls in layer_class]) + else: + name = layer_class.__name__.lower() + name += '_stream_function_template' + super().__init__(name, layer_class, 'stream_function_cpp') + + def _default_function_params(self, layer): + params = self._default_params(layer) + params['name'] = layer.name + return params + + def transform(self, model, node): + return super().transform(model, node) + + +class TaskSequenceTemplate(Template): + """Base class for the task sequence definition in oneAPI: provides the 'task_sequence_cpp' attribute. + This defines the task sequence that is then called by the StreamFunctionCallTemplate. + + Args: + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. 
+ """ + + def __init__(self, layer_class): + if isinstance(layer_class, (list, tuple, set)): + name = '_'.join([cls.__name__.lower() for cls in layer_class]) + else: + name = layer_class.__name__.lower() + name += '_task_sequence_template' + super().__init__(name, layer_class, 'tast_sequence_cpp') + + def _default_function_params(self, layer): + params = self._default_params(layer) + params['name'] = layer.name + params['config'] = f'config{layer.index}' + params['input_pipe'] = layer.get_input_variable().pipe_name + params['output_pipe'] = layer.get_output_variable().pipe_name + + return params + + def transform(self, model, node): + return super().transform(model, node) diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py new file mode 100644 index 0000000000..3106e1e10d --- /dev/null +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -0,0 +1,267 @@ +''' +This package includes oneAPI-specific customizations to the variable types +''' + +import numpy as np + +from hls4ml.backends.fpga.fpga_types import ( + ACFixedPrecisionDefinition, + ACIntegerPrecisionDefinition, + FixedPrecisionConverter, + HLSTypeConverter, + NamedTypeConverter, + PrecisionDefinition, + TypeDefinition, + TypePrecisionConverter, + VariableDefinition, +) +from hls4ml.model.types import ( + CompressedType, + ExponentPrecisionType, + ExponentType, + FixedPrecisionType, + IntegerPrecisionType, + NamedType, + PackedType, + XnorPrecisionType, +) +from hls4ml.utils.fixed_point_utils import next_pow2 +from hls4ml.utils.string_utils import convert_to_pascal_case + + +class ACExponentPrecisionDefinition(PrecisionDefinition): + def definition_cpp(self): + typestring = f'std::pair, ac_int<{self.width}, true>>' + return typestring + + +class OneAPIACTypeConverter(FixedPrecisionConverter): + def __init__(self): + super().__init__( + type_map={ + FixedPrecisionType: ACFixedPrecisionDefinition, + IntegerPrecisionType: ACIntegerPrecisionDefinition, + ExponentPrecisionType: ACExponentPrecisionDefinition, + XnorPrecisionType: ACIntegerPrecisionDefinition, + }, + prefix='AC', + ) + + +class OneAPICompressedTypeConverter(TypeDefinition, TypePrecisionConverter): + """Use a tuple for storing a compressed type for oneAPI since it's better supported. 
(Currently unused)""" + + def definition_cpp(self): + """tuple format is row_index, col_index, weight""" + cpp_fmt = 'typedef std::tuple<{index}, {index}, {precision}> {name};\n' + return cpp_fmt.format(name=self.name, index=self.index_precision, precision=self.precision.definition_cpp()) + + def convert_precision(self, precision_converter): + super().convert_precision(precision_converter) + self.index_precision = precision_converter.convert(self.index_precision) + + +class OneAPIExponentTypeConverter(TypeDefinition, TypePrecisionConverter): + """Use a pair for storing a exponent type for oneAPI since it's better supported""" + + def definition_cpp(self): + cpp_fmt = 'typedef std::pair<{sign}, {precision}> {name};\n' + return cpp_fmt.format(name=self.name, precision=self.precision.definition_cpp(), sign=self.sign.definition_cpp()) + + def convert_precision(self, precision_converter): + super().convert_precision(precision_converter) + self.sign = precision_converter.convert(self.sign) + + +class OneAPIPackedTypeConverter(TypeDefinition, TypePrecisionConverter): + def definition_cpp(self): + n_elem_expr = '/' if self.unpack else '*' + return 'typedef nnet::array<{precision}, {n_elem}> {name};\n'.format( + name=self.name, + precision=self.precision.definition_cpp(), + n_elem=str(self.n_elem) + n_elem_expr + str(self.n_pack), + ) + + def convert_precision(self, precision_converter): + self.precision = precision_converter.convert(self.precision) + + +class OneAPIHLSTypeConverter(HLSTypeConverter): + def __init__(self, precision_converter): + self.precision_converter = precision_converter + self.type_map = { + NamedType: NamedTypeConverter, + CompressedType: OneAPICompressedTypeConverter, + ExponentType: OneAPIExponentTypeConverter, + PackedType: OneAPIPackedTypeConverter, + } + + +# region ArrayVarable + + +class OneAPIArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if self.pragma and not isinstance(self.pragma, tuple): + return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' + else: + return f'{self.type.name} {self.name}{name_suffix}' + + +class OneAPIInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class AggregratedArrayVariableConverter: + """This is a bit of an extension of the standard ArrayVariableConverter""" + + def __init__(self, type_converter, prefix, definition_cls): + self.type_converter = type_converter + self.prefix = prefix + self.definition_cls = definition_cls + + def convert(self, tensor_var, pragma='', depth=0, n_pack=1): + if isinstance(tensor_var, self.definition_cls): # Already converted + return tensor_var + + tensor_var.pragma = pragma + if pragma == 'stream': + if depth == 0: + depth = np.prod(tensor_var.shape) // tensor_var.shape[-1] + tensor_var.pragma = ('stream', depth) + n_elem = tensor_var.shape[-1] + else: + tensor_var.pragma = pragma + n_elem = tensor_var.size() + n_pack = 1 # ignore any passed value + + tensor_var.type = self.type_converter.convert( + PackedType(tensor_var.type.name, tensor_var.type.precision, n_elem, n_pack) + ) + + # pipe_name and pipe_id are only used for io_stream and interface variables in io_parallel + tensor_var.pipe_name = f'{convert_to_pascal_case(tensor_var.name)}Pipe' + tensor_var.pipe_id = f'{convert_to_pascal_case(tensor_var.name)}PipeID' + + tensor_var.__class__ = type(self.prefix + 'AggregateArrayVariable', (type(tensor_var), self.definition_cls), {}) + return 
tensor_var + + +class OneAPIArrayVariableConverter(AggregratedArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIArrayVariableDefinition) + + +class OneAPIInplaceArrayVariableConverter(AggregratedArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInplaceArrayVariableDefinition) + + +# endregion + +# region InterfaceMemberVariable + + +class OneAPIInterfaceVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if self.pragma and not isinstance(self.pragma, tuple): + return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' + else: + return f'{self.type.name} {self.name}{name_suffix}' + + def declare_cpp(self, pipe_min_size=0, indent=''): + lines = indent + f'class {self.pipe_id};\n' + lines += indent + ( + f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' + + f'{self.type.name}, {pipe_min_size}, PipeProps>;\n' + ) + return lines + + +class OneAPIInterfaceVariableConverter(AggregratedArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInterfaceVariableDefinition) + + +# endregion + + +# region StreamVariable +class OneAPIStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=True): + return f'{self.name}{name_suffix}' + + def declare_cpp(self, indent=''): + lines = indent + f'class {self.pipe_id};\n' + lines += indent + ( + f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' + + f'{self.type.name}, {self.pragma[-1]}>;\n' + ) + return lines + + +class OneAPIInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'using {self.name} = {self.input_var.name}' + + +class OneAPIStreamVariableConverter(AggregratedArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIStreamVariableDefinition) + + +class OneAPIInplaceStreamVariableConverter(AggregratedArrayVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInplaceStreamVariableDefinition + ) + + +# region WeightsVariable + + +class OneAPIStaticWeightVariableDefinition(VariableDefinition): + def definition_cpp(self, reuse_factor): + """Write the appropriate weight definiiton""" + # first determine whether to store in register or bram (heuristic) + if reuse_factor == 1 or self.data_length < 2048 or self.type.precision.width < 3: + attribute = '[[intel::fpga_register]]' + else: + # revisit this heuristic + nbanks = int(2 ** np.ceil(np.log2(self.data_length)) / 2) + var_width = int(np.ceil(self.type.precision.width / 8)) + bwidth = next_pow2(var_width) + attribute = ( + f'[[intel::bankwidth({bwidth}), intel::numbanks({nbanks}), ' + 'intel::max_replicates(1), intel::fpga_memory("BLOCK_RAM")]]' + ) + if self.storage == 'register': + return f'{attribute} static constexpr {self.type.name} {self.name}' + else: + return f'{attribute} {self.type.name} {self.name}' + + +class OneAPIStaticWeightVariableConverter: + def __init__(self, type_converter): + self.type_converter = type_converter + + def convert(self, weight_var): + if isinstance(weight_var, 
OneAPIStaticWeightVariableDefinition): # Already converted + return weight_var + + weight_var.weight_class = weight_var.__class__.__name__ + weight_var.storage = 'register' + weight_var.type = self.type_converter.convert( + PackedType(weight_var.name + '_t', weight_var.type.precision, weight_var.data_length, 1) + ) + + weight_var.__class__ = type( + 'OneAPIStaticWeightVariable', (type(weight_var), OneAPIStaticWeightVariableDefinition), {} + ) + return weight_var + + +# endregion + +# endregion diff --git a/hls4ml/backends/oneapi/passes/__init__.py b/hls4ml/backends/oneapi/passes/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hls4ml/backends/oneapi/passes/bn_quant.py b/hls4ml/backends/oneapi/passes/bn_quant.py new file mode 100644 index 0000000000..8425d5da1a --- /dev/null +++ b/hls4ml/backends/oneapi/passes/bn_quant.py @@ -0,0 +1,222 @@ +import numpy as np + +from hls4ml.backends.fpga.fpga_layers import BatchNormalizationQuantizedTanh +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import BatchNormalization, register_layer +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import IntegerPrecisionType, NamedType, XnorPrecisionType + +batchnorm_quantized_tanh_binary_config_template = """struct config{index} : nnet::batchnorm_quantized_tanh_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const unsigned n_scale_bias = (n_filt == -1) ? n_in : n_filt; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {threshold_t.name} threshold_t; +}};\n""" + +batchnorm_quantized_tanh_ternary_config_template = """struct config{index} : nnet::batchnorm_quantized_tanh_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const unsigned n_scale_bias = (n_filt == -1) ? 
n_in : n_filt; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {threshold_hi_t.name} threshold_hi_t; + typedef {threshold_lo_t.name} threshold_lo_t; +}};\n""" + +batchnorm_quantized_tanh_function_template = ( + 'nnet::normalize_{quantize}_tanh<{input_t}, {output_t}, {config}>({input}, {output}, {threshold});' +) + +bn_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + +batchnorm_quantized_tanh_task_sequence_template = ( + 'task_sequence> {name};' +) + +batchnorm_quantized_tanh_stream_function_template = '{name}.async({threshold});' + + +class BatchNormalizationQuantizedTanhConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh) + self.template = (batchnorm_quantized_tanh_binary_config_template, batchnorm_quantized_tanh_ternary_config_template) + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + + if node.get_attr('quantize') == 2: + return self.template[0].format(**params) + else: + return self.template[1].format(**params) + + +class BatchNormalizationQuantizedTanhFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh, include_header=bn_include_list) + self.template = batchnorm_quantized_tanh_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('quantize') == 2: + params['quantize'] = 'binary' + params['threshold'] = node.get_weights('threshold').name + elif node.get_attr('quantize') == 3: + params['quantize'] = 'ternary' + params['threshold'] = node.get_weights('threshold_hi').name + ', ' + node.get_weights('threshold_lo').name + + return self.template.format(**params) + + +class BatchNormalizationQuantizedTanhTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh) + self.template = batchnorm_quantized_tanh_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('quantize') == 2: + params['quantize'] = 'binary' + elif node.get_attr('quantize') == 3: + params['quantize'] = 'ternary' + + return self.template.format(**params) + + +class BatchNormalizationQuantizedTanhStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh) + self.template = batchnorm_quantized_tanh_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('quantize') == 2: + params['threshold'] = node.get_weights('threshold').name + elif node.get_attr('quantize') == 3: + params['threshold'] = node.get_weights('threshold_hi').name + ', ' + node.get_weights('threshold_lo').name + + return self.template.format(**params) + + +def register_bn_quant(backend): + # Register the layer types to the layer map + register_layer('BatchNormalizationQuantizedTanh', BatchNormalizationQuantizedTanh) + + # Register the optimization passes + backend.register_pass('merge_batch_norm_quantized_tanh', MergeBatchNormAndQuantizedTanh) + backend.register_pass('quantize_dense_output', QuantizeDenseOutput) + + # Register template passes + backend.register_template(BatchNormalizationQuantizedTanhConfigTemplate) + backend.register_template(BatchNormalizationQuantizedTanhFunctionTemplate) + backend.register_template(BatchNormalizationQuantizedTanhTaskSequenceTemplate) 
+ backend.register_template(BatchNormalizationQuantizedTanhStreamFunctionTemplate) + + +class MergeBatchNormAndQuantizedTanh(OptimizerPass): + def match(self, node): + is_match = ( + node.class_name == 'Activation' + and node.get_attr('activation') in ['binary', 'binary_tanh', 'ternary', 'ternary_tanh'] + or node.class_name == 'TernaryTanh' + ) + is_match = is_match and isinstance(node.get_input_node(), BatchNormalization) + return is_match + + def transform(self, model, node): + bn_layer = node.get_input_node() + # Make a new layer with the new attributes + quantize = 0 + if 'binary' in node.get_attr('activation'): + quantize = 2 + if 'ternary' in node.get_attr('activation'): + quantize = 3 + attrs = { + 'name': bn_layer.get_attr('name'), + 'original_name': bn_layer.get_attr('name'), + 'class_name': 'BatchNormalizationQuantizedTanh', + 'n_in': bn_layer.get_attr('n_in'), + 'n_out': bn_layer.get_attr('n_in'), + 'n_filt': bn_layer.get_attr('n_filt'), + 'quantize': quantize, + 'trace': bn_layer.get_attr('trace'), + } + bnbt_layer = model.make_node(BatchNormalizationQuantizedTanh, 'bnbt_' + bn_layer.name, attrs, bn_layer.inputs) + bnbt_layer.set_thresholds( + bn_layer.get_weights('scale').data, bn_layer.get_weights('bias').data, node.get_attr('threshold', 0.5) + ) + # Remove the BatchNormalization layer + model.remove_node(bn_layer, rewire=True) + # Replace the old Activation layer with this one + model.replace_node(node, bnbt_layer) + + return True + + +class QuantizeDenseOutput(OptimizerPass): + def match(self, node): + is_dense = node.class_name == 'Dense' + input_node = node.get_input_node() + is_input_bnqt = input_node is not None and input_node.class_name == 'BatchNormalizationQuantizedTanh' + quantizer = node.get_attr('weight_quantizer') + is_binary_ternary = quantizer is not None and ( + quantizer.__class__.__name__ == 'BinaryQuantizer' or quantizer.__class__.__name__ == 'TernaryQuantizer' + ) + return is_dense and is_input_bnqt and is_binary_ternary + + def transform(self, model, node): + # Compute the required precision and update the variables + # Number of bits for output is log2 of number of input nodes + # Since this is the number of uint<1>'s which are summed + nbits = int(np.ceil(np.log2(node.attributes['n_in'])) + 2) + out_type = IntegerPrecisionType(width=nbits) + accum_t = NamedType(f'layer{node.index}_accum_t', out_type) + node.set_attr('accum_t', accum_t) + out_var = node.get_output_variable() + out_var.type.precision = out_type + + quantized_data = None + quantized_precision = None + quantizer = node.get_attr('weight_quantizer') + if quantizer.__class__.__name__ == 'BinaryQuantizer': + quantized_precision = XnorPrecisionType() + elif quantizer.__class__.__name__ == 'TernaryQuantizer': + quantized_precision = IntegerPrecisionType(width=2) + else: + print(f'WARNING: Unknown quantizer - {quantizer.__class__.__name__}. 
Bailing out') + return False + quantizer.bits = quantized_precision.width + quantizer.hls_type = quantized_precision + quantized_data = quantizer(node.weights['weight'].data) + + weights = node.weights['weight'] + weights.data = quantized_data + weights.type.name = f'weight{node.index}_t' + weights.update_precision(quantized_precision) + + bias = node.weights['bias'] + bias.data = np.zeros(shape=(node.get_attr('n_out'))) + bias.type.name = f'bias{node.index}_t' + bias.nzeros = 0 + bias.update_precision(quantized_precision) + + # If followed by the BatchNormalizationBinaryTanh, update its input + # Also requantise the weights + bd_out_nodes = node.get_output_nodes() + for out_node in bd_out_nodes: + if isinstance(out_node, BatchNormalizationQuantizedTanh): + var_names = [] + if quantizer.__class__.__name__ == 'BinaryQuantizer': + var_names.append('threshold') + elif quantizer.__class__.__name__ == 'TernaryQuantizer': + var_names.append('threshold_hi') + var_names.append('threshold_lo') + for var_name in var_names: + threshold_var = out_node.weights[var_name] + threshold_var.update_precision(out_type) + threshold_var.data = np.floor(threshold_var.data) + + return False diff --git a/hls4ml/backends/oneapi/passes/clone_templates.py b/hls4ml/backends/oneapi/passes/clone_templates.py new file mode 100644 index 0000000000..447ae126e9 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/clone_templates.py @@ -0,0 +1,32 @@ +""" The clone templates in the fpga backend are not enough for oneAPI, so this adds the missing parts +""" + +from hls4ml.backends.fpga.passes.clone import Clone +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate + +clone_stream_function_template = '{name}.async();' + + +class CloneTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(Clone) + + def format(self, node): + params = self._default_function_params(node) + for i in range(len(node.outputs)): + params[f'output{i + 1}_pipe'] = node.variables[node.outputs[i]].pipe_name + + output_pipes = ', '.join([f'{{output{i + 1}_pipe}}' for i in range(len(node.outputs))]) + + template = f'task_sequence> {{name}};' + return template.format(**params) + + +class CloneStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(Clone) + self.template = clone_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/convolution_templates.py b/hls4ml/backends/oneapi/passes/convolution_templates.py new file mode 100644 index 0000000000..17154559d8 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/convolution_templates.py @@ -0,0 +1,235 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Conv1D, Conv2D, Conv2DBatchnorm + +# TODO - Dilation rate ? 
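For orientation, every config template in these backend files is an ordinary Python format string, and each LayerConfigTemplate.format() call fills it from a parameter dict; placeholders such as {accum_t.name} are resolved by attribute lookup on the passed objects. A minimal, self-contained sketch of that mechanism (the toy_config string and the SimpleNamespace stand-in are illustrative only, not part of the patch):

from types import SimpleNamespace

# Stand-in for a NamedType-like object carrying a .name attribute
accum_t = SimpleNamespace(name='ac_fixed<16, 6, true>')

# Toy template mirroring the structure of the config templates in this file;
# doubled braces render as literal braces in the generated C++
toy_config = """struct config{index}_mult : nnet::dense_config {{
    static const unsigned n_in = {n_in};
    typedef {accum_t.name} accum_t;
}};
"""

# str.format resolves {accum_t.name} via attribute access, exactly as
# LayerConfigTemplate.format() does with _default_config_params(node)
print(toy_config.format(index=3, n_in=27, accum_t=accum_t))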
+
+''' Shared multiplication config '''
+conv_mult_config_template = """struct config{index}_mult : nnet::dense_config {{
+    static const unsigned n_in = {n_in};
+    static const unsigned n_out = {n_out};
+
+    static const unsigned rf_pad = {rfpad};
+    static const unsigned bf_pad = {bfpad};
+
+    static const unsigned reuse_factor = {reuse};
+    static const unsigned reuse_factor_rounded = reuse_factor + rf_pad;
+    static const unsigned block_factor = DIV_ROUNDUP(n_in*n_out, reuse_factor);
+    static const unsigned block_factor_rounded = block_factor + bf_pad;
+    static const unsigned multiplier_factor = MIN(n_in, reuse_factor);
+    static const unsigned multiplier_limit = DIV_ROUNDUP(n_in*n_out, multiplier_factor);
+    static const unsigned multiplier_scale = multiplier_limit/n_out;
+
+    typedef {accum_t.name} accum_t;
+    typedef {bias_t.name} bias_t;
+    typedef {weight_t.name} weight_t;
+
+    template<class x_T, class y_T>
+    using product = nnet::product::{product_type}<x_T, y_T>;
+}};\n"""
+
+''' 1D Conv '''
+conv1d_config_template = """struct config{index} : nnet::conv1d_config {{
+    static const unsigned in_width = {in_width};
+    static const unsigned n_chan = {n_chan};
+
+    static const unsigned filt_width = {filt_width};
+    static const unsigned impl_filt_width = {impl_filt_width};
+    static const unsigned kernel_size = filt_width;
+
+    static const unsigned n_filt = {n_filt};
+    static const unsigned out_width = {out_width};
+
+    static const unsigned pad_left = {pad_left};
+    static const unsigned pad_right = {pad_right};
+    static const unsigned stride_width = {stride_width};
+    static const unsigned dilation = {dilation};
+
+    static const unsigned reuse_factor = {reuse};
+    static const unsigned parallelization_factor = {parallelization};
+    static const bool store_weights_in_bram = false;
+
+    static const nnet::conv1d_implementation implementation = nnet::conv1d_implementation::{implementation};
+
+    typedef {accum_t.name} accum_t;
+    typedef {bias_t.name} bias_t;
+    typedef {weight_t.name} weight_t;
+    typedef {config_t} mult_config;
+}};
+"""
+
+conv1d_function_template = 'nnet::conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
+
+conv1d_task_sequence_template = (
+    'task_sequence> {name};'
+)
+
+conv_stream_function_template = '{name}.async({w}, {b});'
+
+conv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_conv1d_stream.h']
+
+
+class Conv1DConfigTemplate(LayerConfigTemplate):
+    def __init__(self):
+        super().__init__(Conv1D)
+        self.template = conv1d_config_template
+        self.mult_template = conv_mult_config_template
+
+    def format(self, node):
+        conv_params = self._default_config_params(node)
+        conv_params['dilation'] = node.get_attr('dilation', 1)
+        if conv_params['dilation'] != 1:
+            raise RuntimeError('dilation != 1 not supported yet')
+        conv_params['config_t'] = f'config{node.index}_mult'
+        conv_config = self.template.format(**conv_params)
+
+        mult_params = self._default_config_params(node)
+        mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width')
+        mult_params['n_out'] = node.get_attr('n_filt')
+        mult_params['product_type'] = get_backend('oneAPI').product_type(
+            node.get_input_variable().type.precision, node.get_weights('weight').type.precision
+        )
+        mult_config = self.mult_template.format(**mult_params)
+
+        return mult_config + '\n' + conv_config
+
+
+class Conv1DFunctionTemplate(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__(Conv1D, include_header=conv1d_include_list)
+        self.template = conv1d_function_template
+
+    def format(self, node):
+        params =
self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise RuntimeError('channels_first not supported on oneAPI') + params['data_format'] = 'cl' + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + +class Conv1DTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(Conv1D) + self.template = conv1d_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise RuntimeError('channels_first not supported on oneAPI') + params['data_format'] = 'cl' + return self.template.format(**params) + + +class ConvStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__((Conv1D, Conv2D, Conv2DBatchnorm)) + self.template = conv_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + +''' 2D Conv ''' +conv2d_config_template = """struct config{index} : nnet::conv2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + + static const unsigned n_filt = {n_filt}; + static const unsigned filt_height = {filt_height}; + static const unsigned filt_width = {filt_width}; + static const unsigned impl_filt_height = {impl_filt_height}; + static const unsigned impl_filt_width = {impl_filt_width}; + static const unsigned kernel_size = filt_height * filt_width; + + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const unsigned stride_height = {stride_height}; + static const unsigned stride_width = {stride_width}; + + static const unsigned reuse_factor = {reuse}; + static const unsigned parallelization_factor = {parallelization}; + static const bool store_weights_in_bram = false; + + static const nnet::conv2d_implementation implementation = nnet::conv2d_implementation::{implementation}; + + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {config_t} mult_config; +}};\n""" + +conv2d_function_template = 'nnet::conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' + +conv2d_task_sequence_template = ( + 'task_sequence> {name};' +) + +conv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_conv2d_stream.h'] + + +class Conv2DConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((Conv2D, Conv2DBatchnorm)) + self.template = conv2d_config_template + self.mult_template = conv_mult_config_template + + def format(self, node): + conv_params = self._default_config_params(node) + conv_params['dilation'] = node.get_attr('dilation', 1) + if conv_params['dilation'] != 1: + raise RuntimeError('dilation != 1 not supported yet') + conv_params['config_t'] = f'config{node.index}_mult' + conv_config = self.template.format(**conv_params) + + mult_params = self._default_config_params(node) + mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_height') * node.get_attr('filt_width') + 
mult_params['n_out'] = node.get_attr('n_filt')
+        mult_params['product_type'] = get_backend('oneAPI').product_type(
+            node.get_input_variable().type.precision, node.get_weights('weight').type.precision
+        )
+        mult_config = self.mult_template.format(**mult_params)
+
+        return mult_config + '\n' + conv_config
+
+
+class Conv2DFunctionTemplate(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__((Conv2D, Conv2DBatchnorm), include_header=conv2d_include_list)
+        self.template = conv2d_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        if node.get_attr('data_format') == 'channels_first':
+            raise RuntimeError('channels_first not supported for oneAPI')
+        params['data_format'] = 'cl'
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
+
+        return self.template.format(**params)
+
+
+class Conv2DTaskSequenceTemplate(TaskSequenceTemplate):
+    def __init__(self):
+        super().__init__((Conv2D, Conv2DBatchnorm))
+        self.template = conv2d_task_sequence_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        if node.get_attr('data_format') == 'channels_first':
+            raise RuntimeError('channels_first not supported on oneAPI')
+        params['data_format'] = 'cl'
+        return self.template.format(**params)
diff --git a/hls4ml/backends/oneapi/passes/convolution_winograd.py b/hls4ml/backends/oneapi/passes/convolution_winograd.py
new file mode 100644
index 0000000000..fdab408b38
--- /dev/null
+++ b/hls4ml/backends/oneapi/passes/convolution_winograd.py
@@ -0,0 +1,179 @@
+import math
+
+import numpy as np
+
+from hls4ml.model.layers import Conv1D, Conv2D
+from hls4ml.model.optimizer import OptimizerPass
+
+
+class ApplyWinogradKernelTransformation(OptimizerPass):
+    '''
+    Transforms the weights of a Conv1D/Conv2D kernel to a format suitable for Winograd convolution
+    For further information, refer to Lavin & Gray, 2015 - Fast Algorithms for Convolutional Neural Networks
+    '''
+
+    def match(self, node):
+        node_matches = isinstance(node, (Conv1D, Conv2D))
+
+        # This optimizer works only after the Resource Strategy Optimizer, since order of transposition matters
+        weights_transformed = node.get_attr('_weights_transposed', False) is True
+
+        # User opted for Winograd
+        implementation_is_winograd = (
+            node.get_attr('implementation', 'combination') == 'combination'
+            or node.get_attr('implementation', 'combination') == 'winograd'
+        )
+
+        parallel_io_type = node.model.config.get_config_value('IOType') == 'io_parallel'
+
+        # Winograd algorithm-specific conditions
+        if isinstance(node, Conv1D):
+            # Winograd only applies to specific kernel sizes
+            # Current implementation only supports fs = 3; easily extendable to other filter sizes
+            filter_size_matches = node.get_attr('filt_width', 3) == 3
+
+            # Winograd's minimal filtering algorithm doesn't work with stride != 1
+            stride_is_one = node.get_attr('stride_width', 1) == 1
+
+            # HLS Compiler fails to pipeline the entire component if Winograd loop only executes once
+            loop_itr_gt_one = node.get_attr('out_width') > 2
+
+            winograd_conditions = filter_size_matches and stride_is_one and loop_itr_gt_one and parallel_io_type
+
+        elif isinstance(node, (Conv2D)):
+            # Winograd only applies to specific kernel sizes
+            # Current implementation only supports fs = 3; easily extendable to other filter sizes
+            filter_size_matches = node.get_attr('filt_height', 3) == 3 and node.get_attr('filt_width', 3) == 3
+
+            # Winograd's minimal filtering algorithm doesn't work with stride != 1
+            stride_is_one = node.get_attr('stride_height', 1) == 1 and node.get_attr('stride_width', 1) == 1
+
+            # HLS Compiler fails to pipeline the entire component if Winograd loop only executes once
+            loop_itr_gt_one = node.get_attr('out_height') > 2 and node.get_attr('out_width') > 2
+
+            padding_is_equal = node.get_attr('pad_top', 0) == node.get_attr('pad_bottom', 0) and node.get_attr(
+                'pad_left', 0
+            ) == node.get_attr('pad_right', 0)
+
+            winograd_conditions = (
+                filter_size_matches and stride_is_one and padding_is_equal and loop_itr_gt_one and parallel_io_type
+            )
+
+        else:
+            winograd_conditions = False
+
+        # Check any previous transformations
+        already_transformed = node.get_attr('_winograd_transformation_applied', False) is True
+
+        if not winograd_conditions and node.get_attr('implementation', 'combination') == 'winograd':
+            raise RuntimeError(
+                'Not possible to use Winograd algorithm with current architecture. '
+                'Please set implementation to im2col or combination'
+            )
+
+        return (
+            node_matches
+            and weights_transformed
+            and winograd_conditions
+            and not already_transformed
+            and implementation_is_winograd
+        )
+
+    def transform(self, model, node):
+        if isinstance(node, Conv1D):
+            if node.get_attr('filt_width', 3) == 3:
+                # First, transpose to a format suitable for the Winograd algorithm (F, C, W)
+                # Note, this assumes a format post-resource strategy optimizer, that is (F, W, C)
+                # Therefore, (F, W, C) => (F, C, W)
+                node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[0, 2, 1])
+
+                # Temporary copy of data
+                weights = node.weights['weight'].data
+
+                # Expand weight dimensionality (3) => (4)
+                node.weights['weight'].data = np.zeros((weights.shape[0], weights.shape[1], 4))
+
+                # Transformation matrices for 3x1 kernels
+                G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]])
+
+                # Transformation GfG'
+                for filter in range(0, weights.data.shape[0]):
+                    for channel in range(0, weights.data.shape[1]):
+                        node.weights['weight'].data[filter][channel] = np.matmul(G, weights[filter][channel])
+                node.weights['weight'].data_length = node.weights['weight'].data.size
+                # need to always be consistent
+                node.weights['weight'].type.n_elem = node.weights['weight'].data_length
+
+                # Winograd's minimal filtering algorithm transforms the weight matrix
+                # This transformation consists of addition and division (by 2&4) of the weight matrix
+                # Therefore, increase precision (if needed), to accommodate the new weights
+                # This error is only noticeable for low precisions, such as those used with QKeras
+
+                # Integer precision is only updated if it exceeds the one defined in hls4ml config
+                maximum_value_rounded = int(math.ceil(np.abs(node.weights['weight'].data).max()))
+                if maximum_value_rounded.bit_length() + 1 > node.weights['weight'].type.precision.integer:
+                    node.weights['weight'].type.precision.width += (
+                        maximum_value_rounded.bit_length() + 1 - node.weights['weight'].type.precision.integer
+                    )
+                    node.weights['weight'].type.precision.integer = maximum_value_rounded.bit_length() + 1
+
+                # Fractional precision is increased by 2 bits (division by 4),
+                # for low-precision (less than 8) fractional weights
+                if node.weights['weight'].type.precision.fractional < 8:
+                    node.weights['weight'].type.precision.width += 2
+
+                # Modified kernel size
+                node.set_attr('impl_filt_width', 4)
+
+        elif isinstance(node, Conv2D):
+            if node.get_attr('filt_height', 3) == 3 and node.get_attr('filt_width', 3) == 3:
+                # First, transpose to a format suitable for the Winograd algorithm (F, C, H, W)
+                # Note, this assumes a format post-resource strategy optimizer, that is (F, H, W, C)
+                # Therefore, (F, H, W, C) => (F, C, H, W)
+                node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[0, 3, 1, 2])
+
+                # Temporary copy of data
+                weights = node.weights['weight'].data
+
+                # Expand weight dimensionality (3x3) => (4x4)
+                node.weights['weight'].data = np.zeros((weights.shape[0], weights.shape[1], 4, 4))
+
+                # Transformation matrices for 3x3 kernels
+                G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]])
+                GT = np.array([[1, 0.5, 0.5, 0], [0, 0.5, -0.5, 0], [0, 0.5, 0.5, 1]])
+
+                # Transformation GfG'
+                for filter in range(0, weights.data.shape[0]):
+                    for channel in range(0, weights.data.shape[1]):
+                        node.weights['weight'].data[filter][channel] = np.matmul(np.matmul(G, weights[filter][channel]), GT)
+                node.weights['weight'].data_length = node.weights['weight'].data.size
+                # need to always be consistent
+                node.weights['weight'].type.n_elem = node.weights['weight'].data_length
+
+                # Winograd's minimal filtering algorithm transforms the weight matrix
+                # This transformation consists of addition and division (by 2&4) of the weight matrix
+                # Therefore, increase precision (if needed), to accommodate the new weights
+                # This error is only noticeable for low precisions, such as those used with QKeras
+
+                # Integer precision is only updated if it exceeds the one defined in hls4ml config
+                maximum_value_rounded = int(math.ceil(np.abs(node.weights['weight'].data).max()))
+                if maximum_value_rounded.bit_length() + 1 > node.weights['weight'].type.precision.integer:
+                    node.weights['weight'].type.precision.width += (
+                        maximum_value_rounded.bit_length() + 1 - node.weights['weight'].type.precision.integer
+                    )
+                    node.weights['weight'].type.precision.integer = maximum_value_rounded.bit_length() + 1
+
+                # Fractional precision is increased by 2 bits (division by 4),
+                # for low-precision (less than 8) fractional weights
+                if node.weights['weight'].type.precision.fractional < 8:
+                    node.weights['weight'].type.precision.width += 2
+
+                # Modified kernel size
+                node.set_attr('impl_filt_height', 4)
+                node.set_attr('impl_filt_width', 4)
+        else:
+            raise Exception(f'Unexpected layer {node.class_name} with Winograd kernel optimizer')
+
+        node.set_attr('_winograd_transformation_applied', True)
+
+        return False
diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
new file mode 100644
index 0000000000..5ccf1a5213
--- /dev/null
+++ b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -0,0 +1,351 @@
+from hls4ml.backends.backend import get_backend
+from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
+from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
+from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax
+
+# Dense templates
+
+dense_config_template = """struct config{index} : nnet::dense_config {{
+    static constexpr unsigned n_in = {n_in};
+    static constexpr unsigned n_out = {n_out};
+    static constexpr unsigned io_type = nnet::{iotype};
+    static constexpr unsigned n_zeros = {nzeros};
+    static constexpr unsigned n_nonzeros = {nonzeros};
+    static constexpr bool store_weights_in_bram = false;
+
+    static constexpr unsigned rf_pad = {rfpad};
+    static constexpr unsigned bf_pad = {bfpad};
+
+    static constexpr unsigned reuse_factor = {reuse};
+    static constexpr unsigned
compressed_block_factor = DIV_ROUNDUP(n_nonzeros, reuse_factor); + static constexpr unsigned reuse_factor_rounded = reuse_factor + rf_pad; + static constexpr unsigned block_factor = DIV_ROUNDUP(n_in*n_out, reuse_factor); + static constexpr unsigned block_factor_rounded = block_factor + bf_pad; + static constexpr unsigned multiplier_factor = MIN(n_in, reuse_factor); + static constexpr unsigned multiplier_limit = DIV_ROUNDUP(n_in*n_out, multiplier_factor); + static constexpr unsigned multiplier_scale = multiplier_limit/n_out; + + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {index_t.name} index_t; + + template + using product = nnet::product::{product_type}; +}};\n""" + +dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +dense_task_sequence_template = 'task_sequence> {name};' +dense_stream_function_template = '{name}.async({w}, {b});' +dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_stream.h'] + + +class DenseConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Dense) + self.template = dense_config_template + + def format(self, node): + params = self._default_config_params(node) + params['nzeros'] = node.get_weights('weight').nzeros + params['nonzeros'] = node.get_weights('weight').nonzeros + params['product_type'] = get_backend('oneAPI').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + + return self.template.format(**params) + + +class DenseFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Dense, include_header=dense_include_list) + self.template = dense_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + +class DenseTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(Dense) + self.template = dense_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + + return self.template.format(**params) + + +class DenseStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(Dense) + self.template = dense_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + +# BatchNormalization templates + +batchnorm_config_template = """struct config{index} : nnet::batchnorm_config {{ + static constexpr unsigned n_in = {n_in}; + static constexpr unsigned n_filt = {n_filt}; + static constexpr unsigned io_type = nnet::{iotype}; + static constexpr unsigned reuse_factor = {reuse}; + static constexpr bool store_weights_in_bram = false; + typedef {bias_t.name} bias_t; + typedef {scale_t.name} scale_t; + template + using product = nnet::product::{product_type}; +}};\n""" + +batchnorm_function_template = 'nnet::normalize<{input_t}, {output_t}, {config}>({input}, {output}, {scale}, {bias});' +batchnorm_task_sequence_template = 'task_sequence> {name};' +batchnorm_stream_function_template = '{name}.async({scale}, {bias});' +batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + + +class BatchNormalizationConfigTemplate(LayerConfigTemplate): + def 
__init__(self): + super().__init__(BatchNormalization) + self.template = batchnorm_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + params['product_type'] = get_backend('oneAPI').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) + + return self.template.format(**params) + + +class BatchNormalizationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalization, include_header=batchnorm_include_list) + self.template = batchnorm_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) + + +class BatchNormalizationTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(BatchNormalization) + self.template = batchnorm_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + + return self.template.format(**params) + + +class BatchNormalizationStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalization) + self.template = batchnorm_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) + + +# Activation templates + +activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static constexpr unsigned n_in = {n_in}; + static constexpr unsigned table_size = {table_size}; + static constexpr unsigned io_type = nnet::{iotype}; + static constexpr unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; +}};\n""" + +param_activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static constexpr unsigned n_in = {n_in}; + static constexpr unsigned table_size = {table_size}; + static constexpr unsigned io_type = nnet::{iotype}; + static constexpr unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; + typedef {param_t.name} param_t; +}};\n""" + +hard_activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static constexpr unsigned n_in = {n_in}; + static constexpr {slope_t.name} slope = {slope}; + static constexpr {shift_t.name} shift = {shift}; + static constexpr unsigned io_type = nnet::{iotype}; + static constexpr unsigned reuse_factor = {reuse}; +}};\n""" + +softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static constexpr unsigned n_in = {n_in}; + static constexpr unsigned table_size = {table_size}; + static constexpr unsigned io_type = nnet::{iotype}; + static constexpr unsigned reuse_factor = {reuse}; + static constexpr nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation}; + typedef {exp_table_t.name} exp_table_t; + typedef {inv_table_t.name} inv_table_t; +}};\n""" + +activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' +param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});' + +activ_task_sequence_template = 'task_sequence> {name};' +activ_stream_function_template = '{name}.async();' +param_activ_stream_function_template = '{name}.async({param});' + 
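The task-sequence and stream-function templates in this file come in pairs: the first declares a task_sequence object in the generated SYCL source, and the second emits the .async(...) call that launches it. A hedged sketch of the rendered output for a Dense layer follows; all parameter values are hypothetical, and the nnet::dense_resource_stream functor name is an assumption about the truncated task-sequence template above, not taken from this patch:

# Hypothetical parameter values; in the backend they come from
# TaskSequenceTemplate._default_function_params(node)
params = {
    'name': 'dense2',
    'config': 'config2',
    'input_pipe': 'Layer1OutPipe',
    'output_pipe': 'Layer2OutPipe',
    'w': 'w2',
    'b': 'b2',
}

# Assumed full form of the dense task-sequence template defined earlier
dense_task_sequence = 'task_sequence<nnet::dense_resource_stream<{input_pipe}, {output_pipe}, {config}>> {name};'
dense_stream_function = '{name}.async({w}, {b});'

print(dense_task_sequence.format(**params))
# task_sequence<nnet::dense_resource_stream<Layer1OutPipe, Layer2OutPipe, config2>> dense2;
print(dense_stream_function.format(**params))
# dense2.async(w2, b2);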
+activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] + + +class ActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Activation) + self.template = activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class ParamActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((ParametrizedActivation, PReLU)) + self.template = param_activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class HardActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(HardActivation) + self.template = hard_activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class SoftmaxConfigTemplate(ActivationConfigTemplate): + def __init__(self): + super(ActivationConfigTemplate, self).__init__(Softmax) # Skip ActivationConfigTemplate's __init__ + self.template = softmax_config_template + + +class ActivationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list) + self.template = activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['config'] = f"{node.get_attr('activation')}_config{node.index}" + + return self.template.format(**params) + + +class ParametrizedActivationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(ParametrizedActivation, include_header=activ_include_list) + self.template = param_activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node._get_act_function_name() + params['param'] = node.get_attr('activ_param', 1.0) + params['config'] = f"{node.get_attr('activation')}_config{node.index}" + + return self.template.format(**params) + + +class PReLUFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(PReLU, include_header=activ_include_list) + self.template = param_activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['param'] = node.get_weights('param').name + params['config'] = f"{node.get_attr('activation')}_config{node.index}" + + return self.template.format(**params) + + +class ActivationTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__((Activation, HardActivation, Softmax, PReLU)) + self.template = activ_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['config'] = f"{node.get_attr('activation')}_config{node.index}" + return self.template.format(**params) + + +class ParametrizedActivationTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(ParametrizedActivation) + self.template = activ_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node._get_act_function_name() + 
params['config'] = f"{node.get_attr('activation')}_config{node.index}" + return self.template.format(**params) + + +class ActivationStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__((Activation, HardActivation, Softmax)) + self.template = activ_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + return self.template.format(**params) + + +class ParametrizedActivationStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(ParametrizedActivation) + self.template = param_activ_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['param'] = node.get_attr('activ_param', 1.0) + return self.template.format(**params) + + +class PReLUActivationStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(PReLU) + self.template = param_activ_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['param'] = node.get_weights('param').name + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/embedding_templates.py b/hls4ml/backends/oneapi/passes/embedding_templates.py new file mode 100644 index 0000000000..6fda678f05 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/embedding_templates.py @@ -0,0 +1,32 @@ +""" +These are the stream oneAPI templates for embedding layers. The io_parallel ones are in backends/fpga/passes/embedding.py. +""" + +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.model.layers import Embedding + +embed_task_sequence_template = 'task_sequence> {name};' +embed_stream_function_template = '{name}.async({e});' + + +class EmbeddingTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(Embedding) + self.template = embed_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + + return self.template.format(**params) + + +class EmbeddingStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(Embedding) + self.template = embed_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['e'] = node.get_weights('embeddings').name + + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/merge_templates.py b/hls4ml/backends/oneapi/passes/merge_templates.py new file mode 100644 index 0000000000..c38e1e055f --- /dev/null +++ b/hls4ml/backends/oneapi/passes/merge_templates.py @@ -0,0 +1,137 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Concatenate, Dot, Merge + +# TODO - Very similar to vivado/merge_templates.py - only difference is on line 67: +# TODO - get_backend('vivado').product_type(inp1.type.precision, inp2.type.precision) +# TODO - Look into ways of having passes similar accross many backends in a shared folder thorugh inheritance and overriding. 
+ +# Merge templates +merge_config_template = """struct config{index} : nnet::merge_config {{ + static const unsigned n_elem = {n_elem}; +}};\n""" + +merge_function_template = 'nnet::{merge}<{input1_t}, {input2_t}, {output_t}, {config}>({input1}, {input2}, {output});' + +merge_task_sequence_template = ( + 'task_sequence> {name};' +) + +merge_stream_function_template = '{name}.async();' + +merge_include_list = ['nnet_utils/nnet_merge.h', 'nnet_utils/nnet_merge_stream.h'] + + +class MergeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Merge) + self.template = merge_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_elem'] = node.get_input_variable(node.inputs[0]).size_cpp() + + return self.template.format(**params) + + +class MergeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Merge, Concatenate, Dot), include_header=merge_include_list) + self.template = merge_function_template + + def format(self, node): + params = self._default_function_params(node) + params['merge'] = node.get_attr('op').lower() + params['input1_t'] = node.get_input_variable(node.inputs[0]).type.name + params['input2_t'] = node.get_input_variable(node.inputs[1]).type.name + params['input1'] = node.get_input_variable(node.inputs[0]).name + params['input2'] = node.get_input_variable(node.inputs[1]).name + + return self.template.format(**params) + + +class MergeTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__((Merge, Concatenate, Dot)) + self.template = merge_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['merge'] = node.get_attr('op').lower() + params['input1_pipe'] = node.get_input_variable(node.inputs[0]).pipe_name + params['input2_pipe'] = node.get_input_variable(node.inputs[1]).pipe_name + return self.template.format(**params) + + +class MergeStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__((Merge, Concatenate, Dot)) + self.template = merge_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + + return self.template.format(**params) + + +# Dot templates +dot_config_template = """struct config{index} : nnet::dot_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + + static const unsigned reuse_factor = {reuse}; + + typedef {accum_t.name} accum_t; + + template + using product = nnet::product::{product_type}; +}};\n""" + + +class DotConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Dot) + self.template = dot_config_template + + def format(self, node): + inp1 = node.get_input_variable(node.inputs[0]) + inp2 = node.get_input_variable(node.inputs[1]) + params = self._default_config_params(node) + params['n_out'] = 1 + params['n_in'] = inp1.shape[0] + params['product_type'] = get_backend('oneAPI').product_type(inp1.type.precision, inp2.type.precision) + + return self.template.format(**params) + + +# Concatenate templates +concat_config_template = """struct config{index} : nnet::concat_config {{ + static const unsigned n_elem1_0 = {n_elem1_0}; + static const unsigned n_elem1_1 = {n_elem1_1}; + static const unsigned n_elem1_2 = {n_elem1_2}; + static const unsigned n_elem2_0 = {n_elem2_0}; + static const unsigned n_elem2_1 = {n_elem2_1}; + static const unsigned n_elem2_2 = {n_elem2_2}; + + static const int axis = {axis}; +}};\n""" + + +class 
ConcatenateConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Concatenate) + self.template = concat_config_template + + def format(self, node): + params = self._default_config_params(node) + for i in range(3): + params.setdefault(f'n_elem1_{i}', 0) + params.setdefault(f'n_elem2_{i}', 0) + inp1 = node.get_input_variable(node.inputs[0]) + inp2 = node.get_input_variable(node.inputs[1]) + for i, (s1, s2) in enumerate(zip(inp1.shape, inp2.shape)): + params[f'n_elem1_{i}'] = s1 + params[f'n_elem2_{i}'] = s2 + + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/pointwise.py b/hls4ml/backends/oneapi/passes/pointwise.py new file mode 100644 index 0000000000..ccf410d1f6 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/pointwise.py @@ -0,0 +1,156 @@ +from hls4ml.backends.fpga.fpga_layers import PointwiseConv1D, PointwiseConv2D +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.oneapi.passes.convolution_templates import ( + Conv1DConfigTemplate, + Conv2DConfigTemplate, + conv1d_config_template, + conv2d_config_template, + conv_mult_config_template, +) +from hls4ml.backends.template import FunctionCallTemplate +from hls4ml.model.layers import register_layer +from hls4ml.model.optimizer import OptimizerPass + +''' +Custom hls4ml layer implementation for 1x1 Conv filters using im2col. +Allows lower latency and resource usage due to fewer loop invocations. +''' + +pointwise_conv1d_function_template = ( + 'nnet::pointwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) +pointwise_conv2d_function_template = ( + 'nnet::pointwise_conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) + +pointwise_conv1d_task_sequence_template = ( + 'task_sequence> {name};' +) + +pointwise_conv2d_task_sequence_template = ( + 'task_sequence> {name};' +) + +pointwise_conv_stream_function_template = '{name}.async({w}, {b});' + +sepconv1d_include_list = ['nnet_utils/nnet_conv1d.h'] +sepconv2d_include_list = ['nnet_utils/nnet_conv2d.h'] + + +class PointwiseConv1DConfigTemplate(Conv1DConfigTemplate): + def __init__(self): + super(Conv1DConfigTemplate, self).__init__(PointwiseConv1D) + self.template = conv1d_config_template + self.mult_template = conv_mult_config_template + + +class PointwiseConv1DFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(PointwiseConv1D, include_header=sepconv1d_include_list) + self.template = pointwise_conv1d_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise RuntimeError('channels_first not supported on oneAPI') + params['data_format'] = 'cl' + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + +class PointwiseConv1DTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(PointwiseConv1D) + self.template = pointwise_conv1d_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise RuntimeError('channels_first not supported on oneAPI') + params['data_format'] = 'cl' + return self.template.format(**params) + + +class PointwiseConv2DConfigTemplate(Conv2DConfigTemplate): + def __init__(self): + super(Conv2DConfigTemplate, self).__init__(PointwiseConv2D) + self.template = conv2d_config_template + self.mult_template = conv_mult_config_template + + +class PointwiseConv2DFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(PointwiseConv2D, include_header=sepconv2d_include_list) + self.template = pointwise_conv2d_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise RuntimeError('channels_first not supported on oneAPI') + params['data_format'] = 'cl' + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + +class PointwiseConv2DTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(PointwiseConv2D) + self.template = pointwise_conv2d_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise RuntimeError('channels_first not supported on oneAPI') + params['data_format'] = 'cl' + return self.template.format(**params) + + +class PointwiseConvStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__((PointwiseConv1D, PointwiseConv2D)) + self.template = pointwise_conv_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + +def register_pointwise(backend): + # Register the layer types to the layer map + register_layer('PointwiseConv1D', PointwiseConv1D) + register_layer('PointwiseConv2D', PointwiseConv2D) + + # Register the optimization passes + backend.register_pass('optimize_pointwise_conv', OptimizePointwiseConv) + + # Register template passes + backend.register_template(PointwiseConv1DConfigTemplate) + backend.register_template(PointwiseConv1DFunctionTemplate) + backend.register_template(PointwiseConv2DConfigTemplate) + backend.register_template(PointwiseConv2DFunctionTemplate) + + +class OptimizePointwiseConv(OptimizerPass): + def match(self, node): + return ( + node.class_name in ('Conv1D', 'Conv2D') + and node.get_attr('filt_height', 1) == 1 + and node.get_attr('filt_width') == 1 + and node.model.config.get_config_value('IOType') == 'io_parallel' + ) + + def transform(self, model, node): + dim = node.__class__.__name__[-2:] # '1D' or '2D' + new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')} + pw_node = model.make_node( + 'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy() + ) + model.replace_node(node, pw_node) + + return True diff --git a/hls4ml/backends/oneapi/passes/pooling_templates.py b/hls4ml/backends/oneapi/passes/pooling_templates.py new file mode 100644 index 0000000000..97136ed847 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/pooling_templates.py @@ -0,0 +1,153 @@ +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import GlobalPooling1D, GlobalPooling2D, Pooling1D, Pooling2D + +pooling1d_config_template = """struct config{index} : nnet::pooling1d_config {{ + static const unsigned stride_width = {stride_width}; + static const unsigned pool_width = {pool_width}; + + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned filt_width = {pool_width}; + + static
const unsigned n_filt = {n_filt}; + static const unsigned n_chan = {n_filt}; + + static const unsigned in_width = {n_in}; + + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const bool count_pad = {count_pad}; + + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +pooling2d_config_template = """struct config{index} : nnet::pooling2d_config {{ + static const unsigned stride_height = {stride_height}; + static const unsigned stride_width = {stride_width}; + + static const unsigned pool_height = {pool_height}; + static const unsigned pool_width = {pool_width}; + static const unsigned filt_height = {pool_height}; + static const unsigned filt_width = {pool_width}; + + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + + static const unsigned n_filt = {n_filt}; + static const unsigned n_chan = {n_filt}; + + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const bool count_pad = {count_pad}; + + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +global_pooling1d_config_template = """struct config{index} : nnet::pooling1d_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +global_pooling2d_config_template = """struct config{index} : nnet::pooling2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_filt = {n_filt}; + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +pooling1d_function_template = 'nnet::pooling1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +pooling2d_function_template = 'nnet::pooling2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +global_pooling1d_function_template = ( + 'nnet::global_pooling1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +) +global_pooling2d_function_template = ( + 'nnet::global_pooling2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +) + +pooling1d_task_sequence_template = ( + 'task_sequence>({name});' +) +pooling2d_task_sequence_template = ( + 'task_sequence>({name});' +) +global_pooling1d_task_sequence_template = ( + 'task_sequence>({name});' +) +global_pooling2d_task_sequence_template = ( + 'task_sequence>({name});' +) + +pooling_stream_function_template = '{name}.async();' + +pooling_include_list = ['nnet_utils/nnet_pooling.h', 'nnet_utils/nnet_pooling_stream.h'] + + +class PoolingConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D)) + self.templates = { + 'Pooling1D': pooling1d_config_template, + 'Pooling2D': pooling2d_config_template, + 'GlobalPooling1D': global_pooling1d_config_template, + 'GlobalPooling2D': global_pooling2d_config_template, + } + + def format(self, node): + params = self._default_config_params(node) + return self.templates[node.class_name].format(**params) + + +class PoolingFunctionTemplate(FunctionCallTemplate): + def __init__(self): + 
super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D), include_header=pooling_include_list) + self.templates = { + 'Pooling1D': pooling1d_function_template, + 'Pooling2D': pooling2d_function_template, + 'GlobalPooling1D': global_pooling1d_function_template, + 'GlobalPooling2D': global_pooling2d_function_template, + } + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise Exception('channels_first not supported for oneAPI') + params['data_format'] = 'cl' + return self.templates[node.class_name].format(**params) + + +class PoolingTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D)) + self.templates = { + 'Pooling1D': pooling1d_task_sequence_template, + 'Pooling2D': pooling2d_task_sequence_template, + 'GlobalPooling1D': global_pooling1d_task_sequence_template, + 'GlobalPooling2D': global_pooling2d_task_sequence_template, + } + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise Exception('channels_first not supported for oneAPI') + params['data_format'] = 'cl' + return self.templates[node.class_name].format(**params) + + +class PoolingStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D)) + self.template = pooling_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/quantization_templates.py b/hls4ml/backends/oneapi/passes/quantization_templates.py new file mode 100644 index 0000000000..c46e174852 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/quantization_templates.py @@ -0,0 +1,63 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.oneapi.passes.core_templates import ( + batchnorm_config_template, + batchnorm_function_template, + batchnorm_include_list, + batchnorm_stream_function_template, + batchnorm_task_sequence_template, +) +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.optimizer.passes.qkeras import ApplyAlpha + + +class ApplyAlphaConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(ApplyAlpha) + self.template = batchnorm_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + params['product_type'] = get_backend('oneAPI').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) + + return self.template.format(**params) + + +class ApplyAlphaFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(ApplyAlpha, include_header=batchnorm_include_list) + self.template = batchnorm_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) + + +class ApplyAlphaTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(ApplyAlpha) + self.template = batchnorm_task_sequence_template + + def format(self, node): + params = 
self._default_function_params(node) + + return self.template.format(**params) + + +class ApplyAlphaStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(ApplyAlpha) + self.template = batchnorm_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/recurrent_templates.py b/hls4ml/backends/oneapi/passes/recurrent_templates.py new file mode 100644 index 0000000000..00cd168790 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/recurrent_templates.py @@ -0,0 +1,369 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import GRU, LSTM, SimpleRNN + +# Note: currently only GRU is supported for stream; lstm and simpleRNN are parallel-only + +recurrent_include_list = ['nnet_utils/nnet_recurrent.h', 'nnet_utils/nnet_recurrent_stream.h'] + +################################################ +# Shared Matrix Multiplication Template (Dense) +################################################ +recr_mult_x_config_template = '''struct config{index}_mult : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + + static const unsigned rf_pad = {rfpad}; + static const unsigned bf_pad = {bfpad}; + static const unsigned reuse_factor = {reuse}; + static const unsigned reuse_factor_rounded = reuse_factor + rf_pad; + static const unsigned block_factor = DIV_ROUNDUP(n_in*n_out, reuse_factor); + static const unsigned block_factor_rounded = block_factor + bf_pad; + static const unsigned multiplier_factor = MIN(n_in, reuse_factor); + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in*n_out, multiplier_factor); + static const unsigned multiplier_scale = multiplier_limit/n_out; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + + template + using product = nnet::product::{product_type}; +}};\n''' + +recr_mult_h_config_template = '''struct config{index}_mult : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + + static const unsigned rf_pad = {rfpad}; + static const unsigned bf_pad = {bfpad}; + static const unsigned reuse_factor = {reuse}; + static const unsigned reuse_factor_rounded = reuse_factor + rf_pad; + static const unsigned block_factor = DIV_ROUNDUP(n_in*n_out, reuse_factor); + static const unsigned block_factor_rounded = block_factor + bf_pad; + static const unsigned multiplier_factor = MIN(n_in, reuse_factor); + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in*n_out, multiplier_factor); + static const unsigned multiplier_scale = multiplier_limit/n_out; + typedef {accum_t.name} accum_t; + typedef {recurrent_bias_t.name} bias_t; + typedef {recurrent_weight_t.name} weight_t; + + template + using product = nnet::product::{product_type}; +}};\n''' + +################################################ +# Shared Activation Template +################################################ +activ_config_template = '''struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = 
nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; +}};\n''' + +################################################ +# GRU Template +################################################ +gru_config_template = '''struct config{index} : nnet::gru_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_units = {n_units}; + static const unsigned n_timesteps = {n_timesteps}; + static const unsigned n_outputs = {n_outputs}; + static const bool return_sequences = {return_sequences}; + + typedef {accum_t.name} accum_t; + typedef {weight_t.name} weight_t; + typedef {bias_t.name} bias_t; + typedef {recurrent_weight_t.name} recurrent_weight_t; + typedef {recurrent_bias_t.name} recurrent_bias_t; + + typedef {config_mult_x} mult_config_x; + typedef {config_mult_h} mult_config_h; + + typedef {act_t} ACT_CONFIG_T; + template + using activation = nnet::activation::{activation}; + + typedef {act_recurrent_t} ACT_CONFIG_RECURRENT_T; + template + using activation_recr = nnet::activation::{recurrent_activation}; + + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; +}};\n''' + +gru_function_template = 'nnet::gru<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {wr}, {b}, {br});' +gru_task_sequence_template = 'task_sequence> {name};' +gru_stream_function_template = '{name}.async({w}, {wr}, {b}, {br});' + + +class GRUConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(GRU) + self.gru_template = gru_config_template + self.act_template = activ_config_template + self.recr_act_template = activ_config_template + self.mult_x_template = recr_mult_x_config_template + self.mult_h_template = recr_mult_h_config_template + + def format(self, node): + # Input has shape (n_timesteps, inp_dimensionality) + # Output / hidden units has shape (1 if !return_sequences else n_timesteps , n_units) + params = self._default_config_params(node) + params['n_units'] = node.get_attr('n_out') + params['n_outputs'] = node.get_attr('n_timesteps') if node.get_attr('return_sequences', False) else '1' + params['return_sequences'] = 'true' if node.get_attr('return_sequences', False) else 'false' + params['config_mult_x'] = f'config{node.index}_x_mult' + params['config_mult_h'] = f'config{node.index}_h_mult' + params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), str(node.index) + '_act') + params['act_recurrent_t'] = '{}_config{}'.format(node.get_attr('recurrent_activation'), str(node.index) + '_rec_act') + gru_config = self.gru_template.format(**params) + + # Activation is on candidate hidden state, dimensionality (1, n_units) + act_params = self._default_config_params(node) + act_params['type'] = node.get_attr('activation') + act_params['n_in'] = node.get_attr('n_out') + act_params['index'] = str(node.index) + '_act' + act_config = self.act_template.format(**act_params) + + # Recurrent activation is on reset and update gates (therefore x2), dimensionality (1, n_units) + recr_act_params = self._default_config_params(node) + recr_act_params['type'] = node.get_attr('recurrent_activation') + recr_act_params['n_in'] = str(node.get_attr('n_out')) + ' * 2' + recr_act_params['index'] = str(node.index) + '_rec_act' + recr_act_config = self.recr_act_template.format(**recr_act_params) + + # Multiplication config for matrix multiplications of type Wx (reset, update and candidate states) + mult_params_x = self._default_config_params(node) + mult_params_x['n_in'] = 
node.get_attr('n_in') + mult_params_x['n_out'] = str(node.get_attr('n_out')) + ' * 3' + mult_params_x['product_type'] = get_backend('oneAPI').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + mult_params_x['index'] = str(node.index) + '_x' + mult_config_x = self.mult_x_template.format(**mult_params_x) + + # Multiplication config for matrix multiplications of type Wh (reset, update and candidate states) + mult_params_h = self._default_config_params(node) + mult_params_h['n_in'] = node.get_attr('n_out') + mult_params_h['n_out'] = str(node.get_attr('n_out')) + ' * 3' + mult_params_h['reuse_factor'] = params['recurrent_reuse_factor'] + mult_params_h['product_type'] = get_backend('oneAPI').product_type( + node.get_input_variable().type.precision, node.get_weights('recurrent_weight').type.precision + ) + mult_params_h['index'] = str(node.index) + '_h' + mult_config_h = self.mult_h_template.format(**mult_params_h) + + return mult_config_x + '\n' + mult_config_h + '\n' + recr_act_config + '\n' + act_config + '\n' + gru_config + + +class GRUFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(GRU, include_header=recurrent_include_list) + self.template = gru_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + params['wr'] = node.get_weights('recurrent_weight').name + params['br'] = node.get_weights('recurrent_bias').name + return self.template.format(**params) + + +class GRUTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(GRU) + self.template = gru_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + + return self.template.format(**params) + + +class GRUStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(GRU) + self.template = gru_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + params['wr'] = node.get_weights('recurrent_weight').name + params['br'] = node.get_weights('recurrent_bias').name + + return self.template.format(**params) + + +################################################ +# LSTM Template +################################################ +lstm_config_template = """struct config{index} : nnet::lstm_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_timesteps = {n_timesteps}; + static const unsigned return_sequences = {return_sequences}; + + typedef {accum_t.name} accum_t; + typedef {weight_i_t.name} weight_i_t; + typedef {bias_i_t.name} bias_i_t; + typedef {weight_f_t.name} weight_f_t; + typedef {bias_f_t.name} bias_f_t; + typedef {weight_c_t.name} weight_c_t; + typedef {bias_c_t.name} bias_c_t; + typedef {weight_o_t.name} weight_o_t; + typedef {bias_o_t.name} bias_o_t; + typedef {recurrent_weight_i_t.name} recurrent_weight_i_t; + typedef {recurrent_weight_f_t.name} recurrent_weight_f_t; + typedef {recurrent_weight_c_t.name} recurrent_weight_c_t; + typedef {recurrent_weight_o_t.name} recurrent_weight_o_t; + typedef {act_t} ACT_CONFIG_T; + template + using activation = nnet::activation::{activation}; + + typedef {act_recurrent_t} ACT_CONFIG_RECURRENT_T; + template + using activation_recr = nnet::activation::{recurrent_activation}; + + static 
const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; +}};\n""" + +lstm_function_template = 'nnet::lstm<{input_t}, {output_t}, {config}>({input}, {output}, {weights});' + + +class LSTMConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(LSTM) + self.template = lstm_config_template + self.act_template = activ_config_template + self.recr_act_template = activ_config_template + + def format(self, node): + lstm_params = self._default_config_params(node) + lstm_params['n_in'] = node.get_attr('n_in') + lstm_params['n_out'] = node.get_attr('n_out') + lstm_params['n_outputs'] = node.get_attr('n_timesteps') if node.get_attr('return_sequences', False) else '1' + + lstm_params['return_sequences'] = str(node.get_attr('return_sequences')).lower() + lstm_params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), str(node.index) + '_act') + lstm_params['act_recurrent_t'] = '{}_config{}'.format( + node.get_attr('recurrent_activation'), str(node.index) + '_rec_act' + ) + lstm_config = self.template.format(**lstm_params) + + act_params = self._default_config_params(node) + act_params['type'] = node.get_attr('activation') + act_params['n_in'] = node.get_attr('n_out') + act_params['index'] = str(node.index) + '_act' + act_config = self.act_template.format(**act_params) + + recr_act_params = self._default_config_params(node) + recr_act_params['type'] = node.get_attr('recurrent_activation') + recr_act_params['n_in'] = node.get_attr('n_out') + recr_act_params['index'] = str(node.index) + '_rec_act' + recr_act_config = self.recr_act_template.format(**recr_act_params) + + return act_config + '\n' + recr_act_config + '\n' + lstm_config + + +class LSTMFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(LSTM, include_header=recurrent_include_list) + self.template = lstm_function_template + + def format(self, node): + params = self._default_function_params(node) + + types = ['i', 'f', 'c', 'o'] + params['weights'] = '' + for t in types: + params['weights'] += f'kernel_{t}_{str(node.index)},' + for t in types: + params['weights'] += f'recurrent_kernel_{t}_{str(node.index)},' + for t in types: + params['weights'] += 'bias_{}_{}{}'.format(t, str(node.index), ',' if t != 'o' else '') + + return self.template.format(**params) + + +################################################ +# SimpleRNN Template +################################################ +simple_rnn_config_template = """struct config{index} : nnet::simpleRNN_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_outputs = {n_outputs}; + static const unsigned n_timesteps = {n_timesteps}; + static const unsigned return_sequences = {return_sequences}; + + typedef {accum_t.name} accum_t; + typedef {weight_t.name} weight_t; + typedef {bias_t.name} bias_t; + typedef {recurrent_weight_t.name} recurrent_weight_t; + + typedef {act_t} ACT_CONFIG_T; + template + using activation = nnet::activation::{activation}; + + typedef {act_recurrent_t} ACT_CONFIG_RECURRENT_T; + template + using activation_recr = nnet::activation::{recurrent_activation}; + + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; +}};\n""" + +simple_rnn_function_template = 'nnet::simple_rnn<{input_t}, {output_t}, {config}>({input}, {output}, {weights});' + + +class SimpleRNNConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(SimpleRNN) + self.template = 
simple_rnn_config_template + self.act_template = activ_config_template + self.recr_act_template = activ_config_template + + def format(self, node): + simple_rnn_params = self._default_config_params(node) + simple_rnn_params['n_in'] = node.get_attr('n_in') + simple_rnn_params['n_out'] = node.get_attr('n_out') + simple_rnn_params['n_outputs'] = node.get_attr('n_timesteps') if node.get_attr('return_sequences', False) else '1' + simple_rnn_params['return_sequences'] = str(node.get_attr('return_sequences')).lower() + simple_rnn_params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), str(node.index) + '_act') + simple_rnn_params['act_recurrent_t'] = '{}_config{}'.format( + node.get_attr('recurrent_activation'), str(node.index) + '_rec_act' + ) + simple_rnn_params['recurrent_activation'] = 'relu' + + simple_rnn_config = self.template.format(**simple_rnn_params) + + act_params = self._default_config_params(node) + act_params['type'] = node.get_attr('activation') + act_params['n_in'] = node.get_attr('n_out') + act_params['index'] = str(node.index) + '_act' + act_config = self.act_template.format(**act_params) + + recr_act_params = self._default_config_params(node) + recr_act_params['type'] = node.get_attr('recurrent_activation') + recr_act_params['n_in'] = node.get_attr('n_out') + recr_act_params['index'] = str(node.index) + '_rec_act' + recr_act_config = self.recr_act_template.format(**recr_act_params) + + return act_config + '\n' + recr_act_config + '\n' + simple_rnn_config + + +class SimpleRNNFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(SimpleRNN, include_header=recurrent_include_list) + self.template = simple_rnn_function_template + + def format(self, node): + params = self._default_function_params(node) + params['weights'] = 'w{0}, wr{0}, b{0}'.format(str(node.index)) + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/reshaping_templates.py b/hls4ml/backends/oneapi/passes/reshaping_templates.py new file mode 100644 index 0000000000..85357cdb2d --- /dev/null +++ b/hls4ml/backends/oneapi/passes/reshaping_templates.py @@ -0,0 +1,244 @@ +import numpy as np + +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Reshape, Resize, Transpose, ZeroPadding1D, ZeroPadding2D + +# ZeroPadding templates + +zeropad1d_config_template = """struct config{index} : nnet::padding1d_config {{ + static const unsigned in_width = {in_width}; + static const unsigned out_width = {out_width}; + static const unsigned n_chan = {n_chan}; + + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; +}};\n""" + +zeropad2d_config_template = """struct config{index} : nnet::padding2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + static const unsigned n_chan = {n_chan}; + + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; +}};\n""" + +zeropad1d_function_template = 'nnet::zeropad1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +zeropad2d_function_template = 'nnet::zeropad2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' 
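To make the template mechanics concrete, here is a small self-contained sketch of how one of the function templates above is rendered via str.format; the parameter values are invented for illustration (in hls4ml they come from _default_function_params(node)).

# Standalone illustration using the zeropad1d function template defined above.
zeropad1d_function_template = 'nnet::zeropad1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});'

# Hypothetical values of the kind _default_function_params(node) would supply.
params = {
    'data_format': 'cl',  # oneAPI only supports channels_last
    'input_t': 'layer2_t',
    'output_t': 'layer3_t',
    'config': 'config3',
    'input': 'layer2_out',
    'output': 'layer3_out',
}

print(zeropad1d_function_template.format(**params))
# nnet::zeropad1d_cl<layer2_t, layer3_t, config3>(layer2_out, layer3_out);

The task-sequence and stream-function templates below are rendered the same way: the former declares the kernel object once, the latter launches it per inference with {name}.async().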
+ +zeropad1d_task_sequence_template = ( + 'task_sequence> {name};' +) +zeropad2d_task_sequence_template = ( + 'task_sequence> {name};' +) + +reshaping_stream_function_template = '{name}.async();' + +padding_include_list = ['nnet_utils/nnet_padding.h', 'nnet_utils/nnet_padding_stream.h'] + + +class ZeroPaddingConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D)) + self.templates = { + 'ZeroPadding1D': zeropad1d_config_template, + 'ZeroPadding2D': zeropad2d_config_template, + } + + def format(self, node): + params = self._default_config_params(node) + return self.templates[node.class_name].format(**params) + + +class ZeroPaddingFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D), include_header=padding_include_list) + self.templates = { + 'ZeroPadding1D': zeropad1d_function_template, + 'ZeroPadding2D': zeropad2d_function_template, + } + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise Exception('oneAPI only supports channels_last data format') + params['data_format'] = 'cl' + + return self.templates[node.class_name].format(**params) + + +class ZeroPaddingTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D)) + self.templates = { + 'ZeroPadding1D': zeropad1d_task_sequence_template, + 'ZeroPadding2D': zeropad2d_task_sequence_template, + } + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise RuntimeError('channels_first not supported on oneAPI') + params['data_format'] = 'cl' + + return self.templates[node.class_name].format(**params) + + +class ReshapingStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D, Resize, Reshape, Transpose)) + self.template = reshaping_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + + return self.template.format(**params) + + +# Resize templates + +resize_config_template = """struct config{index} : nnet::resize_config {{ + static const unsigned height = {in_height}; + static const unsigned width = {in_width}; + + static const unsigned new_height = {out_height}; + static const unsigned new_width = {out_width}; + + static const unsigned n_chan = {n_chan}; +}};\n""" + +resize_function_template = 'nnet::resize_{algorithm}<{input_t}, {output_t}, {config}>({input}, {output});' +resize_task_sequence_template = ( + 'task_sequence> {name};' +) +resize_include_list = ['nnet_utils/nnet_resize.h', 'nnet_utils/nnet_resize_stream.h'] + + +class ResizeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Resize) + self.template = resize_config_template + + def format(self, node): + params = self._default_config_params(node) + + return self.template.format(**params) + + +class ResizeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Resize, include_header=resize_include_list) + self.template = resize_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('algorithm') != 'nearest': + raise Exception('Currently only supporting resize_nearest') + params['algorithm'] = node.get_attr('algorithm') + + return self.template.format(**params) + + +class ResizeTaskSequenceTemplate(TaskSequenceTemplate): + def 
__init__(self): + super().__init__(Resize) + self.template = resize_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('algorithm') != 'nearest': + raise Exception('Currently only supporting resize_nearest') + params['algorithm'] = node.get_attr('algorithm') + + return self.template.format(**params) + + +# Transpose templates + +transpose_config_template = """struct config{index} : nnet::transpose_config {{ + static const unsigned depth = {depth}; + static const unsigned height = {height}; + static const unsigned width = {width}; + static constexpr unsigned perm[3] = {{{perm_str}}}; +}};\n""" + +transpose_function_template = 'nnet::transpose_{dim}<{input_t}, {output_t}, {config}>({input}, {output});' +transpose_task_sequence_template = ( + 'task_sequence> {name};' +) +transpose_include_list = ['nnet_utils/nnet_transpose.h', 'nnet_utils/nnet_transpose_stream.h'] + + +class TransposeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Transpose) + self.template = transpose_config_template + + def format(self, node): + params = self._default_config_params(node) + + return self.template.format(**params) + + +class TransposeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Transpose, include_header=transpose_include_list) + self.template = transpose_function_template + + def format(self, node): + params = self._default_function_params(node) + params['dim'] = node.get_attr('dim') + + return self.template.format(**params) + + +class TransposeTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(Transpose) + self.template = transpose_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['dim'] = node.get_attr('dim') + + return self.template.format(**params) + + +# Reshape template (only used in streaming) +reshape_task_sequence_template = 'task_sequence> {name};' +reshape_include_list = ['nnet_utils/nnet_stream.h'] + + +class ReshapeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Reshape) + + def format(self, node): + return '' + + +class ReshapeFunctionTemplate(FunctionCallTemplate): + """Only used to add the include list""" + + def __init__(self): + super().__init__(Reshape, include_header=reshape_include_list) + + def format(self, node): + return '' + + +class ReshapeTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(Reshape) + self.template = reshape_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['size'] = np.prod(node.get_output_variable().shape) + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/resource_strategy.py b/hls4ml/backends/oneapi/passes/resource_strategy.py new file mode 100644 index 0000000000..15af1d197b --- /dev/null +++ b/hls4ml/backends/oneapi/passes/resource_strategy.py @@ -0,0 +1,77 @@ +import numpy as np + +from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense, SimpleRNN +from hls4ml.model.optimizer import OptimizerPass + + +class ApplyResourceStrategy(OptimizerPass): + '''Transposes the weights to use the dense_resource matrix multiply routine''' + + def match(self, node): + node_matches = isinstance(node, (Dense, Conv1D, Conv2D, GRU, LSTM, SimpleRNN)) + is_resource_strategy = ( + True # node.get_attr('strategy', '').lower() == 'resource' -> oneAPI only supports the Resource strategy + ) + already_transformed
= node.get_attr('_weights_transposed', False) is True + return node_matches and is_resource_strategy and not already_transformed + + def transform(self, model, node): + if isinstance(node, Dense) and not node.model.config.get_compression(node): + rf = node.get_attr('reuse_factor') + bf = int((node.attributes['n_in'] * node.attributes['n_out']) / rf) + bf_rounded = int(pow(2, np.ceil(np.log2(bf)))) + rf_rounded = int(pow(2, np.ceil(np.log2(rf)))) + + node.weights['weight'].data = np.transpose(node.weights['weight'].data).flatten() + + if node.attributes['n_in'] * node.attributes['n_out'] > 2048 and rf_rounded != rf: + node.set_attr('rfpad', rf_rounded - rf) + node.set_attr('bfpad', bf_rounded - bf) + + temp = np.empty([bf_rounded, rf_rounded]) + for i in range(rf_rounded): + for j in range(bf_rounded): + if i < rf and j < bf: + w_index = i + rf * j + temp[j][i] = node.weights['weight'].data[w_index] + else: + temp[j][i] = 0 + node.weights['weight'].data = temp.flatten() + node.weights['weight'].data_length = node.weights['weight'].data.size + + elif isinstance(node, Conv1D): + # (W,C,F) => (F,W,C) + # IMPORTANT - This format only works with im2col convolution + # - Future commits add new optimizers that further transpose THIS format to a format + # useful for Winograd's minimal filtering algorithm + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[2, 0, 1]) + + elif isinstance(node, Conv2D): + # (H,W,C,F) => (F,H,W,C) + # IMPORTANT - This format only works with im2col convolution + # - Future commits add new optimizers that further transpose THIS format to a format + # useful for Winograd's minimal filtering algorithm + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[3, 0, 1, 2]) + + elif isinstance(node, GRU): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) + + elif isinstance(node, SimpleRNN): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) + + elif isinstance(node, LSTM): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) + + for weight_type in ['i', 'f', 'c', 'o']: + node.weights[f'weight_{weight_type}'].data = np.transpose(node.weights[f'weight_{weight_type}'].data) + node.weights[f'recurrent_weight_{weight_type}'].data = np.transpose( + node.weights[f'recurrent_weight_{weight_type}'].data + ) + + else: + raise Exception(f'Unexpected layer {node.class_name} with resource strategy') + node.set_attr('_weights_transposed', True) + return False diff --git a/hls4ml/backends/oneapi/passes/transform_types.py b/hls4ml/backends/oneapi/passes/transform_types.py new file mode 100644 index 0000000000..8a90bad820 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/transform_types.py @@ -0,0 +1,60 @@ +from hls4ml.backends.oneapi.oneapi_types import ( + OneAPIACTypeConverter, + OneAPIArrayVariableConverter, + OneAPIHLSTypeConverter, + OneAPIInplaceArrayVariableConverter, + OneAPIInplaceStreamVariableConverter, + OneAPIInterfaceVariableConverter, + OneAPIStaticWeightVariableConverter, + OneAPIStreamVariableConverter, +) +from hls4ml.model.optimizer import GlobalOptimizerPass +from hls4ml.model.types import InplaceTensorVariable + +# from hls4ml.utils.string_utils import convert_to_pascal_case + 
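As a quick numeric check of the reuse/block-factor padding performed by ApplyResourceStrategy above (the layer size and reuse factor here are hypothetical):

import numpy as np

# Hypothetical Dense layer with 4096 weights and reuse_factor = 3.
n_weights = 4096
rf = 3

bf = int(n_weights / rf)                        # block factor: 1365
rf_rounded = int(pow(2, np.ceil(np.log2(rf))))  # next power of two: 4
bf_rounded = int(pow(2, np.ceil(np.log2(bf))))  # next power of two: 2048

# Padding applies only to layers with more than 2048 weights whose reuse
# factor is not already a power of two.
if n_weights > 2048 and rf_rounded != rf:
    rfpad = rf_rounded - rf  # rf_pad = 1
    bfpad = bf_rounded - bf  # bf_pad = 683

# The flattened weights are then zero-padded into a bf_rounded x rf_rounded
# block, matching reuse_factor_rounded and block_factor_rounded in the
# dense_config templates earlier in this patch.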
+ +class TransformTypes(GlobalOptimizerPass): + def __init__(self): + self.type_converter = OneAPIHLSTypeConverter(precision_converter=OneAPIACTypeConverter()) + self.array_var_converter = OneAPIArrayVariableConverter(type_converter=self.type_converter) + self.inplace_array_var_converter = OneAPIInplaceArrayVariableConverter(type_converter=self.type_converter) + self.interface_var_converter = OneAPIInterfaceVariableConverter(type_converter=self.type_converter) + self.stream_var_converter = OneAPIStreamVariableConverter(type_converter=self.type_converter) + self.inplace_stream_var_converter = OneAPIInplaceStreamVariableConverter(type_converter=self.type_converter) + self.weight_var_converter = OneAPIStaticWeightVariableConverter(type_converter=self.type_converter) + + def transform(self, model, node): + io_type = node.model.config.get_config_value('IOType') + + for out_name, var in node.variables.items(): + if io_type == 'io_stream': + if out_name in node.model.inputs: + new_var = self.interface_var_converter.convert(var, pragma='stream') + elif out_name in node.model.outputs: + new_var = self.interface_var_converter.convert(var, pragma='stream') + elif isinstance(var, InplaceTensorVariable): + new_var = self.inplace_stream_var_converter.convert(var, pragma='stream') + else: + new_var = self.stream_var_converter.convert(var, pragma='stream') + elif io_type == 'io_parallel': + if out_name in node.model.inputs: + new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register') + elif out_name in node.model.outputs: + new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register') + elif isinstance(var, InplaceTensorVariable): + new_var = self.inplace_array_var_converter.convert(var, pragma='') + else: + new_var = self.array_var_converter.convert(var, pragma='intel::fpga_register') + else: + raise Exception(f'Unknown IOType {io_type} in {node.name} ({node.class_name})') + + node.set_attr(out_name, new_var) + + for w_name, weight in node.weights.items(): + new_weight = self.weight_var_converter.convert(weight) + node.set_attr(w_name, new_weight) + + for t_name, type in node.types.items(): + new_type = self.type_converter.convert(type) + node.set_attr(t_name, new_type) diff --git a/hls4ml/backends/quartus/passes/bn_quant.py b/hls4ml/backends/quartus/passes/bn_quant.py new file mode 100644 index 0000000000..3224b00022 --- /dev/null +++ b/hls4ml/backends/quartus/passes/bn_quant.py @@ -0,0 +1,169 @@ +import numpy as np + +from hls4ml.backends.fpga.fpga_layers import BatchNormalizationQuantizedTanh +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import BatchNormalization, register_layer +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import IntegerPrecisionType, NamedType, XnorPrecisionType + +batchnorm_quantized_tanh_config_template = """struct config{index} : nnet::batchnorm_quantized_tanh_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const unsigned n_scale_bias = (n_filt == -1) ?
n_in : n_filt; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; +}};\n""" + +batchnorm_quantized_tanh_function_template = ( + 'nnet::normalize_{quantize}_tanh<{input_t}, {config}>({input}, {output}, {threshold});' +) + +bn_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + + +class BatchNormalizationQuantizedTanhConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh) + self.template = batchnorm_quantized_tanh_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + + return self.template.format(**params) + + +class BatchNormalizationQuantizedTanhFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh, include_header=bn_include_list) + self.template = batchnorm_quantized_tanh_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('quantize') == 2: + params['quantize'] = 'binary' + params['threshold'] = node.get_weights('threshold').name + elif node.get_attr('quantize') == 3: + params['quantize'] = 'ternary' + params['threshold'] = node.get_weights('threshold_hi').name + ', ' + node.get_weights('threshold_lo').name + + return self.template.format(**params) + + +def register_bn_quant(backend): + # Register the layer types to the layer map + register_layer('BatchNormalizationQuantizedTanh', BatchNormalizationQuantizedTanh) + + # Register the optimization passes + backend.register_pass('merge_batch_norm_quantized_tanh', MergeBatchNormAndQuantizedTanh) + backend.register_pass('quantize_dense_output', QuantizeDenseOutput) + + # Register template passes + backend.register_template(BatchNormalizationQuantizedTanhConfigTemplate) + backend.register_template(BatchNormalizationQuantizedTanhFunctionTemplate) + + +class MergeBatchNormAndQuantizedTanh(OptimizerPass): + def match(self, node): + is_match = ( + node.class_name == 'Activation' + and node.get_attr('activation') in ['binary', 'binary_tanh', 'ternary', 'ternary_tanh'] + or node.class_name == 'TernaryTanh' + ) + is_match = is_match and isinstance(node.get_input_node(), BatchNormalization) + return is_match + + def transform(self, model, node): + bn_layer = node.get_input_node() + # Make a new layer with the new attributes + quantize = 0 + if 'binary' in node.get_attr('activation'): + quantize = 2 + if 'ternary' in node.get_attr('activation'): + quantize = 3 + attrs = { + 'name': bn_layer.get_attr('name'), + 'original_name': bn_layer.get_attr('name'), + 'class_name': 'BatchNormalizationQuantizedTanh', + 'n_in': bn_layer.get_attr('n_in'), + 'n_out': bn_layer.get_attr('n_in'), + 'n_filt': bn_layer.get_attr('n_filt'), + 'quantize': quantize, + 'trace': bn_layer.get_attr('trace'), + } + bnbt_layer = model.make_node(BatchNormalizationQuantizedTanh, 'bnbt_' + bn_layer.name, attrs, bn_layer.inputs) + bnbt_layer.set_thresholds( + bn_layer.get_weights('scale').data, bn_layer.get_weights('bias').data, node.get_attr('threshold', 0.5) + ) + # Remove the BatchNormalization layer + model.remove_node(bn_layer, rewire=True) + # Replace the old Activation layer with this one + model.replace_node(node, bnbt_layer) + + return True + + +class QuantizeDenseOutput(OptimizerPass): + def match(self, node): + is_dense = node.class_name == 'Dense' + input_node = node.get_input_node() + is_input_bnqt = input_node is not None and 
input_node.class_name == 'BatchNormalizationQuantizedTanh' + quantizer = node.get_attr('weight_quantizer') + is_binary_ternary = quantizer is not None and ( + quantizer.__class__.__name__ == 'BinaryQuantizer' or quantizer.__class__.__name__ == 'TernaryQuantizer' + ) + return is_dense and is_input_bnqt and is_binary_ternary + + def transform(self, model, node): + # Compute the required precision and update the variables + # Number of bits for output is log2 of number of input nodes + # Since this is the number of uint<1>'s which are summed + nbits = int(np.ceil(np.log2(node.attributes['n_in'])) + 2) + out_type = IntegerPrecisionType(width=nbits) + accum_t = NamedType(f'layer{node.index}_accum_t', out_type) + node.set_attr('accum_t', accum_t) + out_var = node.get_output_variable() + out_var.type.precision = out_type + + quantized_data = None + quantized_precision = None + quantizer = node.get_attr('weight_quantizer') + if quantizer.__class__.__name__ == 'BinaryQuantizer': + quantized_precision = XnorPrecisionType() + elif quantizer.__class__.__name__ == 'TernaryQuantizer': + quantized_precision = IntegerPrecisionType(width=2) + else: + print(f'WARNING: Unknown quantizer - {quantizer.__class__.__name__}. Bailing out') + return False + quantizer.bits = quantized_precision.width + quantizer.hls_type = quantized_precision + quantized_data = quantizer(node.weights['weight'].data) + + weights = node.weights['weight'] + weights.data = quantized_data + weights.type.name = f'weight{node.index}_t' + weights.update_precision(quantized_precision) + + bias = node.weights['bias'] + bias.data = np.zeros(shape=(node.get_attr('n_out'))) + bias.type.name = f'bias{node.index}_t' + bias.nzeros = 0 + bias.update_precision(quantized_precision) + + # If followed by the BatchNormalizationBinaryTanh, update its input + # Also requantise the weights + bd_out_nodes = node.get_output_nodes() + for out_node in bd_out_nodes: + if isinstance(out_node, BatchNormalizationQuantizedTanh): + var_names = [] + if quantizer.__class__.__name__ == 'BinaryQuantizer': + var_names.append('threshold') + elif quantizer.__class__.__name__ == 'TernaryQuantizer': + var_names.append('threshold_hi') + var_names.append('threshold_lo') + for var_name in var_names: + threshold_var = out_node.weights[var_name] + threshold_var.update_precision(out_type) + threshold_var.data = np.floor(threshold_var.data) + + return False diff --git a/hls4ml/backends/quartus/passes/convolution_templates.py b/hls4ml/backends/quartus/passes/convolution_templates.py index 75f8ca6871..d1c36fe1b1 100644 --- a/hls4ml/backends/quartus/passes/convolution_templates.py +++ b/hls4ml/backends/quartus/passes/convolution_templates.py @@ -46,7 +46,7 @@ static const unsigned dilation = {dilation}; static const unsigned reuse_factor = {reuse}; - static const unsigned parallelisation_factor = {parallelization}; + static const unsigned parallelization_factor = {parallelization}; static const bool store_weights_in_bram = false; static const nnet::conv1d_implementation implementation = nnet::conv1d_implementation::{implementation}; @@ -127,7 +127,7 @@ def format(self, node): static const unsigned stride_width = {stride_width}; static const unsigned reuse_factor = {reuse}; - static const unsigned parallelisation_factor = {parallelization}; + static const unsigned parallelization_factor = {parallelization}; static const bool store_weights_in_bram = false; static const nnet::conv2d_implementation implementation = nnet::conv2d_implementation::{implementation}; diff --git 
a/hls4ml/backends/template.py b/hls4ml/backends/template.py index 9638b53add..f7f6fe313a 100644 --- a/hls4ml/backends/template.py +++ b/hls4ml/backends/template.py @@ -2,6 +2,14 @@ class Template(OptimizerPass): + """The Template base class; it should not be instantiated directly + + Args: + name (str): Name of the template. + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. + attribute_name (str): The name of the attribute this template provides + """ + def __init__(self, name, layer_class, attribute_name): self.name = name self.layer_class = layer_class @@ -36,6 +44,12 @@ def _default_params(self, node): class LayerConfigTemplate(Template): + """Base class for layer config templates: provides the 'config_cpp' attribute + + Args: + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. + """ + def __init__(self, layer_class): if isinstance(layer_class, (list, tuple, set)): name = '_'.join([cls.__name__.lower() for cls in layer_class]) @@ -53,6 +67,13 @@ def _default_config_params(self, layer): class FunctionCallTemplate(Template): + """Base class for function call templates: provides the 'function_cpp' attribute + + Args: + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. + include_header (list, tuple, or set of str, or None): The list of needed include files + """ + def __init__(self, layer_class, include_header=None): if isinstance(layer_class, (list, tuple, set)): name = '_'.join([cls.__name__.lower() for cls in layer_class]) diff --git a/hls4ml/backends/vivado/passes/bn_quant.py b/hls4ml/backends/vivado/passes/bn_quant.py new file mode 100644 index 0000000000..3224b00022 --- /dev/null +++ b/hls4ml/backends/vivado/passes/bn_quant.py @@ -0,0 +1,169 @@ +import numpy as np + +from hls4ml.backends.fpga.fpga_layers import BatchNormalizationQuantizedTanh +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import BatchNormalization, register_layer +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import IntegerPrecisionType, NamedType, XnorPrecisionType + +batchnorm_quantized_tanh_config_template = """struct config{index} : nnet::batchnorm_quantized_tanh_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const unsigned n_scale_bias = (n_filt == -1) ?
n_in : n_filt; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; +}};\n""" + +batchnorm_quantized_tanh_function_template = ( + 'nnet::normalize_{quantize}_tanh<{input_t}, {config}>({input}, {output}, {threshold});' +) + +bn_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + + +class BatchNormalizationQuantizedTanhConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh) + self.template = batchnorm_quantized_tanh_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + + return self.template.format(**params) + + +class BatchNormalizationQuantizedTanhFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh, include_header=bn_include_list) + self.template = batchnorm_quantized_tanh_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('quantize') == 2: + params['quantize'] = 'binary' + params['threshold'] = node.get_weights('threshold').name + elif node.get_attr('quantize') == 3: + params['quantize'] = 'ternary' + params['threshold'] = node.get_weights('threshold_hi').name + ', ' + node.get_weights('threshold_lo').name + + return self.template.format(**params) + + +def register_bn_quant(backend): + # Register the layer types to the layer map + register_layer('BatchNormalizationQuantizedTanh', BatchNormalizationQuantizedTanh) + + # Register the optimization passes + backend.register_pass('merge_batch_norm_quantized_tanh', MergeBatchNormAndQuantizedTanh) + backend.register_pass('quantize_dense_output', QuantizeDenseOutput) + + # Register template passes + backend.register_template(BatchNormalizationQuantizedTanhConfigTemplate) + backend.register_template(BatchNormalizationQuantizedTanhFunctionTemplate) + + +class MergeBatchNormAndQuantizedTanh(OptimizerPass): + def match(self, node): + is_match = ( + node.class_name == 'Activation' + and node.get_attr('activation') in ['binary', 'binary_tanh', 'ternary', 'ternary_tanh'] + or node.class_name == 'TernaryTanh' + ) + is_match = is_match and isinstance(node.get_input_node(), BatchNormalization) + return is_match + + def transform(self, model, node): + bn_layer = node.get_input_node() + # Make a new layer with the new attributes + quantize = 0 + if 'binary' in node.get_attr('activation'): + quantize = 2 + if 'ternary' in node.get_attr('activation'): + quantize = 3 + attrs = { + 'name': bn_layer.get_attr('name'), + 'original_name': bn_layer.get_attr('name'), + 'class_name': 'BatchNormalizationQuantizedTanh', + 'n_in': bn_layer.get_attr('n_in'), + 'n_out': bn_layer.get_attr('n_in'), + 'n_filt': bn_layer.get_attr('n_filt'), + 'quantize': quantize, + 'trace': bn_layer.get_attr('trace'), + } + bnbt_layer = model.make_node(BatchNormalizationQuantizedTanh, 'bnbt_' + bn_layer.name, attrs, bn_layer.inputs) + bnbt_layer.set_thresholds( + bn_layer.get_weights('scale').data, bn_layer.get_weights('bias').data, node.get_attr('threshold', 0.5) + ) + # Remove the BatchNormalization layer + model.remove_node(bn_layer, rewire=True) + # Replace the old Activation layer with this one + model.replace_node(node, bnbt_layer) + + return True + + +class QuantizeDenseOutput(OptimizerPass): + def match(self, node): + is_dense = node.class_name == 'Dense' + input_node = node.get_input_node() + is_input_bnqt = input_node is not None and 
input_node.class_name == 'BatchNormalizationQuantizedTanh' + quantizer = node.get_attr('weight_quantizer') + is_binary_ternary = quantizer is not None and ( + quantizer.__class__.__name__ == 'BinaryQuantizer' or quantizer.__class__.__name__ == 'TernaryQuantizer' + ) + return is_dense and is_input_bnqt and is_binary_ternary + + def transform(self, model, node): + # Compute the required precision and update the variables + # Number of bits for output is log2 of number of input nodes + # Since this is the number of uint<1>'s which are summed + nbits = int(np.ceil(np.log2(node.attributes['n_in'])) + 2) + out_type = IntegerPrecisionType(width=nbits) + accum_t = NamedType(f'layer{node.index}_accum_t', out_type) + node.set_attr('accum_t', accum_t) + out_var = node.get_output_variable() + out_var.type.precision = out_type + + quantized_data = None + quantized_precision = None + quantizer = node.get_attr('weight_quantizer') + if quantizer.__class__.__name__ == 'BinaryQuantizer': + quantized_precision = XnorPrecisionType() + elif quantizer.__class__.__name__ == 'TernaryQuantizer': + quantized_precision = IntegerPrecisionType(width=2) + else: + print(f'WARNING: Unknown quantizer - {quantizer.__class__.__name__}. Bailing out') + return False + quantizer.bits = quantized_precision.width + quantizer.hls_type = quantized_precision + quantized_data = quantizer(node.weights['weight'].data) + + weights = node.weights['weight'] + weights.data = quantized_data + weights.type.name = f'weight{node.index}_t' + weights.update_precision(quantized_precision) + + bias = node.weights['bias'] + bias.data = np.zeros(shape=(node.get_attr('n_out'))) + bias.type.name = f'bias{node.index}_t' + bias.nzeros = 0 + bias.update_precision(quantized_precision) + + # If followed by the BatchNormalizationBinaryTanh, update its input + # Also requantise the weights + bd_out_nodes = node.get_output_nodes() + for out_node in bd_out_nodes: + if isinstance(out_node, BatchNormalizationQuantizedTanh): + var_names = [] + if quantizer.__class__.__name__ == 'BinaryQuantizer': + var_names.append('threshold') + elif quantizer.__class__.__name__ == 'TernaryQuantizer': + var_names.append('threshold_hi') + var_names.append('threshold_lo') + for var_name in var_names: + threshold_var = out_node.weights[var_name] + threshold_var.update_precision(out_type) + threshold_var.data = np.floor(threshold_var.data) + + return False diff --git a/hls4ml/converters/keras/core.py b/hls4ml/converters/keras/core.py index aff15808ad..67798ae7b1 100644 --- a/hls4ml/converters/keras/core.py +++ b/hls4ml/converters/keras/core.py @@ -62,6 +62,10 @@ def parse_activation_layer(keras_layer, input_names, input_shapes, data_reader): if layer['class_name'] != 'Activation': layer['activation'] = layer['class_name'] + + if layer['activation'] == 'elu': + layer['class_name'] = 'ELU' # always use ELU type for elu, even if passed as activation + if layer['class_name'] == 'LeakyReLU': layer['activ_param'] = keras_layer['config'].get('alpha', 0.3) elif layer['class_name'] == 'ThresholdedReLU': diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 1ceb6456b8..8054f41ee6 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -884,7 +884,7 @@ class HardActivation(Activation): def initialize(self): super().initialize() slope_prec = self.get_attr('slope_prec', FixedPrecisionType(width=16, integer=0, signed=False)) - shift_prec = self.get_attr('shift_prec', FixedPrecisionType(width=1, integer=0, signed=False)) + shift_prec = self.get_attr('shift_prec', 
FixedPrecisionType(width=2, integer=0, signed=False))
         index = self.get_attr('index')
         slope_t = NamedType(f'slope{index}_t', precision=slope_prec)
         shift_t = NamedType(f'shift{index}_t', precision=shift_prec)
diff --git a/hls4ml/model/optimizer/passes/stamp.py b/hls4ml/model/optimizer/passes/stamp.py
index f29ae2a186..84bb466aa2 100644
--- a/hls4ml/model/optimizer/passes/stamp.py
+++ b/hls4ml/model/optimizer/passes/stamp.py
@@ -1,3 +1,5 @@
+import uuid
+
 from hls4ml.model.optimizer import ModelOptimizerPass
@@ -9,11 +11,11 @@ def transform(self, model):
         def _make_stamp():
             """Create a unique identifier for the generated code. This identifier is used to
             compile a unique library and link it with python."""
-            from random import choice
-            from string import hexdigits
             length = 8
-            return ''.join(choice(hexdigits) for m in range(length))
+
+            stamp = uuid.uuid4()
+            return str(stamp)[-length:]

         model.config.config['Stamp'] = _make_stamp()
diff --git a/hls4ml/optimization/dsp_aware_pruning/objectives/vivado_objectives.py b/hls4ml/optimization/dsp_aware_pruning/objectives/vivado_objectives.py
index 798542cfc0..9374f4aef8 100644
--- a/hls4ml/optimization/dsp_aware_pruning/objectives/vivado_objectives.py
+++ b/hls4ml/optimization/dsp_aware_pruning/objectives/vivado_objectives.py
@@ -32,7 +32,7 @@ def layer_resources(self, layer_attributes):
         if not layer_attributes.weight_shape or layer_attributes.args['hls4ml_attributes'].weight_precision.width < 9:
             return [0]
         else:
-            # TOOD - Extend for parallelisation factor
+            # TODO - Extend for parallelization factor
             return [np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor]

     @classmethod
@@ -117,7 +117,7 @@ def layer_resources(self, layer_attributes):
         if not layer_attributes.weight_shape:
             return [0]

-        # TOOD - Extend for parallelisation factor
+        # TODO - Extend for parallelization factor
         if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
             return [
                 int(np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor),
diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt
new file mode 100644
index 0000000000..e2b386d70d
--- /dev/null
+++ b/hls4ml/templates/oneapi/CMakeLists.txt
@@ -0,0 +1,338 @@
+# Direct CMake to use icpx rather than the default C++ compiler/linker on Linux
+# and icx-cl on Windows
+if(UNIX)
+    set(CMAKE_CXX_COMPILER icpx)
+else() # Windows
+    include (CMakeForceCompiler)
+    CMAKE_FORCE_CXX_COMPILER (icx-cl IntelDPCPP)
+    include (Platform/Windows-Clang)
+endif()
+
+cmake_minimum_required (VERSION 3.7.2)
+
+project(myproject CXX)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+###############################################################################
+### Customize these build variables
+###############################################################################
+set(SOURCE_FILES src/firmware/myproject.cpp src/myproject_test.cpp)
+set(LIBRARY_FILES src/firmware/myproject.cpp src/myproject_bridge.cpp)
+set(LIB_STAMP mystamp)
+set(TARGET_NAME myproject)
+set(LIBRARY_NAME myproject-${LIB_STAMP})
+
+# Use cmake -DFPGA_DEVICE=<board-support-package>:<board-variant> to choose a
+# different device.
Here are a few device examples (this list is not +# exhaustive): +# intel_s10sx_pac:pac_s10 +# intel_s10sx_pac:pac_s10_usm +# intel_a10gx_pac:pac_a10 +# Note that depending on your installation, you may need to specify the full +# path to the board support package (BSP), this usually is in your install +# folder. +# +# You can also specify a device family (E.g. "Arria10" or "Stratix10") or a +# specific part number (E.g. "10AS066N3F40E2SG") to generate a standalone IP. +if(NOT DEFINED FPGA_DEVICE) + set(FPGA_DEVICE "Arria10") +endif() + +# Use cmake -DUSER_FPGA_FLAGS= to set extra flags for FPGA backend +# compilation. +set(USER_FPGA_FLAGS -Wno-unused-label ${USER_FPGA_FLAGS}) + +# Use cmake -DUSER_FLAGS= to set extra flags for general compilation. +set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS}) + +# Use cmake -DUSER_INCLUDE_PATHS= to set extra paths for general +# compilation. +set(USER_INCLUDE_PATHS src;src/firmware;${USER_INCLUDE_PATHS}) + +############################################################################### +### no changes after here +############################################################################### + +# Print the device being used for the compiles +message(STATUS "Configuring the design to run on FPGA board ${FPGA_DEVICE}") + +# Set the names of the makefile targets to be generated by cmake +set(EMULATOR_TARGET fpga_emu) +set(SIMULATOR_TARGET fpga_sim) +set(REPORT_TARGET report) +set(FPGA_TARGET fpga) +set(IP_EXPORT_TARGET fpga_ip_export) +set(LIBRARY_TARGET lib) + +# Set the names of the generated files per makefile target +set(EMULATOR_OUTPUT_NAME ${TARGET_NAME}.${EMULATOR_TARGET}) +set(SIMULATOR_OUTPUT_NAME ${TARGET_NAME}.${SIMULATOR_TARGET}) +set(REPORT_OUTPUT_NAME ${TARGET_NAME}.${REPORT_TARGET}) +set(FPGA_OUTPUT_NAME ${TARGET_NAME}.${FPGA_TARGET}) +set(IP_EXPORT_OUTPUT_NAME ${TARGET_NAME}.${IP_EXPORT_TARGET}) + +message(STATUS "Additional USER_FPGA_FLAGS=${USER_FPGA_FLAGS}") +message(STATUS "Additional USER_FLAGS=${USER_FLAGS}") + +include_directories(${USER_INCLUDE_PATHS}) +message(STATUS "Additional USER_INCLUDE_PATHS=${USER_INCLUDE_PATHS}") + +link_directories(${USER_LIB_PATHS}) +message(STATUS "Additional USER_LIB_PATHS=${USER_LIB_PATHS}") + +link_libraries(${USER_LIBS}) +message(STATUS "Additional USER_LIBS=${USER_LIBS}") + +if(WIN32) + # add qactypes for Windows + set(QACTYPES "-Qactypes") + # This is a Windows-specific flag that enables exception handling in host code + set(WIN_FLAG "/EHsc") +else() + # add qactypes for Linux + set(QACTYPES "-qactypes") +endif() + +set(COMMON_COMPILE_FLAGS -fsycl -fintelfpga -Wall ${WIN_FLAG} ${QACTYPES} ${USER_FLAGS}) +# for debugging need to do this. Not sure why +# set(COMMON_LINK_FLAGS -L/opt/intel/oneapi/compiler/2024.0/opt/oclfpga/host/linux64/lib -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) +set(COMMON_LINK_FLAGS -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) + +# A SYCL ahead-of-time (AoT) compile processes the device code in two stages. +# 1. The "compile" stage compiles the device code to an intermediate +# representation (SPIR-V). +# 2. The "link" stage invokes the compiler's FPGA backend before linking. For +# this reason, FPGA backend flags must be passed as link flags in CMake. 
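+#
+# Hedged illustration only (nothing below consumes these lines): with the Linux
+# defaults above, the two stages for the "fpga" flow reduce to commands of
+# roughly this shape, the -Xs* backend flags appearing only at the link step:
+#   icpx -fsycl -fintelfpga -Wall -qactypes -DFPGA_HARDWARE -c src/firmware/myproject.cpp -o myproject.o
+#   icpx -fsycl -fintelfpga -qactypes -Xshardware -Xstarget=Arria10 myproject.o -o myproject.fpga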
+set(EMULATOR_COMPILE_FLAGS -DFPGA_EMULATOR) +set(LIBRARY_COMPILE_FLAGS -DFPGA_EMULATOR) +set(EMULATOR_LINK_FLAGS ) +set(LIBRARY_LINK_FLAGS -L$ENV{FPGA_VARS_DIR}/host/linux64/lib) +set(REPORT_COMPILE_FLAGS -DFPGA_HARDWARE) +set(REPORT_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -fsycl-link=early) +set(SIMULATOR_COMPILE_FLAGS -Xssimulation -DFPGA_SIMULATOR) +set(SIMULATOR_LINK_FLAGS -Xssimulation -Xsghdl -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -reuse-exe=${CMAKE_BINARY_DIR}/${SIMULATOR_OUTPUT_NAME}) +set(FPGA_COMPILE_FLAGS -DFPGA_HARDWARE) +set(FPGA_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -reuse-exe=${CMAKE_BINARY_DIR}/${FPGA_OUTPUT_NAME}) +# get rid of this once host pipes work properly +set(IP_EXPORT_COMPILE_FLAGS -DFPGA_HARDWARE) +set(IP_EXPORT_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -fsycl-link=early -fsycl-device-code-split=per_kernel) + +############################################################################### +### FPGA Emulator library +############################################################################### +add_library(${LIBRARY_TARGET} SHARED ${LIBRARY_FILES}) +target_compile_options(${LIBRARY_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${LIBRARY_TARGET} PRIVATE ${LIBRARY_COMPILE_FLAGS}) +target_link_libraries(${LIBRARY_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${LIBRARY_TARGET} ${LIBRARY_LINK_FLAGS}) +set_target_properties(${LIBRARY_TARGET} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME}) + +############################################################################### +### FPGA Emulator +############################################################################### +add_executable(${EMULATOR_TARGET} ${SOURCE_FILES}) +target_compile_options(${EMULATOR_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${EMULATOR_TARGET} PRIVATE ${EMULATOR_COMPILE_FLAGS}) +target_link_libraries(${EMULATOR_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${EMULATOR_TARGET} ${EMULATOR_LINK_FLAGS}) +set_target_properties(${EMULATOR_TARGET} PROPERTIES OUTPUT_NAME ${EMULATOR_OUTPUT_NAME}) + +############################################################################### +### FPGA Simulator +############################################################################### +add_executable(${SIMULATOR_TARGET} ${SOURCE_FILES}) +target_compile_options(${SIMULATOR_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${SIMULATOR_TARGET} PRIVATE ${SIMULATOR_COMPILE_FLAGS}) +target_link_libraries(${SIMULATOR_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${SIMULATOR_TARGET} ${SIMULATOR_LINK_FLAGS}) +set_target_properties(${SIMULATOR_TARGET} PROPERTIES OUTPUT_NAME ${SIMULATOR_OUTPUT_NAME}) + +############################################################################### +### Generate Report +############################################################################### +add_executable(${REPORT_TARGET} ${SOURCE_FILES}) +target_compile_options(${REPORT_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${REPORT_TARGET} PRIVATE ${REPORT_COMPILE_FLAGS}) + +# The report target does not need the QACTYPES flag at link stage +set(MODIFIED_COMMON_LINK_FLAGS_REPORT ${COMMON_LINK_FLAGS}) +list(REMOVE_ITEM MODIFIED_COMMON_LINK_FLAGS_REPORT ${QACTYPES}) + +target_link_libraries(${REPORT_TARGET} ${MODIFIED_COMMON_LINK_FLAGS_REPORT}) +target_link_libraries(${REPORT_TARGET} ${REPORT_LINK_FLAGS}) +set_target_properties(${REPORT_TARGET} PROPERTIES OUTPUT_NAME 
${REPORT_OUTPUT_NAME}) + +############################################################################### +### FPGA Hardware +############################################################################### +add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILES}) +target_compile_options(${FPGA_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${FPGA_TARGET} PRIVATE ${FPGA_COMPILE_FLAGS}) +target_link_libraries(${FPGA_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${FPGA_TARGET} ${FPGA_LINK_FLAGS}) +set_target_properties(${FPGA_TARGET} PROPERTIES OUTPUT_NAME ${FPGA_OUTPUT_NAME}) + +############################################################################### +### FPGA IP Export (only necessary until native host pipes) +############################################################################### +add_executable(${IP_EXPORT_TARGET} ${SOURCE_FILES}) +target_compile_options(${IP_EXPORT_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${IP_EXPORT_TARGET} PRIVATE ${IP_EXPORT_COMPILE_FLAGS}) + +# The ip export target does not need the QACTYPES flag at link stage +set(MODIFIED_COMMON_LINK_FLAGS_EXPORT ${COMMON_LINK_FLAGS}) +list(REMOVE_ITEM MODIFIED_COMMON_LINK_FLAGS_EXPORT ${QACTYPES}) + +target_link_libraries(${IP_EXPORT_TARGET} ${MODIFIED_COMMON_LINK_FLAGS_EXPORT}) +target_link_libraries(${IP_EXPORT_TARGET} ${IP_EXPORT_LINK_FLAGS}) +set_target_properties(${IP_EXPORT_TARGET} PROPERTIES OUTPUT_NAME ${IP_EXPORT_OUTPUT_NAME}) + +############################################################################### +### This part only manipulates cmake variables to print the commands to the user +############################################################################### + +# set the correct object file extension depending on the target platform +if(WIN32) + set(OBJ_EXTENSION "obj") +else() + set(OBJ_EXTENSION "o") +endif() + +# Set the source file names in a string +set(SOURCE_FILE_NAME "${SOURCE_FILES}") + +function(getCompileCommands common_compile_flags special_compile_flags common_link_flags special_link_flags target output_name) + + set(file_names ${SOURCE_FILE_NAME}) + set(COMPILE_COMMAND ) + set(LINK_COMMAND ) + + foreach(source ${file_names}) + # Get the relative path to the source and object files + file(RELATIVE_PATH CURRENT_SOURCE_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/${source}) + file(RELATIVE_PATH OBJ_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${target}.dir/${source}.${OBJ_EXTENSION}) + + # Creating a string that contains the compile command + # Start by the compiler invocation + set(COMPILE_COMMAND "${COMPILE_COMMAND}${CMAKE_CXX_COMPILER}") + + # Add all the potential includes + foreach(INCLUDE ${USER_INCLUDE_PATHS}) + if(NOT IS_ABSOLUTE ${INCLUDE}) + file(RELATIVE_PATH INCLUDE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/${INCLUDE}) + endif() + set(COMPILE_COMMAND "${COMPILE_COMMAND} -I${INCLUDE}") + endforeach() + + # Add all the common compile flags + foreach(FLAG ${common_compile_flags}) + set(COMPILE_COMMAND "${COMPILE_COMMAND} ${FLAG}") + endforeach() + + # Add all the specific compile flags + foreach(FLAG ${special_compile_flags}) + set(COMPILE_COMMAND "${COMPILE_COMMAND} ${FLAG}") + endforeach() + + # Get the location of the object file + file(RELATIVE_PATH OBJ_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${target}.dir/${source}.${OBJ_EXTENSION}) + + # Add the source file and the output file + set(COMPILE_COMMAND "${COMPILE_COMMAND} -c 
${CURRENT_SOURCE_FILE} -o ${OBJ_FILE}\n") + endforeach() + + set(COMPILE_COMMAND "${COMPILE_COMMAND}" PARENT_SCOPE) + + # Creating a string that contains the link command + # Start by the compiler invocation + set(LINK_COMMAND "${LINK_COMMAND}${CMAKE_CXX_COMPILER}") + + # Add all the common link flags + foreach(FLAG ${common_link_flags}) + set(LINK_COMMAND "${LINK_COMMAND} ${FLAG}") + endforeach() + + # Add all the specific link flags + foreach(FLAG ${special_link_flags}) + set(LINK_COMMAND "${LINK_COMMAND} ${FLAG}") + endforeach() + + # Add the output file + set(LINK_COMMAND "${LINK_COMMAND} -o ${output_name}") + + foreach(source ${file_names}) + # Get the relative path to the source and object files + file(RELATIVE_PATH OBJ_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${target}.dir/${source}.${OBJ_EXTENSION}) + + # Add the source file and the output file + set(LINK_COMMAND "${LINK_COMMAND} ${OBJ_FILE}") + endforeach() + + # Add all the potential library paths + foreach(LIB_PATH ${USER_LIB_PATHS}) + if(NOT IS_ABSOLUTE ${LIB_PATH}) + file(RELATIVE_PATH LIB_PATH ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/${LIB_PATH}) + endif() + if(NOT WIN32) + set(LINK_COMMAND "${LINK_COMMAND} -L${LIB_PATH}") + else() + set(LINK_COMMAND "${LINK_COMMAND} -L${LIB_PATH} -Wl,-rpath,${LIB_PATH}") + endif() + endforeach() + + # Add all the potential includes + foreach(LIB ${USER_LIBS}) + set(LINK_COMMAND "${LINK_COMMAND} -l${LIB}") + endforeach() + + set(LINK_COMMAND "${LINK_COMMAND}" PARENT_SCOPE) + +endfunction() + +# Windows executable is going to have the .exe extension +if(WIN32) + set(EXECUTABLE_EXTENSION ".exe") +endif() + +# Display the compile instructions in the emulation flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${EMULATOR_COMPILE_FLAGS}" "${COMMON_LINK_FLAGS}" "${EMULATOR_LINK_FLAGS}" "${EMULATOR_TARGET}" "${EMULATOR_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayEmulationCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${EMULATOR_TARGET} displayEmulationCompileCommands) + +# Display the compile instructions in the simulation flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${SIMULATOR_COMPILE_FLAGS}" "${COMMON_LINK_FLAGS}" "${SIMULATOR_LINK_FLAGS}" "${SIMULATOR_TARGET}" "${SIMULATOR_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displaySimulationCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${SIMULATOR_TARGET} displaySimulationCompileCommands) + +# Display the compile instructions in the report flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${REPORT_COMPILE_FLAGS}" "${MODIFIED_COMMON_LINK_FLAGS_REPORT}" "${REPORT_LINK_FLAGS}" "${REPORT_TARGET}" "${REPORT_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayReportCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${REPORT_TARGET} displayReportCompileCommands) + +# Display the compile instructions in the IP export flow (Remove after native host pipes work properly) +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${IP_EXPORT_COMPILE_FLAGS}" "${MODIFIED_COMMON_LINK_FLAGS_EXPORT}" "${IP_EXPORT_LINK_FLAGS}" "${IP_EXPORT_TARGET}" 
"${IP_EXPORT_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayExportCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${IP_EXPORT_TARGET} displayExportCompileCommands) + +# Display the compile instructions in the fpga flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${FPGA_COMPILE_FLAGS}" "${COMMON_LINK_FLAGS}" "${FPGA_LINK_FLAGS}" "${FPGA_TARGET}" "${FPGA_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayFPGACompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${FPGA_TARGET} displayFPGACompileCommands) diff --git a/hls4ml/templates/oneapi/exception_handler.hpp b/hls4ml/templates/oneapi/exception_handler.hpp new file mode 100644 index 0000000000..bb7976f61f --- /dev/null +++ b/hls4ml/templates/oneapi/exception_handler.hpp @@ -0,0 +1,21 @@ +#ifndef __EXCEPTIONHANDLER_HPP__ +#define __EXCEPTIONHANDLER_HPP__ +#include +#include +#include + +namespace fpga_tools { + +void exception_handler(sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } catch (sycl::exception const &e) { + std::cout << "Caught asynchronous SYCL exception:\n" << e.what() << std::endl; + } + } +} + +} // namespace fpga_tools + +#endif //__EXCEPTIONHANDLER_HPP__ diff --git a/hls4ml/templates/oneapi/firmware/defines.h b/hls4ml/templates/oneapi/firmware/defines.h new file mode 100644 index 0000000000..05de507dcd --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/defines.h @@ -0,0 +1,20 @@ +#ifndef DEFINES_H_ +#define DEFINES_H_ + +#include +#include +#include +#include + +// Include nnet::array - a custom array-like struct, mainly used with io_stream +#include "nnet_utils/nnet_types.h" + +// hls-fpga-machine-learning insert numbers + +// hls-fpga-machine-learning insert layer-precision + +#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) +#define MIN(n, d) (n > d ? d : n) +#define MAX(n, d) (n < d ? 
d : n) + +#endif diff --git a/hls4ml/templates/oneapi/firmware/myproject.cpp b/hls4ml/templates/oneapi/firmware/myproject.cpp new file mode 100644 index 0000000000..06e7d3fe37 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/myproject.cpp @@ -0,0 +1,24 @@ +#include "myproject.h" +#include "parameters.h" +#include + +// hls-fpga-machine-learning insert weights + +// The inter-task pipes need to be declared in the global scope +// hls-fpga-machine-learning insert inter-task pipes + +using sycl::ext::intel::experimental::task_sequence; + +void MyProject::operator()() const { + // **************************************** + // NETWORK INSTANTIATION + // **************************************** + + // hls-fpga-machine-learning read in + + // hls-fpga-machine-learning declare task sequences + + // hls-fpga-machine-learning insert layers + + // hls-fpga-machine-learning return +} diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h new file mode 100644 index 0000000000..082ae5dc8c --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/myproject.h @@ -0,0 +1,29 @@ +#ifndef MYPROJECT_H_ +#define MYPROJECT_H_ + +#include "defines.h" + +// This file defines the interface to the kernel + +// currently this is fixed +using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(sycl::ext::intel::experimental::ready_latency<0>)); + +// Need to declare the input and output pipes + +// hls-fpga-machine-learning insert inputs +// hls-fpga-machine-learning insert outputs + +class MyProjectID; + +struct MyProject { + + // kernel property method to config invocation interface + auto get(sycl::ext::oneapi::experimental::properties_tag) { + return sycl::ext::oneapi::experimental::properties{sycl::ext::intel::experimental::streaming_interface<>, + sycl::ext::intel::experimental::pipelined<>}; + } + + SYCL_EXTERNAL void operator()() const; +}; + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h new file mode 100644 index 0000000000..ab1874ec10 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h @@ -0,0 +1,499 @@ +#ifndef NNET_ACTIVATION_H_ +#define NNET_ACTIVATION_H_ + +#include "nnet_common.h" + +namespace nnet { + +struct activ_config { + // IO size + static constexpr unsigned n_in = 10; + + // Internal info + static constexpr unsigned table_size = 512; + + // Resource reuse info + static constexpr unsigned io_type = io_parallel; + static constexpr unsigned reuse_factor = 1; + + // Internal data type definitions + typedef ac_fixed<16, 8> table_t; +}; + +// ************************************************* +// LINEAR Activation -- See Issue 53 +// ************************************************* +template void linear(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + res[ii] = datareg; + } +} + +// ************************************************* +// RELU Activation +// ************************************************* +template void relu(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +template void relu_max(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg < 0) + res[ii] = 0; + else if (datareg > MAX_INT) + 
res[ii] = MAX_INT;
+        else
+            res[ii] = datareg;
+    }
+}
+
+template void relu6(const data_T &data, res_T &res) {
+    relu_max(data, res);
+}
+
+template void relu1(const data_T &data, res_T &res) {
+    relu_max(data, res);
+}
+
+// *************************************************
+// Sigmoid Activation
+// *************************************************
+template void sigmoid(const data_T &data, res_T &res) {
+    static constexpr int MAX_VALUE = 8;
+#include "activation_tables/sigmoid_table.tb"
+    #pragma unroll
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        [[intel::fpga_register]] typename data_T::value_type absoluteValue;
+        [[intel::fpga_register]] typename res_T::value_type temp2;
+        if (data[ii] < 0) {
+            absoluteValue = -data[ii];
+        } else {
+            absoluteValue = data[ii];
+        }
+        int index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int();
+        if (absoluteValue > MAX_VALUE)
+            index = CONFIG_T::table_size - 1;
+        temp2 = static_cast(sigmoid_table[index]);
+        if (data[ii] < 0) {
+            res[ii] = 1 - temp2;
+        } else {
+            res[ii] = temp2;
+        }
+    }
+}
+
+// *************************************************
+// Softmax Activation
+// *************************************************
+
+enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 };
+
+template inline unsigned softmax_stable_idx_from_real_val(const data_T x) {
+    // Number of address bits for table
+    static constexpr int N = ceillog2::val;
+
+    // Slice the top N bits of the input
+    [[intel::fpga_register]] ac_int y = x.template slc(x.width - N - 1);
+    // If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness
+    if (x != 0 && y == 0)
+        y[0] = 1;
+    return y.to_uint();
+}
+
+template inline unsigned softmax_latency_idx_from_real_val(const data_T x) {
+    // Number of address bits for table
+    static constexpr int N = ceillog2::val;
+
+    // Slice the top N bits of the input
+    [[intel::fpga_register]] ac_int y = x.template slc(x.width - N);
+    return y.to_uint();
+}
+
+template void softmax_stable(const data_T &data, res_T &res) {
+// Look-up tables
+#include "activation_tables/exp_table.tb"
+#include "activation_tables/invert_table.tb"
+
+    // Find maximum
+    Op_max op_max;
+    [[intel::fpga_register]] auto x_max =
+        reduce>(data.data(), op_max);
+
+    // For the diffs, use the same type as the input but force rounding and saturation
+    [[intel::fpga_register]] ac_fixed
+        d_xi_xmax[CONFIG_T::n_in];
+    #pragma unroll
+    for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
+        d_xi_xmax[i] = data[i] - x_max;
+    }
+
+    // Calculate all the e^x's
+    [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
+    #pragma unroll
+    for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
+        exp_res[i] = exp_table[softmax_stable_idx_from_real_val(d_xi_xmax[i])];
+    }
+
+    // Explicitly sum previously calculated exponentials with an adder tree
+    Op_add op_add;
+    [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
+        reduce>(exp_res, op_add);
+
+    // Multiply previously calculated exponentials with the reciprocal of the sum
+    [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
+        invert_table[softmax_stable_idx_from_real_val(exp_sum)];
+    #pragma unroll
+    for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
+        res[i] = exp_res[i] * inv_exp_sum;
+    }
+}
+
+// TODO - Improve accuracy
+template void softmax_latency(const data_T &data, res_T &res) {
+#include "activation_tables/exp_table_latency.tb"
+#include "activation_tables/invert_table_latency.tb"
+
+    // Calculate all
the e^x's
+    [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
+    #pragma unroll
+    for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
+        exp_res[i] = exp_table_latency[softmax_latency_idx_from_real_val(data[i])];
+    }
+
+    // Explicitly sum the results with an adder tree.
+    Op_add op_add;
+    [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
+        reduce>(exp_res, op_add);
+
+    // Multiply previously calculated exponentials with the reciprocal of the sum
+    [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
+        invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)];
+    #pragma unroll
+    for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
+        res[i] = exp_res[i] * inv_exp_sum;
+    }
+}
+
+template void softmax_legacy(const data_T &data, res_T &res) {
+#include "activation_tables/exp_table_legacy.tb"
+#include "activation_tables/invert_table_legacy.tb"
+
+    [[intel::fpga_register]] int data_round[CONFIG_T::n_in];
+New_loop:
+    #pragma unroll
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        data_round[ii] = (data[ii] * CONFIG_T::table_size / 16).to_int();
+    }
+NN_Outer:
+    #pragma unroll
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        typename CONFIG_T::exp_table_t exp_res_temp = 0;
+    NN_Inner:
+        #pragma unroll
+        for (int jj = 0; jj < CONFIG_T::n_in; jj++) {
+            if (ii == jj) {
+                exp_res_temp += 1;
+            } else {
+                int _data_cache = (data_round[jj] - data_round[ii]);
+                int index = _data_cache + 8 * CONFIG_T::table_size / 16;
+
+                if (index < 0)
+                    index = 0;
+                if (index > CONFIG_T::table_size - 1)
+                    index = CONFIG_T::table_size - 1;
+
+                typename CONFIG_T::exp_table_t temp_exp = exp_table_legacy[index];
+                exp_res_temp += temp_exp;
+            }
+        }
+        int exp_res_index = (exp_res_temp * CONFIG_T::table_size / 64).to_int();
+        if (exp_res_index < 0)
+            exp_res_index = 0;
+        if (exp_res_index > CONFIG_T::table_size - 1)
+            exp_res_index = CONFIG_T::table_size - 1;
+        res[ii] = invert_table_legacy[exp_res_index];
+    }
+}
+
+template void softmax_argmax(const data_T &data, res_T &res) {
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_in; i++) {
+        res[i] = static_cast(0);
+    }
+
+    [[intel::fpga_register]] auto maximum = data[0];
+    [[intel::fpga_register]] int idx = 0;
+
+    [[intel::initiation_interval(1)]] for (int i = 1; i < CONFIG_T::n_in; i++) {
+        if (data[i] > maximum) {
+            maximum = data[i];
+            idx = i;
+        }
+    }
+
+    res[idx] = static_cast(1);
+}
+
+template inline void softmax(const data_T &data, res_T &res) {
+    switch (CONFIG_T::implementation) {
+    case softmax_implementation::stable:
+        softmax_stable(data, res);
+        break;
+    case softmax_implementation::latency:
+        softmax_latency(data, res);
+        break;
+    case softmax_implementation::legacy:
+        softmax_legacy(data, res);
+        break;
+    case softmax_implementation::argmax:
+        softmax_argmax(data, res);
+        break;
+    default:
+        softmax_stable(data, res);
+        break;
+    }
+}
+
+// *************************************************
+// TanH Activation
+// *************************************************
+template void dense_tanh(const data_T &data, res_T &res) {
+    static constexpr int MAX_VALUE = 4;
+// Initialize the lookup table
+#include "activation_tables/tanh_table.tb"
+    // Index into the lookup table based on data
+    #pragma unroll
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        [[intel::fpga_register]] typename data_T::value_type temp;
+        [[intel::fpga_register]] typename res_T::value_type temp2;
+        if (data[ii] < 0) {
+            temp = -data[ii];
+        } else {
+            temp = data[ii];
+        }
+        ac_int<16> index = (temp * (CONFIG_T::table_size /
MAX_VALUE)).to_int(); + if (temp > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = static_cast(tanh_table[index]); + if (data[ii] < 0) { + res[ii] = -temp2; + } else { + res[ii] = temp2; + } + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* +template void hard_sigmoid(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + res[ii] = datareg; + } +} + +template void hard_tanh(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + res[ii] = 2 * sigmoid - 1; + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template +void leaky_relu(const data_T &data, const typename CONFIG_T::param_t alpha, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha * datareg; + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* +template +void thresholded_relu(const data_T &data, const typename CONFIG_T::param_t theta, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg > theta) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* +template void softplus(const data_T &data, res_T &res) { +// Initialize the lookup table +#include "activation_tables/softplus_table.tb" + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_int<16> data_round = (data[ii] * CONFIG_T::table_size / 16).to_int(); + ac_int<16> index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = static_cast(softplus_table[index]); + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* +template void softsign(const data_T &data, res_T &res) { + static constexpr int MAX_VALUE = 8; +// Initialize the lookup table +#include "activation_tables/softsign_table.tb" + + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + [[intel::fpga_register]] typename data_T::value_type temp; + [[intel::fpga_register]] typename res_T::value_type temp2; + if (data[ii] < 0) { + temp = -data[ii]; + } else { + temp = data[ii]; + } + ac_int<16> index = (temp * CONFIG_T::table_size / MAX_VALUE).to_int(); + if (temp > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = static_cast(softsign_table[index]); + if (data[ii] < 0) { + res[ii] = -temp2; + } else { + res[ii] = temp2; + } + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +template +void elu(const data_T &data, const typename CONFIG_T::param_t alpha, res_T &res) { 
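+    // Descriptive note (standard ELU definition, not specific to this patch):
+    // f(x) = x for x >= 0 and f(x) = alpha * (exp(x) - 1) for x < 0; the
+    // negative branch below is read from elu_table, with x in (-8, 0] mapped
+    // onto table_size entries by the index computation.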
+// Initialize the lookup table +#include "activation_tables/elu_table.tb" + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg >= 0) { + res[ii] = datareg; + } else { + ac_int<16> index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = alpha * elu_table[index]; + } + } +} + +// ************************************************* +// SELU Activation +// ************************************************* +template void selu(const data_T &data, res_T &res) { +// Initialize the lookup table +#include "activation_tables/selu_table.tb" + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg >= 0) { + res[ii] = static_cast(1.0507009873554804934193349852946) * datareg; + } else { + ac_int<16> index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = selu_table[index]; + } + } +} + +// ************************************************* +// PReLU Activation +// ************************************************* +template +void prelu(const data_T &data, const typename CONFIG_T::param_t &alpha, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha[ii] * datareg; + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template void binary_tanh(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + typename res_T::value_type cache; + if (datareg > 0) + cache = 1; + else + cache = -1; + + res[ii] = cache; + } +} + +// ************************************************* +// Ternary TanH Activation +// ************************************************* +template void ternary_tanh(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = 2 * data[ii]; + typename res_T::value_type cache; + if (datareg > 1) + cache = 1; + else if (datareg > -1 && datareg <= 1) + cache = 0; + else + cache = -1; + + res[ii] = cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h new file mode 100644 index 0000000000..13de5ab3bb --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h @@ -0,0 +1,712 @@ +#ifndef NNET_ACTIVATION_STREAM_H_ +#define NNET_ACTIVATION_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_types.h" + +namespace nnet { + +// ************************************************* +// Linear Activation +// ************************************************* +template void linear_stream() { +LinearActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + LinearPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + out_data[j] = in_data[j]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// ReLU Activation +// 
************************************************* +template void relu_stream() { +ReLUActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + ReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template void leaky_relu_stream(typename CONFIG_T::param_t alpha) { + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +LeakyReLUActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + LeakyReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha * in_data[j]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* +template +void thresholded_relu_stream(typename CONFIG_T::param_t theta) { +ThresholdedReLUActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + ThresholdedReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + if (in_data[j] > theta) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +template void elu_stream(typename CONFIG_T::param_t alpha) { +#include "activation_tables/elu_table.tb" + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +EluActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + EluPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = datareg; + } else { + int index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = alpha * elu_table[index]; + } + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// SeLU Activation +// ************************************************* +template void selu_stream() { +#include "activation_tables/selu_table.tb" + +SeluActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + 
typename ExtractPipeType::value_type out_data;
+
+    SeluPackLoop:
+        #pragma unroll
+        for (int j = 0; j < std::tuple_size::value_type>{}; j++) {
+            [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type datareg = in_data[j];
+            if (datareg >= 0) {
+                out_data[j] =
+                    typename ExtractPipeType::value_type::value_type(1.0507009873554804934193349852946) * datareg;
+            } else {
+                int index = (datareg * CONFIG_T::table_size / -8).to_int();
+                if (index > CONFIG_T::table_size - 1)
+                    index = CONFIG_T::table_size - 1;
+                out_data[j] = selu_table[index];
+            }
+        }
+
+        res_pipe::write(out_data);
+    }
+}
+
+// *************************************************
+// PReLU Activation
+// *************************************************
+template void prelu_stream(typename CONFIG_T::param_t alpha) {
+    constexpr unsigned multiplier_limit =
+        DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor);
+    constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit;
+
+PReLUActLoop:
+    [[intel::initiation_interval(pipeline)]] for (int i = 0;
+                                                  i < CONFIG_T::n_in /
+                                                          std::tuple_size::value_type>{};
+                                                  i++) {
+        auto in_data = data_pipe::read();
+        typename ExtractPipeType::value_type out_data;
+
+    PReLUPackLoop:
+        #pragma unroll
+        for (int j = 0; j < std::tuple_size::value_type>{}; j++) {
+            if (in_data[j] > 0)
+                out_data[j] = in_data[j];
+            else
+                out_data[j] = alpha[i * std::tuple_size::value_type>{} + j] * in_data[j];
+        }
+
+        res_pipe::write(out_data);
+    }
+}
+
+// *************************************************
+// Softplus Activation
+// *************************************************
+template void softplus_stream() {
+#include "activation_tables/softplus_table.tb"
+
+SoftplusActLoop:
+    [[intel::initiation_interval(
+        1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) {
+        auto in_data = data_pipe::read();
+        typename ExtractPipeType::value_type out_data;
+
+    SoftplusPackLoop:
+        #pragma unroll
+        for (int j = 0; j < std::tuple_size::value_type>{}; j++) {
+            [[intel::fpga_register]] int data_round = (in_data[j] * CONFIG_T::table_size / 16).to_int();
+            [[intel::fpga_register]] int index = data_round + 8 * CONFIG_T::table_size / 16;
+            if (index < 0)
+                index = 0;
+            else if (index > CONFIG_T::table_size - 1)
+                index = CONFIG_T::table_size - 1;
+            out_data[j] = softplus_table[index];
+        }
+
+        res_pipe::write(out_data);
+    }
+}
+
+// *************************************************
+// Softsign Activation
+// *************************************************
+template void softsign_stream() {
+#include "activation_tables/softsign_table.tb"
+
+    static const int MAX_VALUE = 8;
+
+SoftsignActLoop:
+    [[intel::initiation_interval(
+        1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) {
+        auto in_data = data_pipe::read();
+        typename ExtractPipeType::value_type out_data;
+
+    SoftsignPackLoop:
+        #pragma unroll
+        for (int j = 0; j < std::tuple_size::value_type>{}; j++) {
+            [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type absValue;
+            if (in_data[j] < 0) {
+                absValue = -in_data[j];
+            } else {
+                absValue = in_data[j];
+            }
+            ac_int<16> index = (absValue * CONFIG_T::table_size / MAX_VALUE).to_int();
+            if (absValue > MAX_VALUE)
+                index = CONFIG_T::table_size - 1;
+            if (in_data[j] < 0) {
+                out_data[j] =
+                    static_cast::value_type::value_type>(-softsign_table[index]);
+            } else {
+                out_data[j] = static_cast::value_type::value_type>(softsign_table[index]);
+            }
+        }
+
+        res_pipe::write(out_data);
+    }
+}
+
+// 
************************************************* +// Softmax Activation +// ************************************************* + +template void softmax_stable_stream() { +#include "activation_tables/exp_table.tb" +#include "activation_tables/invert_table.tb" + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type + data_array[std::tuple_size::value_type>{}]; + +SoftmaxArrayLoop: + [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + auto in_pack = data_pipe::read(); + + SoftmaxArrayPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + data_array[j] = in_pack[j]; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max::value_type::value_type> op_max; + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type x_max = + reduce::value_type::value_type, + std::tuple_size::value_type>{}, + Op_max::value_type::value_type>>(data_array, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + [[intel::fpga_register]] ac_fixed::value_type::value_type::width, + ExtractPipeType::value_type::value_type::i_width, true, AC_RND, AC_SAT> + d_xi_xmax[std::tuple_size::value_type>{}]; + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + d_xi_xmax[j] = data_array[j] - x_max; + } + + // Calculate all the e^x's + [[intel::fpga_register]] + typename CONFIG_T::exp_table_t exp_res[std::tuple_size::value_type>{}]; + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + exp_res[j] = + exp_table[softmax_stable_idx_from_real_val::value_type::value_type, + CONFIG_T>(d_xi_xmax[j])]; + } + + // Explicitly sum the results with an adder tree. 
+        // Rounding & Saturation mode, which improves accuracy, prevents Vivado from expression balancing
+        Op_add op_add;
+        [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
+            reduce::value_type>{},
+                   Op_add>(exp_res, op_add);
+
+        [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
+            invert_table[softmax_stable_idx_from_real_val(exp_sum)];
+        typename ExtractPipeType::value_type out_pack;
+
+    SoftmaxInvPackLoop:
+        #pragma unroll
+        for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) {
+
+            // TODO - Find Quartus-equivalent pragma
+            // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation
+
+            out_pack[j] = exp_res[j] * inv_exp_sum;
+        }
+
+        res_pipe::write(out_pack);
+    }
+}
+
+template void softmax_latency_stream() {
+#include "activation_tables/exp_table_latency.tb"
+#include "activation_tables/invert_table_latency.tb"
+
+    constexpr unsigned multiplier_limit =
+        DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor);
+    constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit;
+
+    // Calculate all the e^x's
+    [[intel::fpga_register]]
+    typename CONFIG_T::exp_table_t exp_res[std::tuple_size::value_type>{}];
+
+SoftmaxExpLoop:
+    [[intel::initiation_interval(pipeline)]] for (unsigned i = 0;
+                                                  i < CONFIG_T::n_in /
+                                                          std::tuple_size::value_type>{};
+                                                  i++) {
+        auto in_pack = data_pipe::read();
+
+    SoftmaxExpPackLoop:
+        #pragma unroll
+        for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) {
+            exp_res[j] = exp_table_latency[softmax_latency_idx_from_real_val<
+                typename ExtractPipeType::value_type::value_type, CONFIG_T>(in_pack[j])];
+        }
+
+        // Explicitly sum the results with an adder tree.
+        // Rounding & Saturation mode, which improves accuracy, prevents Vivado from expression balancing
+        Op_add op_add;
+        [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
+            reduce>(exp_res, op_add);
+
+        // Multiply previously calculated exponentials with the reciprocal of the sum
+        [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
+            invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)];
+
+        typename ExtractPipeType::value_type out_pack;
+    SoftmaxInvPackLoop:
+        #pragma unroll
+        for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) {
+            // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation
+            out_pack[j] = exp_res[j] * inv_exp_sum;
+        }
+
+        res_pipe::write(out_pack);
+    }
+}
+
+template void softmax_legacy_stream() {
+#include "activation_tables/exp_table_legacy.tb"
+#include "activation_tables/invert_table_legacy.tb"
+
+    // Index into the lookup table based on data for exponentials
+    [[intel::fpga_register]]
+    typename CONFIG_T::table_t exp_res[std::tuple_size::value_type>{}];
+    [[intel::fpga_register]] typename CONFIG_T::table_t exp_diff_res;
+    [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type
+        data_cache[std::tuple_size::value_type>{}];
+
+SoftmaxInitLoop:
+    [[intel::initiation_interval(1)]] for (unsigned s = 0;
+                                           s < CONFIG_T::n_in /
+                                                   std::tuple_size::value_type>{};
+                                           s++) {
+        auto in_pack = data_pipe::read();
+
+    SoftmaxInitPackLoop:
+        #pragma unroll
+        for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) {
+            data_cache[j] = in_pack[j];
+            exp_res[j] = 0;
+        }
+
+    SoftmaxExpLoop:
+        #pragma unroll
+        for (int i = 0; i < std::tuple_size::value_type>{}; i++) {
+        SoftmaxExpInner:
+            #pragma unroll
+            for (int j = 0; j < std::tuple_size::value_type>{}; j++) {
+                if (i == j) {
+                    exp_diff_res = 1;
+                } else {
+                    int data_round =
((data_cache[j] - data_cache[i]) * CONFIG_T::table_size / 16).to_int(); + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + exp_diff_res = exp_table_legacy[index]; + } + exp_res[i] += exp_diff_res; + } + } + + typename ExtractPipeType::value_type out_pack; + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + int exp_res_index = (exp_res[j] * CONFIG_T::table_size / 64).to_int(); + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + out_pack[j] = + static_cast::value_type::value_type>(invert_table_legacy[exp_res_index]); + } + + res_pipe::write(out_pack); + } +} + +template void softmax_argmax_stream() { + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + #pragma unroll + for (int i = 0; i < std::tuple_size::value_type>{}; i++) { + out_data[i] = static_cast::value_type::value_type>(0); + } + + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type maximum = in_data[0]; + [[intel::fpga_register]] int idx = 0; + + [[intel::initiation_interval(1)]] for (int i = 1; + i < std::tuple_size::value_type>{}; i++) { + if (in_data[i] > maximum) { + maximum = in_data[i]; + idx = i; + } + } + + out_data[idx] = static_cast::value_type::value_type>(1); + res_pipe::write(out_data); + } +} + +template void softmax_stream() { + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency_stream(); + break; + case softmax_implementation::stable: + softmax_stable_stream(); + break; + case softmax_implementation::legacy: + softmax_legacy_stream(); + break; + case softmax_implementation::argmax: + softmax_argmax_stream(); + break; + default: + softmax_stable_stream(); + break; + } +} + +// ************************************************* +// TanH Activation +// ************************************************* +template void dense_tanh_stream() { +#include "activation_tables/tanh_table.tb" + static const int MAX_VALUE = 4; + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +TanHActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + TanHPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type absoluteValue; + + if (in_data[j] < 0) + absoluteValue = (-1) * in_data[j]; + else + absoluteValue = in_data[j]; + + [[intel::fpga_register]] int index; + if (absoluteValue <= MAX_VALUE) + index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + else + index = CONFIG_T::table_size - 1; + + if (in_data[j] > 0) + out_data[j] = tanh_table[index]; + else + out_data[j] = -tanh_table[index]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* +template void sigmoid_stream() { +#include "activation_tables/sigmoid_table.tb" + static 
const int MAX_VALUE = 8; + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +SigmoidActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + SigmoidPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type absoluteValue; + + if (in_data[j] < 0) + absoluteValue = (-1) * in_data[j]; + else + absoluteValue = in_data[j]; + + [[intel::fpga_register]] int index; + if (absoluteValue <= MAX_VALUE) + index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + else + index = CONFIG_T::table_size - 1; + + if (in_data[j] > 0) + out_data[j] = sigmoid_table[index]; + else + out_data[j] = 1 - sigmoid_table[index]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* +// Note - Theano and Tensorflow might have different definitions for hard sigmoid; could provide two implementations +template void hard_sigmoid_stream() { + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +HardSigmoidActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + HardSigmoidPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] auto datareg = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + out_data[j] = datareg; + } + + res_pipe::write(out_data); + } +} + +template void hard_tanh_stream() { + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +HardTanhActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + HardTanhPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + out_data[j] = 2 * sigmoid - 1; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template void binary_tanh_stream() { +BinaryTanHActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + + [[intel::fpga_register]] auto in_data = data_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + BinaryTanHPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + if (in_data[j] > 0) + out_data[j] = static_cast::value_type::value_type>(1); + 
else + out_data[j] = static_cast::value_type::value_type>(-1); + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Ternary TanH Activation +// ************************************************* +template void ternary_tanh_stream() { +TernaryTanHActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + + [[intel::fpga_register]] auto in_data = data_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + TernaryTanHPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + if (in_data[j] > 1) + out_data[j] = static_cast::value_type::value_type>(1); + else if (in_data[j] <= -1) + out_data[j] = static_cast::value_type::value_type>(-1); + else + out_data[j] = static_cast::value_type::value_type>(0); + } + + res_pipe::write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h new file mode 100644 index 0000000000..f8e5bcb792 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h @@ -0,0 +1,104 @@ +#ifndef NNET_BATCHNORM_H_ +#define NNET_BATCHNORM_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" + +namespace nnet { + +struct batchnorm_config { + // Internal data type definitions + typedef float bias_t; + typedef float scale_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_filt = -1; + static const unsigned n_scale_bias = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? + + // Default multiplication + template using product = nnet::product::mult; +}; + +template +void normalize(const data_T &data, res_T &res, const typename CONFIG_T::scale_t &scale, + const typename CONFIG_T::bias_t &bias) { +// Calculate result +Result: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::n_in; ires++) { + if (CONFIG_T::n_filt == -1) { + res[ires] = + CONFIG_T::template product::product( + data[ires], scale[ires]) + + bias[ires]; + } else { + int norm_index = ires % CONFIG_T::n_filt; + res[ires] = + CONFIG_T::template product::product( + data[ires], scale[norm_index]) + + bias[norm_index]; + } + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +struct batchnorm_quantized_tanh_config { + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_filt = -1; + static const unsigned n_scale_bias = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const unsigned n_zeros = 0; +}; + +template +void normalize_binary_tanh(const data_T &data, res_T &res, const typename CONFIG_T::threshold_t &threshold) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_int<1, false> cache; + auto datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? 
ii : ii % CONFIG_T::n_filt; + if (datareg >= threshold[norm_index]) + cache = 1; + else + cache = 0; + + res[ii] = cache; + } +} + +template +void normalize_ternary_tanh(const data_T &data, res_T &res, const typename CONFIG_T::threshold_hi_t &threshold_hi, + const typename CONFIG_T::threshold_lo_t &threshold_lo) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_int<2, true> cache; + auto datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt; + if (datareg > threshold_hi[norm_index]) + cache = 1; + else if (datareg <= threshold_lo[norm_index]) + cache = -1; + else + cache = 0; + res[ii] = cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h new file mode 100644 index 0000000000..128b3ac1a4 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h @@ -0,0 +1,107 @@ +#ifndef NNET_BATCHNORM_STREAM_H_ +#define NNET_BATCHNORM_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include "nnet_types.h" + +namespace nnet { + +// **************************************************** +// Streaming Batch Normalization +// **************************************************** +template +void normalize_stream(typename CONFIG_T::scale_t scale, typename CONFIG_T::bias_t bias) { + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = CONFIG_T::n_in / multiplier_limit; + constexpr auto datasize = std::tuple_size::value_type>{}; + CONFIG_T::template product::value_type::value_type, + typename CONFIG_T::scale_t::value_type>::limit(multiplier_limit); + +BatchNormLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + BatchNormpack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * datasize + j; + else + norm_index = j % CONFIG_T::n_filt; + out_data[j] = + CONFIG_T::template product::value_type::value_type, + typename CONFIG_T::scale_t::value_type>::product(in_data[j], scale[norm_index]) + + bias[norm_index]; + } + + res_pipe::write(out_data); + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +template +void normalize_binary_tanh_stream(typename CONFIG_T::threshold_t threshold) { + constexpr auto datasize = std::tuple_size::value_type>{}; + +BinaryNormLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { + auto in_data = data_pipe::read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + + BatchNormPack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * datasize + j; + else + norm_index = j % CONFIG_T::n_filt; + + out_data[j] = (in_data[j] >= threshold[norm_index]) ? 
1 : 0; + } + + res_pipe::write(out_data); + } +} + +template +void normalize_ternary_tanh_stream(typename CONFIG_T::threshold_hi_t threshold_hi, + typename CONFIG_T::threshold_lo_t threshold_lo) { + constexpr auto datasize = std::tuple_size::value_type>{}; + +TernaryNormLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { + auto in_data = data_pipe::read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + + BatchNormPack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * datasize + j; + else + norm_index = j % CONFIG_T::n_filt; + + if (in_data[j] > threshold_hi[norm_index]) + out_data[j] = 1; + else if (in_data[j] <= threshold_lo[norm_index]) + out_data[j] = -1; + else + out_data[j] = 0; + } + + res_pipe::write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h new file mode 100644 index 0000000000..f37a61cb0c --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h @@ -0,0 +1,76 @@ +#ifndef NNET_COMMON_H_ +#define NNET_COMMON_H_ + +#include "nnet_helpers.h" +#include +#include +#include + +typedef ac_fixed<16, 6> table_default_t; + +namespace nnet { + +// Common type definitions +enum io_type { io_parallel = 0, io_stream }; + +// Default data types (??) TODO: Deprecate +typedef ac_fixed<16, 4> weight_t_def; +typedef ac_fixed<16, 4> bias_t_def; +typedef ac_fixed<32, 10> accum_t_def; + +template void merge(data_T data1[NIN1], data_T data2[NIN2], data_T res[NIN1 + NIN2]) { + #pragma unroll + for (int ii = 0; ii < NIN1; ii++) { + res[ii] = data1[ii]; + } + #pragma unroll + for (int ii = 0; ii < NIN2; ii++) { + res[NIN1 + ii] = data2[ii]; + } +} + +/* --- + * Balanced tree reduce implementation. + * For use in scenarios where Quartus cannot balance expressions on its own. + * Reduces an array of inputs to a single value using the template binary operator 'Op', + * for example summing all elements with Op_add, or finding the maximum with Op_max. + * Use only when the input array is fully unrolled; otherwise, slice out a fully unrolled section + * before applying it and accumulate the result over the rolled dimension. + * --- */ +template T reduce(const T *x, Op op) { + static constexpr int leftN = pow2::val>::val > 0 ? pow2::val>::val : 0; + static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; + if constexpr (N == 1) { + return x[0]; + } else if constexpr (N == 2) { + return op(x[0], x[1]); + } else { + return op(reduce(x, op), reduce(x + leftN, op)); + } +} + +// alternate reduce - basic +// template T reduce(const T *x, Op op) { +// if (N == 1) { +// return x[0]; +// } +// auto val = op(x[0], x[1]); +// for (int i = 2; i < N; i++) { +// val = op(val, x[i]); +// } +// return val; +// } + +template class Op_add { + public: + T operator()(T a, T b) { return a + b; } +}; + +template class Op_max { + public: + T operator()(T a, T b) { return a >= b ? 
a : b; } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h new file mode 100644 index 0000000000..38560f120c --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h @@ -0,0 +1,61 @@ +#ifndef NNET_CONV1D_H_ +#define NNET_CONV1D_H_ + +#include "nnet_common.h" +#include "nnet_conv1d_resource.h" + +namespace nnet { + +struct conv1d_config { + // I/O sizes + static const unsigned in_width = 10; + static const unsigned out_width = 10; + + // Number of channels, filters + static const unsigned n_chan = 1; + static const unsigned n_filt = 1; + + // Original filter size + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_width; + + // Modified filter size (post-Winograd transformation, if applied) + static const unsigned impl_filt_height = 1; + static const unsigned impl_filt_width = 1; + + // Padding, stride, dilation + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned stride_width = 1; + static const unsigned dilation = 1; + + // Run-time configuration + static const unsigned n_zeros = 0; + static const unsigned reuse_factor = 1; + static const unsigned parallelization_factor = 1; + + // TODO: BRAM Storage on Quartus + static const bool store_weights_in_bram = false; + + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; +}; + +template +void conv_1d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + conv_1d_resource_cl(data, res, weights, biases); +} + +template +void pointwise_conv_1d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + assert(CONFIG_T::filt_width == 1); + pointwise_conv_1d_resource_cl(data, res, weights, biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h new file mode 100644 index 0000000000..85009d4a3a --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h @@ -0,0 +1,237 @@ +#ifndef NNET_CONV1D_RESOURCE_H_ +#define NNET_CONV1D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +enum class conv1d_implementation { combination, im2col, winograd }; + +// **************************************************************** +// im2col - General-purpose 1D Convolution algorithm +// **************************************************************** + +template +void im2col_1d_cl(const data_T &data, data_col_T &data_col, const int col) { + // im2col can be unrolled fully, since number of parallel executions = filt_w x n_chan ~ O(100) and very little DSP + // usage + + [[intel::fpga_register]] int index = 0; + +KernelLoop: + #pragma unroll + for (int kernel_col = 0; kernel_col < CONFIG_T::impl_filt_width; kernel_col++) { + ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + [[intel::fpga_register]] int index_data = + (col * CONFIG_T::stride_width + kernel_col - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel; + if (index_data >= 0 && index_data < CONFIG_T::in_width * CONFIG_T::n_chan) { + data_col[index++] = data[index_data]; + } else { + data_col[index++] = 0; + } + } + } +} + +template +void conv_1d_im2col_cl(const 
data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + // im2col performs no filter transformations; therefore, filter size remains constant + assert(CONFIG_T::filt_width == CONFIG_T::impl_filt_width); + + // Unroll factor for loop traversing input image, derived from parallelization_factor + static constexpr int pf = MIN(CONFIG_T::parallelization_factor, CONFIG_T::out_width); + + using data_col_T = array; + using res_col_T = array; + +ColLoop: + #pragma unroll pf + [[intel::initiation_interval(CONFIG_T::reuse_factor)]] for (int i = 0; i < CONFIG_T::out_width; i++) { + // Loop variables should always be declared in the deepest scope available + // See Intel's HLS - Loop Best Practices + // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html + + [[intel::fpga_register]] data_col_T data_col; + im2col_1d_cl(data, data_col, i); + + [[intel::fpga_register]] res_col_T res_col; + dense_resource(data_col, res_col, weights, biases); + + // Unroll fully, since + // (1) n_filt is usually low in io_parallel (< 32) + // (2) no complex operations handled in loop, this loop performs a simple register writing operation + FiltLoop: + #pragma unroll + for (int j = 0; j < CONFIG_T::n_filt; j++) { + res[i * CONFIG_T::n_filt + j] = res_col[j]; + } + } +} + +// **************************************************************** +// 1D Convolution for 3x1 kernels from Winograd's algorithm +// **************************************************************** + +// Explicitly transformed input (B'dB) needed for Winograd convolution, as explained by Lavin & Gray (2015) +template +inline void winograd_transform_input_tile_3x1_kernel(const data_T I[4], res_T D[4]) { + D[0] = I[0] - I[2]; + D[1] = I[1] + I[2]; + D[2] = -I[1] + I[2]; + D[3] = I[1] - I[3]; +} + +template +void winograd_conv1d_3x1_kernel_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + // Ensure Winograd conditions are met + assert(CONFIG_T::filt_width == 3); + assert(CONFIG_T::stride_width == 1); + assert(CONFIG_T::out_width > 2); + + // Unroll factor for loop traversing input image, derived from parallelization_factor + static constexpr int pf = MIN(CONFIG_T::parallelization_factor, CONFIG_T::out_width); + + // Initialise result to bias + // Unroll fully, as loop performs a simple operation - assigning the outputs to a constant value + #pragma unroll + for (int i = 0; i < CONFIG_T::out_width; i++) { + int offset = CONFIG_T::n_filt * i; + #pragma unroll + for (int f = 0; f < CONFIG_T::n_filt; f++) { + res[offset + f] = static_cast(biases[f]); + } + } + +WidthLoop: + #pragma unroll pf + for (int col = 0; col < CONFIG_T::out_width; col += 2) { + ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + // Get current 4x1 tile + [[intel::fpga_register]] typename data_T::value_type T[4]; + [[intel::fpga_register]] uint8_t p = 0; + + #pragma unroll + for (int c = col - (int)CONFIG_T::pad_left; c < col + 4 - (int)CONFIG_T::pad_left; c++) { + if (c < CONFIG_T::in_width && c >= 0) { + T[p++] = data[c * CONFIG_T::n_chan + channel]; + } else { + T[p++] = 0; + } + } + + // Transform input tile + [[intel::fpga_register]] typename CONFIG_T::accum_t D[4]; + winograd_transform_input_tile_3x1_kernel(T, D); + + #pragma unroll + for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { + [[intel::fpga_register]] int filter_offset = 4 * 
(CONFIG_T::n_chan * filter + channel); + + // Hadamard product between transformed input tile and kernel + [[intel::fpga_register]] typename CONFIG_T::accum_t Y[4]; + #pragma unroll + for (int i = 0; i < 4; i++) { + Y[i] = static_cast(D[i] * weights[filter_offset + i]); + } + + // Explicitly transform intermediate result Z = A'YA and save to output + res[CONFIG_T::n_filt * col + filter] += static_cast(Y[0] + Y[1] + Y[2]); + if ((col + 1) < CONFIG_T::out_width) + res[CONFIG_T::n_filt * (col + 1) + filter] += + static_cast(Y[1] - Y[2] - Y[3]); + } + } + } +} + +// **************************************************************** +// 1D Convolution for 1x1 kernels using optimized im2col +// **************************************************************** + +template +void im2col_1d_pointwise_cl(const data_T &data, data_col_T &data_col, const int col) { + // pointwise_im2col can be unrolled fully, only one loop with n_chan iterations + + [[intel::fpga_register]] int index = 0; + +ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + [[intel::fpga_register]] int index_data = + (col * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel; + if (index_data >= 0 && index_data < CONFIG_T::in_width * CONFIG_T::n_chan) { + data_col[index++] = data[index_data]; + } else { + data_col[index++] = 0; + } + } +} + +template +void pointwise_conv_1d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + assert(CONFIG_T::filt_width == 1); + + // Unroll factor for loop traversing input image, derived from parallelization_factor + static constexpr int pf = MIN(CONFIG_T::parallelization_factor, CONFIG_T::out_width); + + using data_col_T = array; + using res_col_T = array; + +ColLoop: + #pragma unroll pf + [[intel::initiation_interval(CONFIG_T::reuse_factor)]] for (int col = 0; col < CONFIG_T::out_width; col++) { + // Loop variables should always be declared in the deepest scope available + // See Intel's HLS - Loop Best Practices + // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html + + [[intel::fpga_register]] data_col_T data_col; + im2col_1d_pointwise_cl(data, data_col, col); + + [[intel::fpga_register]] res_col_T res_col; + dense_resource(data_col, res_col, weights, biases); + + // Unroll fully, since + // (1) n_filt is usually low in io_parallel (< 32) + // (2) no complex operations handled in loop, this loop performs a simple register writing operation + FiltLoop: + #pragma unroll + for (int k = 0; k < CONFIG_T::n_filt; k++) { + res[col * CONFIG_T::n_filt + k] = res_col[k]; + } + } +} + +// **************************************************************** +// Top-level function - handles different implementations +// **************************************************************** +template +void conv_1d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + static constexpr bool winograd_conditions = + // Winograd's minimal filtering algorithm not applicable to stride != 1 + CONFIG_T::stride_width == 1 && + + // Intel HLS will fail to pipeline the entire component if the Winograd loop only runs once + CONFIG_T::out_width > 2 && + + // Verify user opted for Winograd + (CONFIG_T::implementation == nnet::conv1d_implementation::combination || + CONFIG_T::implementation == nnet::conv1d_implementation::winograd); + + if 
(CONFIG_T::filt_width == 3 && winograd_conditions) { + winograd_conv1d_3x1_kernel_cl(data, res, weights, biases); + } else { + conv_1d_im2col_cl(data, res, weights, biases); + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h new file mode 100644 index 0000000000..1ffd11774f --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h @@ -0,0 +1,177 @@ +#ifndef NNET_CONV1D_STREAM_H_ +#define NNET_CONV1D_STREAM_H_ + +#include "nnet_dense.h" +#include "nnet_types.h" + +namespace nnet { + +/* + * void kernel_shift(shift_buffer, kernel_window) + * + * Args: + * shift_buffer - array elements popped from the line buffer during the shift line buffer operation + * kernel_window - array of values from the input currently being convolved with the kernel + * + * Values from shift_buffer are inserted into kernel_window, updating the values to be convolved + */ +template +void kernel_shift_1d(typename data_T::value_type shift_buffer[CONFIG_T::n_chan], data_window_T &kernel_window) { +/* + * Manually shift kernel_window by one step to the left + * It is not possible to use nnet::shift_reg, as the kernel window is convolved with the kernel weights using dense matrix + * multiplication, which is only implemented for arrays + * However, provided certain timing constraints are met, Intel HLS automatically infers a shift operation and implements + * kernel_window as a shift register + * To verify, see synthesis report in report.html > Area Analysis of System + */ +KernelShiftWidth: + #pragma unroll + for (int col = 0; col < CONFIG_T::filt_width - 1; col++) { + KernelShiftChannel: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + kernel_window[col * CONFIG_T::n_chan + channel] = kernel_window[(col + 1) * CONFIG_T::n_chan + channel]; + } + } + +// Insert shift_buffer values into the last column of the kernel window +KernelPushChannel: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + kernel_window[(CONFIG_T::filt_width - 1) * CONFIG_T::n_chan + channel] = shift_buffer[channel]; + } +} + +/* + * void shift_line_buffer(in_element, line_buffer, shift_buffer) + * + * Args: + * in_element - current elements from input image; data_T is usually nnet::array, its size corresponds to the number of channels + * line_buffer - chained array of shift registers, one for each row of the kernel and channel + * shift_buffer - array elements popped from the line buffer during the shift operation + * + * Values from in_element are inserted into the line buffer, causing all other elements to be shifted by one + * Popped elements are later used to update the kernel window, during the kernel_shift operation + */ +template +void shift_line_buffer_1d( + const data_T &in_elem, + nnet::shift_reg + line_buffer[CONFIG_T::n_chan], + typename data_T::value_type shift_buffer[CONFIG_T::n_chan]) { +// For every channel, insert the incoming pixel at end of the shift buffer +UpdateBuffer: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + shift_buffer[channel] = in_elem[channel]; + } +} + +/* + * void compute_output_buffer(in_element, res_stream, line_buffer, kernel_window, weights, biases) + * + * Args: + * in_element - current elements from input image; data_T is usually nnet::array, its size corresponds to the number of channels + * res_stream - output stream, passed by reference to allow direct writing + * line_buffer - chained array of shift registers, one for each row of the kernel and channel + * kernel_window - array of values from the input currently convolved with the kernel + * weights - Conv1D layer weights + * biases - Conv1D layer biases + * + * Function executes 4 steps: + * (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last elements + * (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from the line buffer + * (3) Matrix multiplication - performs dense matrix multiplication between the current input window and kernel weights + * (4) Counter housekeeping - keeps track of current pixel and stride + */ +template +void compute_output_buffer_1d( + const data_T &in_elem, + nnet::shift_reg + line_buffer[CONFIG_T::n_chan], + data_window_T &kernel_window, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases, + int &pX, int &sX) { + + using res_T = typename ExtractPipeType::value_type; + + // Thresholds + constexpr int lShiftX = CONFIG_T::filt_width - 1; + + // Step 1 - Shift line buffer + [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::n_chan]; + nnet::shift_line_buffer_1d(in_elem, line_buffer, shift_buffer); + + // Step 2 - Kernel shift + nnet::kernel_shift_1d(shift_buffer, kernel_window); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && pX > (lShiftX - 1)) { + // Step 3 - Dense matrix multiplication + [[intel::fpga_register]] res_T res_out; + dense_resource(kernel_window, res_out, weights, biases); + + // Write result to output stream + [[intel::fpga_register]] res_T res_pack; + CastLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_filt; channel++) { + res_pack[channel] = res_out[channel]; + } + res_pipe::write(res_pack); + } + + // Reached end of image + if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) { + pX = 0; + sX = 0; + // Otherwise, move to the right + } else { + pX++; + sX = ((sX - lShiftX) == 0) ? 
(sX - CONFIG_T::stride_width + 1) : (sX + 1); + } +} + +template +void conv_1d_cl_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::bias_t biases) { + + using data_arr_T = typename ExtractPipeType::value_type; + using data_element_T = typename data_arr_T::value_type; + using data_window_T = array; + + // Line buffer and kernel window + [[intel::fpga_register]] nnet::shift_reg + line_buffer[CONFIG_T::n_chan]; + [[intel::fpga_register]] data_window_T kernel_window; + + // An array of length CONFIG_T::n_chan, with elements set to zero (padding for each channel) + constexpr auto padds = zero_array(); + + // Move former static variables outside the function calls + // X position pixel + int pX = 0; + // X strides + int sX = 0; + +// Input image left-side padding +PaddingLeftWidth: + for (int col = 0; col < CONFIG_T::pad_left; col++) { + compute_output_buffer_1d(padds, line_buffer, kernel_window, weights, + biases, pX, sX); + } + +// Read input image +ReadInputWidth: + for (int col = 0; col < CONFIG_T::in_width; col++) { + compute_output_buffer_1d(data_pipe::read(), line_buffer, + kernel_window, weights, biases, pX, sX); + } + +// Input image right-side padding +PaddingRightWidth: + for (int col = 0; col < CONFIG_T::pad_right; col++) { + compute_output_buffer_1d(padds, line_buffer, kernel_window, weights, + biases, pX, sX); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h new file mode 100644 index 0000000000..79b1508c5f --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h @@ -0,0 +1,67 @@ +#ifndef NNET_CONV2D_H_ +#define NNET_CONV2D_H_ + +#include "nnet_conv2d_resource.h" + +namespace nnet { + +struct conv2d_config { + // I/O sizes + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned out_height = 10; + static const unsigned out_width = 10; + + // Number of channels, filters + static const unsigned n_chan = 1; + static const unsigned n_filt = 1; + + // Original filter size + static const unsigned filt_height = 1; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_height * filt_width; + + // Modified filter size (post-Winograd transformation, if applied) + static const unsigned impl_filt_height = 1; + static const unsigned impl_filt_width = 1; + + // Padding, stride, dilation + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned stride_height = 1; + static const unsigned stride_width = 1; + static const unsigned dilation_height = 1; + static const unsigned dilation_width = 1; + + // Run-time configuration + static const unsigned n_zeros = 0; + static const unsigned reuse_factor = 1; + static const unsigned parallelization_factor = 1; + + // TODO: BRAM Storage on Quartus + static const bool store_weights_in_bram = false; + + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; +}; + +template +void conv_2d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + conv_2d_resource_cl(data, res, weights, biases); +} + +template +void pointwise_conv_2d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + assert(CONFIG_T::filt_height == 1 && 
CONFIG_T::filt_width == 1); + pointwise_conv_2d_resource_cl(data, res, weights, biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h new file mode 100644 index 0000000000..7265d90e1c --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h @@ -0,0 +1,297 @@ +#ifndef NNET_CONV2D_RESOURCE_H_ +#define NNET_CONV2D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" +#include "nnet_helpers.h" + +namespace nnet { + +enum class conv2d_implementation { combination, im2col, winograd }; + +// **************************************************************** +// im2col - General-purpose 2D Convolution algorithm +// **************************************************************** + +template +void im2col_2d_cl(const data_T &data, data_col_T &data_col, const int row, const int col) { + // im2col can be unrolled fully, since number of parallel executions = filt_h x filt_w x n_chan ~ O(100) and very little + // DSP usage + + [[intel::fpga_register]] int index = 0; + +FiltHeightLoop: + #pragma unroll + for (int kernel_row = 0; kernel_row < CONFIG_T::impl_filt_height; kernel_row++) { + [[intel::fpga_register]] int input_row = + -CONFIG_T::pad_top + kernel_row * CONFIG_T::dilation_height + row * CONFIG_T::stride_height; + + FiltWidthLoop: + #pragma unroll + for (int kernel_col = 0; kernel_col < CONFIG_T::impl_filt_width; kernel_col++) { + [[intel::fpga_register]] int input_col = + -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation_width + col * CONFIG_T::stride_width; + + ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + if (input_row >= 0 && input_row < CONFIG_T::in_height && input_col >= 0 && input_col < CONFIG_T::in_width) { + data_col[index++] = + data[input_row * CONFIG_T::in_width * CONFIG_T::n_chan + input_col * CONFIG_T::n_chan + channel]; + } else { + data_col[index++] = 0; + } + } + } + } +} + +template +void conv_2d_im2col_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + // im2col performs no filter transformations; therefore, filter size remains constant + assert(CONFIG_T::filt_height == CONFIG_T::impl_filt_height && CONFIG_T::filt_width == CONFIG_T::impl_filt_width); + + // Unroll factors for loop traversing input image, derived from parallelization_factor + // Outer loop only gets unrolled after inner loop is fully unrolled + static constexpr int pfc = MIN(CONFIG_T::parallelization_factor, CONFIG_T::out_width); + static constexpr int pfr = MIN((CONFIG_T::parallelization_factor / pfc), CONFIG_T::out_height); + + using data_col_T = + array; + using res_col_T = array; + +HeightLoop: + #pragma unroll pfr + for (int i = 0; i < CONFIG_T::out_height; i++) { + WidthLoop: + #pragma unroll pfc + [[intel::initiation_interval(CONFIG_T::reuse_factor)]] for (int j = 0; j < CONFIG_T::out_width; j++) { + // Loop variables should always be declared in the deepest scope available + // See Intel's HLS - Loop Best Practices + // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html + + [[intel::fpga_register]] data_col_T data_col; + im2col_2d_cl(data, data_col, i, j); + + [[intel::fpga_register]] res_col_T res_col; + dense_resource(data_col, res_col, weights, biases); + + // Unroll fully, since + // (1) n_filt is usually low in io_parallel (< 32) + 
// (2) no complex operations handled in loop, this loop performs a simple register writing operation + FiltLoop: + #pragma unroll + for (int k = 0; k < CONFIG_T::n_filt; k++) { + res[i * CONFIG_T::out_width * CONFIG_T::n_filt + j * CONFIG_T::n_filt + k] = res_col[k]; + } + } + } +} + +// **************************************************************** +// 2D Convolution for 3x3 kernels from Winograd's algorithm +// **************************************************************** + +// Explicitly transformed input (B'dB) needed for Winograd calculation, as explained by Lavin & Gray, 2015 +template +inline void winograd_transform_input_tile_3x3_kernel(const data_T I[16], res_T D[16]) { + D[0] = I[0] - I[2] - I[8] + I[10]; + D[1] = I[1] + I[2] - I[9] - I[10]; + D[2] = -I[1] + I[2] + I[9] - I[10]; + D[3] = I[1] - I[3] - I[9] + I[11]; + + D[4] = I[4] - I[6] + I[8] - I[10]; + D[5] = I[5] + I[6] + I[9] + I[10]; + D[6] = -I[5] + I[6] - I[9] + I[10]; + D[7] = I[5] - I[7] + I[9] - I[11]; + + D[8] = -I[4] + I[6] + I[8] - I[10]; + D[9] = -I[5] - I[6] + I[9] + I[10]; + D[10] = I[5] - I[6] - I[9] + I[10]; + D[11] = -I[5] + I[7] + I[9] - I[11]; + + D[12] = I[4] - I[6] - I[12] + I[14]; + D[13] = I[5] + I[6] - I[13] - I[14]; + D[14] = -I[5] + I[6] + I[13] - I[14]; + D[15] = I[5] - I[7] - I[13] + I[15]; +} + +template +void winograd_conv2d_3x3_kernel_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + // Ensure Winograd conditions are met + assert(CONFIG_T::filt_height == 3 && CONFIG_T::filt_width == 3); + assert(CONFIG_T::stride_height == 1 && CONFIG_T::stride_width == 1); + assert(CONFIG_T::pad_left == CONFIG_T::pad_right && CONFIG_T::pad_top == CONFIG_T::pad_bottom); + assert(CONFIG_T::out_height > 2 && CONFIG_T::out_width > 2); + + // Unroll factor for loop traversing input image, derived from parallelization_factor + // Outer loop only gets unrolled after inner loop is fully unrolled + static constexpr int pfc = MIN(CONFIG_T::parallelization_factor, DIV_ROUNDUP(CONFIG_T::out_width, 2)); + static constexpr int pfr = MIN((CONFIG_T::parallelization_factor / pfc), DIV_ROUNDUP(CONFIG_T::out_height, 2)); + + // Initialise result to bias + // Unroll fully, as loop performs a simple operation - assigning the outputs to a constant value + #pragma unroll + for (int i = 0; i < CONFIG_T::out_height * CONFIG_T::out_width; i++) { + int offset = CONFIG_T::n_filt * i; + #pragma unroll + for (int f = 0; f < CONFIG_T::n_filt; f++) { + res[offset + f] = static_cast(biases[f]); + } + } + +HeightLoop: + #pragma unroll pfr + for (int row = 0; row < CONFIG_T::out_height; row += 2) { + WidthLoop: + #pragma unroll pfc + for (int col = 0; col < CONFIG_T::out_width; col += 2) { + ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + // Get current 4x4 tile + [[intel::fpga_register]] typename data_T::value_type T[16]; + [[intel::fpga_register]] typename CONFIG_T::accum_t D[16]; + [[intel::fpga_register]] uint8_t p = 0; + + #pragma unroll + for (int r = row - (int)CONFIG_T::pad_top; r < row + 4 - (int)CONFIG_T::pad_top; r++) { + #pragma unroll + for (int c = col - (int)CONFIG_T::pad_left; c < col + 4 - (int)CONFIG_T::pad_left; c++) { + if (r < CONFIG_T::in_height && r >= 0 && c < CONFIG_T::in_width && c >= 0) { + T[p++] = data[r * CONFIG_T::in_width * CONFIG_T::n_chan + c * CONFIG_T::n_chan + channel]; + } else { + T[p++] = 0; + } + } + } + + // Transform input tile + winograd_transform_input_tile_3x3_kernel(T, D); + 
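+                // For F(2x2, 3x3) Winograd, the output transform Z = A'YA uses
+                //     A' = | 1  1  1  0 |
+                //          | 0  1 -1 -1 |
+                // so each 4x4 Hadamard product Y collapses to a 2x2 output tile; the four
+                // accumulations in the filter loop below are the expanded entries of A'YA.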
+ #pragma unroll + for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { + [[intel::fpga_register]] int filter_offset = 16 * (CONFIG_T::n_chan * filter + channel); + + // Hadamard product between transformed input tile and kernel + [[intel::fpga_register]] typename CONFIG_T::accum_t Y[16]; + #pragma unroll + for (int i = 0; i < 16; i++) { + Y[i] = static_cast(D[i] * weights[filter_offset + i]); + } + + // Explicitly transform intermediate result Z = A'YA and save to output + res[CONFIG_T::n_filt * (row * CONFIG_T::out_width + col) + filter] += static_cast(Y[0] + Y[1] + Y[2] + Y[4] + Y[5] + Y[6] + Y[8] + Y[9] + Y[10]); + if ((col + 1) < CONFIG_T::out_width) + res[CONFIG_T::n_filt * (row * CONFIG_T::out_width + (col + 1)) + filter] += + static_cast(Y[1] - Y[2] - Y[3] + Y[5] - Y[6] - Y[7] + Y[9] - Y[10] - Y[11]); + if ((row + 1) < CONFIG_T::out_height) + res[CONFIG_T::n_filt * ((row + 1) * CONFIG_T::out_width + col) + filter] += + static_cast(Y[4] + Y[5] + Y[6] - Y[8] - Y[9] - Y[10] - Y[12] - Y[13] - Y[14]); + if ((row + 1) < (CONFIG_T::out_height) && (col + 1) < CONFIG_T::out_width) + res[CONFIG_T::n_filt * ((row + 1) * CONFIG_T::out_width + (col + 1)) + filter] += + static_cast(Y[5] - Y[6] - Y[7] - Y[9] + Y[10] + Y[11] - Y[13] + Y[14] + Y[15]); + } + } + } + } +} + +// **************************************************************** +// 2D Convolution for 1x1 kernels using optimized im2col +// **************************************************************** + +template +void im2col_2d_pointwise_cl(const data_T &data, data_col_T &data_col, const int row, const int col) { + // pointwise_im2col can be unrolled fully, only one loop with n_chan iterations + + [[intel::fpga_register]] int index = 0; + +ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + + [[intel::fpga_register]] int input_row = -CONFIG_T::pad_top + row * CONFIG_T::stride_height; + [[intel::fpga_register]] int input_col = -CONFIG_T::pad_left + col * CONFIG_T::stride_width; + + if (input_row >= 0 && input_row < CONFIG_T::in_height && input_col >= 0 && input_col < CONFIG_T::in_width) { + data_col[index++] = + data[input_row * CONFIG_T::in_width * CONFIG_T::n_chan + input_col * CONFIG_T::n_chan + channel]; + } else { + data_col[index++] = 0; + } + } +} + +template +void pointwise_conv_2d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); + + // Unroll factors for loop traversing input image, derived from parallelization_factor + // Outer loop only gets unrolled after inner loop is fully unrolled + static constexpr int pfc = MIN(CONFIG_T::parallelization_factor, CONFIG_T::out_width); + static constexpr int pfr = MIN((CONFIG_T::parallelization_factor / pfc), CONFIG_T::out_height); + + using data_col_T = array; + using res_col_T = array; + +HeightLoop: + #pragma unroll pfr + for (int row = 0; row < CONFIG_T::out_height; row++) { + WidthLoop: + #pragma unroll pfc + [[intel::initiation_interval(CONFIG_T::reuse_factor)]] for (int col = 0; col < CONFIG_T::out_width; col++) { + // Loop variables should always be declared in the deepest scope available + // See Intel's HLS - Loop Best Practices + // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html + + [[intel::fpga_register]] data_col_T data_col; + im2col_2d_pointwise_cl(data, data_col, row, col); + + 
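+            // With a 1x1 kernel, im2col reduces to gathering the n_chan inputs at the single
+            // (row, col) position, so the dense_resource call below is a plain n_chan x n_filt
+            // matrix-vector product per output pixel.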
[[intel::fpga_register]] res_col_T res_col; + dense_resource(data_col, res_col, weights, biases); + + FiltLoop: + #pragma unroll + for (int k = 0; k < CONFIG_T::n_filt; k++) { + res[row * CONFIG_T::out_width * CONFIG_T::n_filt + col * CONFIG_T::n_filt + k] = res_col[k]; + } + } + } +} + +// **************************************************************** +// Top-level function - handles different implementations +// **************************************************************** +template +void conv_2d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + static constexpr bool winograd_conditions = + // Winograd's minimal filtering algorithm not applicable to stride != 1 + CONFIG_T::stride_height == 1 && CONFIG_T::stride_width == 1 && + + // Intel HLS will fail to pipeline the entire component if the Winograd loop only runs once + CONFIG_T::out_height > 2 && CONFIG_T::out_width > 2 && + + // Verify user opted for Winograd + (CONFIG_T::implementation == nnet::conv2d_implementation::combination || + CONFIG_T::implementation == nnet::conv2d_implementation::winograd); + + if (CONFIG_T::filt_height == 3 && CONFIG_T::filt_width == 3 && winograd_conditions) { + winograd_conv2d_3x3_kernel_cl(data, res, weights, biases); + } else { + conv_2d_im2col_cl(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h new file mode 100644 index 0000000000..08f0eaa872 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h @@ -0,0 +1,241 @@ +#ifndef NNET_CONV2D_STREAM_H_ +#define NNET_CONV2D_STREAM_H_ + +#include "nnet_dense.h" +#include "nnet_types.h" + +namespace nnet { + +/* + * void kernel_shift(shift_buffer, kernel_window) + * + * Args: + * shift_buffer - array elements popped from the line buffer during the shift line buffer operation + * kernel_window - array of values from the input currently being convolved with the kernel + * + * Values from shift_buffer are inserted into kernel_window, updating the values to be convolved + */ +template +void kernel_shift_2d(typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan], + data_window_T &kernel_window) { +/* + * Manually shift kernel_window by one step to the left + * It is not possible to use nnet::shift_reg, as the kernel window is convolved with the kernel weights using dense matrix + * multiplication, which is only implemented for arrays + * However, provided certain timing constraints are met, Intel HLS automatically infers a shift operation and implements + * kernel_window as a shift register + * To verify, see synthesis report in report.html > Area Analysis of System + */ +KernelShiftWidth: + #pragma unroll + for (int col = 0; col < CONFIG_T::filt_width - 1; col++) { + KernelShiftHeight: + #pragma unroll + for (int row = 0; row < CONFIG_T::filt_height; row++) { + KernelShiftChannel: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + kernel_window[row * CONFIG_T::filt_width * CONFIG_T::n_chan + col * CONFIG_T::n_chan + channel] = + kernel_window[row * CONFIG_T::filt_width * CONFIG_T::n_chan + (col + 1) * CONFIG_T::n_chan + channel]; + } + } + } + +// Insert shift_buffer values into the last column of the kernel window +KernelPushHeight: + #pragma unroll + for (int col = 0; col < CONFIG_T::filt_height; col++) { + 
KernelPushChannel: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + kernel_window[(CONFIG_T::filt_width - 1) * CONFIG_T::n_chan + col * CONFIG_T::filt_width * CONFIG_T::n_chan + + channel] = shift_buffer[col][channel]; + } + } +} + +/* + * void shift_line_buffer(in_element, line_buffer, shift_buffer) + * + * Args: + * in_element - current elements from input image; data_T is usually nnet::array, its size corresponds to the number of channels + * line_buffer - chained array of shift registers, one for each row of the kernel and channel + * shift_buffer - array elements popped from the line buffer during the shift operation + * + * Values from in_element are inserted into the line buffer, causing all other elements to be shifted by one + * Popped elements are later used to update the kernel window, during the kernel_shift operation + */ +template +void shift_line_buffer_2d( + const data_T &in_elem, + nnet::shift_reg + line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan], + typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan]) { +// For every channel, insert the incoming pixel at end of the shift buffer +UpdateBuffer: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + shift_buffer[CONFIG_T::filt_height - 1][channel] = in_elem[channel]; + } + +// Shift line buffer and save popped values to shift buffer +LineBufferDataIn: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + LineBufferShift: + #pragma unroll + for (unsigned col = 1; col < CONFIG_T::filt_height; col++) { + // Shift the line buffer, return the popped pixel + typename data_T::value_type pop = + line_buffer[col - 1][channel].shift(shift_buffer[CONFIG_T::filt_height - col][channel]); + + // Place popped pixel into the shift buffer, one row above + shift_buffer[CONFIG_T::filt_height - col - 1][channel] = pop; + } + } +} + +/* + * void compute_output_buffer(in_element, res_stream, line_buffer, kernel_window, weights, biases) + * + * Args: + * in_element - current elements from input image; data_T is usually nnet::array, its size corresponds to the number of channels + * res_stream - output stream, passed by reference to allow direct writing + * line_buffer - chained array of shift registers, one for each row of the kernel and channel + * kernel_window - array of values from the input currently convolved with the kernel + * weights - Conv1D/Conv2D layer weights + * biases - Conv1D/Conv2D layer biases + * + * Function executes 4 steps: + * (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last elements + * (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from the line buffer + * (3) Matrix multiplication - performs dense matrix multiplication between the current input window and kernel weights + * (4) Counter housekeeping - keeps track of current pixel and stride + */ +template +void compute_output_buffer_2d( + const data_T &in_elem, + nnet::shift_reg + line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan], + data_window_T &kernel_window, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases, + int &pX, int &pY, int &sX, int &sY) { + + using res_T = typename ExtractPipeType::value_type; + + // Thresholds + constexpr int lShiftX = CONFIG_T::filt_width - 1; + constexpr int lShiftY = CONFIG_T::filt_height - 1; + + // Step 1 - Shift 
line buffer + [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan]; + nnet::shift_line_buffer_2d(in_elem, line_buffer, shift_buffer); + + // Step 2 - Kernel shift + nnet::kernel_shift_2d(shift_buffer, kernel_window); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > (lShiftY - 1) && pX > (lShiftX - 1)) { + // Step 3 - Dense matrix multiplication + [[intel::fpga_register]] res_T res_out; + dense_resource(kernel_window, res_out, weights, biases); + + // Write result to output stream + [[intel::fpga_register]] res_T res_pack; + CastLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_filt; channel++) { + res_pack[channel] = res_out[channel]; + } + res_pipe::write(res_pack); + } + + // Reached end of image + if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right) && + (pY + 1) == (CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom)) { + pX = 0; + sX = 0; + pY = 0; + sY = 0; + // Reached end of row + } else if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) { + pX = 0; + sX = 0; + pY++; + sY = ((sY - lShiftY) == 0) ? (sY - CONFIG_T::stride_height + 1) : (sY + 1); + // Same row, same column; move to the right + } else { + pX++; + sX = ((sX - lShiftX) == 0) ? (sX - CONFIG_T::stride_width + 1) : (sX + 1); + } +} + +template +void conv_2d_cl_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::bias_t biases) { + + using data_arr_T = typename ExtractPipeType::value_type; + using data_element_T = typename data_arr_T::value_type; + using data_window_T = array; + + // Line buffer and kernel window + [[intel::fpga_register]] nnet::shift_reg + line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan]; + [[intel::fpga_register]] data_window_T kernel_window; + + // An array of length CONFIG_T::n_chan, with elements set to zero (padding for each channel) + constexpr auto padds = zero_array(); + + // Move former static variables outside the function calls + // X position pixel + int pX = 0; + // Y position pixel + int pY = 0; + // X strides + int sX = 0; + // Y strides + int sY = 0; + +// Padding above input image +PaddingTopHeight: + [[intel::loop_coalesce(2)]] for (int row = 0; row < CONFIG_T::pad_top; row++) { + PaddingTopWidth: + for (int col = 0; col < CONFIG_T::pad_left + CONFIG_T::in_width + CONFIG_T::pad_right; col++) { + compute_output_buffer_2d(padds, line_buffer, kernel_window, + weights, biases, pX, pY, sX, sY); + } + } + +ReadInputHeight: + [[intel::loop_coalesce(2)]] for (int row = 0; row < CONFIG_T::in_height; row++) { + // Input image left-side padding + PaddingLeftWidth: + for (int col = 0; col < CONFIG_T::pad_left; col++) { + compute_output_buffer_2d(padds, line_buffer, kernel_window, + weights, biases, pX, pY, sX, sY); + } + + // Read input image + ReadInputWidth: + for (int col = 0; col < CONFIG_T::in_width; col++) { + compute_output_buffer_2d( + data_pipe::read(), line_buffer, kernel_window, weights, biases, pX, pY, sX, sY); + } + + // Input image right-side padding + PaddingRightWidth: + for (int col = 0; col < CONFIG_T::pad_right; col++) { + compute_output_buffer_2d(padds, line_buffer, kernel_window, + weights, biases, pX, pY, sX, sY); + } + } + +// Padding below input image +PaddingBottomHeight: + [[intel::loop_coalesce(2)]] for (int row = 0; row < CONFIG_T::pad_bottom; row++) { + PaddingBottomWidth: + for (int col = 0; col < CONFIG_T::pad_left + CONFIG_T::in_width + 
CONFIG_T::pad_right; col++) { + compute_output_buffer_2d(padds, line_buffer, kernel_window, + weights, biases, pX, pY, sX, sY); + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h new file mode 100644 index 0000000000..dc76189083 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h @@ -0,0 +1,164 @@ +#ifndef NNET_DENSE_LARGE_H_ +#define NNET_DENSE_LARGE_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +struct dense_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_out = 10; + + static const unsigned reuse_factor = 1; + static const unsigned block_factor = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, CONFIG_T::reuse_factor); + static const unsigned multiplier_limit = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, multfactor) + static const unsigned multiplier_factor = 1; // min n_in, rf + static const unsigned multiplier_scale = 1; // M_LIMIT/CONFIG_T::n_out; + static const unsigned reciprocal = 1; // 2^35 / 25 + static const unsigned rf_pad = 0; + static const unsigned bf_pad = 0; + // Resource reuse info + static const unsigned io_type = io_parallel; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? + + // Default multiplication + template using product = nnet::product::mult; +}; + +template +void dense_rf_gt(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && + "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor > CONFIG_T::n_in) && "This function is correct only for RF > N_IN"); + //#pragma ii CONFIG_T::reuse_factor + [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; +Load: + #pragma unroll + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + [[intel::fpga_register]] int out_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + [[intel::fpga_register]] int d_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + + #pragma unroll + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + uint32_t w_index = ir + CONFIG_T::reuse_factor * im; + out_index[ir][im] = (w_index / CONFIG_T::multiplier_factor); + d_index[ir][im] = w_index % CONFIG_T::n_in; + } + } +Product1: + [[intel::nofusion, intel::speculated_iterations(0)]] for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + [[intel::fpga_register]] typename CONFIG_T::accum_t tmp_acc[CONFIG_T::block_factor]; + Product2: + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + uint32_t w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; + if (w_index >= CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded) + continue; + int data_index = d_index[ir][im]; + // Modified this + tmp_acc[im] = + CONFIG_T::template product::product( + data[data_index], weights[w_index]); + } + [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit]; + ResetMult: + #pragma unroll + for 
(int imult = 0; imult < CONFIG_T::multiplier_limit; imult++) { + mult[imult] = 0; + } + AccumLoop1: + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + int o_index = out_index[ir][im]; + if (o_index >= CONFIG_T::n_out) + continue; // check out of bounds + mult[o_index] += tmp_acc[im]; + } + AccumLoop2: + #pragma unroll + for (int im = 0; im < CONFIG_T::multiplier_limit; im++) { + acc[im] += mult[im]; + } + } +Store: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + res[ires] = cast(acc[ires]); // acc[jj]; + } +} +template +void dense_rf_lt(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && + "The current Reuse Factor is not allowed"); + assert((CONFIG_T::multiplier_limit == CONFIG_T::block_factor) && "This function is correct only for RF <= N_IN"); + + [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; +InitAccum: + #pragma unroll + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } +ReuseLoop: + [[intel::nofusion, intel::speculated_iterations(0)]] for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::block_factor]; + MultLoop: + #pragma unroll + for (int im = 0, in_index = ir; im < CONFIG_T::block_factor; im++) { + uint32_t w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; + if (ir + CONFIG_T::reuse_factor * im >= CONFIG_T::n_in * CONFIG_T::n_out) + continue; + // Modified this + mult[im] = + CONFIG_T::template product::product( + data[in_index], weights[w_index]); + in_index += CONFIG_T::reuse_factor; + if (in_index >= CONFIG_T::n_in) + in_index = ir; + } + AccumLoop: + #pragma unroll + for (int im = 0, out_index = 0, acc_step = 0; im < CONFIG_T::block_factor; im++) { + acc[out_index] += mult[im]; + if (acc_step + 1 >= CONFIG_T::multiplier_scale) { + acc_step = 0; + out_index++; + } else { + acc_step++; + } + } + } +// Cast to "res_t" type +Result: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + res[ires] = cast(acc[ires]); + } +} +template +void dense_resource(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + dense_rf_lt(data, res, weights, biases); + } else { + dense_rf_gt(data, res, weights, biases); + } +} +} // namespace nnet +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h new file mode 100644 index 0000000000..92c9adc3bb --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h @@ -0,0 +1,23 @@ +#ifndef NNET_DENSE_STREAM_H_ +#define NNET_DENSE_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" +#include "nnet_types.h" + +namespace nnet { + +// Note: DataPack logic removed, at least in the initial version +template +void dense_resource_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::bias_t biases) { + + [[intel::fpga_register]] typename ExtractPipeType::value_type res; + [[intel::fpga_register]] auto data = data_pipe::read(); + dense_resource::value_type, typename ExtractPipeType::value_type, + CONFIG_T>(data, res, weights, biases); + res_pipe::write(res); +} + +} // namespace nnet + +#endif diff --git 
a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h new file mode 100644 index 0000000000..1188fe3ecc --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h @@ -0,0 +1,43 @@ +#ifndef NNET_EMBED_H_ +#define NNET_EMBED_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" + +namespace nnet { + +struct embed_config { + // Internal data type definitions + typedef float embeddings_t; + + // Default layer sizes, overwritten from the backend + static const unsigned n_in = 10; + static const unsigned n_out = 16; + static const unsigned vocab_size = 50; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; +}; + +template +void embedding(const data_T &data, res_T &res, const typename CONFIG_T::embeddings_t &embeddings) { + + /* + * Can store embeddings[] in a register, but a large multiplexer + * is created due to a non-constant access pattern + */ + +InputSequence: + #pragma unroll + [[intel::initiation_interval(CONFIG_T::reuse_factor)]] for (int j = 0; j < CONFIG_T::n_in; j++) { + DenseEmbedding: + #pragma unroll + for (int i = 0; i < CONFIG_T::n_out; i++) { + res[j * CONFIG_T::n_out + i] = embeddings[data[j].to_uint() * CONFIG_T::n_out + i]; + } + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h new file mode 100644 index 0000000000..0f2acb098c --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h @@ -0,0 +1,31 @@ +#ifndef NNET_EMBED_STREAM_H_ +#define NNET_EMBED_STREAM_H_ + +namespace nnet { + +template +void embedding_stream(typename CONFIG_T::embeddings_t embeddings) { + + using res_T = typename ExtractPipeType::value_type; + constexpr auto datasize = std::tuple_size::value_type>{}; + + auto in_data = data_pipe::read(); + +InputSequence: + [[intel::initiation_interval(CONFIG_T::reuse_factor)]] for (int j = 0; j < datasize; j++) { + + res_T res_pack; + + DenseEmbedding: + #pragma unroll + for (int i = 0; i < CONFIG_T::n_out; i++) { + res_pack[i] = embeddings[in_data[j] * CONFIG_T::n_out + i]; + } + + res_pipe::write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h new file mode 100644 index 0000000000..c7af2e7a68 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h @@ -0,0 +1,118 @@ +#ifndef NNET_HELPERS_H +#define NNET_HELPERS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nnet { + +template void convert_data(sycl::queue &q, srcType *src) { + constexpr auto dstTypeSize = std::tuple_size::value_type>{}; + for (size_t i = 0; i < SIZE / dstTypeSize; i++) { + typename ExtractPipeType::value_type ctype; + for (size_t j = 0; j < dstTypeSize; j++) { + ctype[j] = src[i * dstTypeSize + j]; + } + dest_pipe::write(q, ctype); + } +} + +template void convert_data_back(sycl::queue &q, dstType *dst) { + constexpr auto srcTypeSize = std::tuple_size::value_type>{}; + for (size_t i = 0; i < SIZE / srcTypeSize; i++) { + auto ctype = src_pipe::read(q); + for (size_t j = 0; j < srcTypeSize; j++) { + dst[i * srcTypeSize + j] = ctype[j].to_double(); + } + } +} + +extern bool trace_enabled; +extern std::map *trace_outputs; +extern size_t trace_type_size; + +// constexpr int ceillog2(int x) { return 
(x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } +// replace with template metaprogramming +template struct ceillog2 { + enum { val = 1 + ceillog2<((n + 1) / 2)>::val }; +}; + +template <> struct ceillog2<2> { + enum { val = 1 }; +}; + +template <> struct ceillog2<1> { + enum { val = 0 }; +}; + +// constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } +// replace with template metaprogramming +template struct floorlog2 { + enum { val = 1 + floorlog2<(n / 2)>::val }; +}; + +template <> struct floorlog2<1> { + enum { val = 0 }; +}; + +template <> struct floorlog2<0> { + enum { val = 0 }; +}; + +// constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); } +// replace with template metaprogramming +template struct pow2 { + enum { val = 2 * pow2<(n - 1)>::val }; +}; + +template <> struct pow2<0> { + enum { val = 1 }; +}; + +template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { + for (int i = 0; i < layer_size; i++) { + ptr[i] = static_cast(data[i].to_double()); + } +} + +// We don't want to include save_T in this function because it will be inserted into myproject.cpp +// so a workaround with element size is used +template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (int i = 0; i < layer_size; i++) { + out << data[i] << " "; // We don't care about precision in text files + } + out << std::endl; + out.close(); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h new file mode 100644 index 0000000000..550663b881 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h @@ -0,0 +1,232 @@ +#ifndef NNET_MERGE_H_ +#define NNET_MERGE_H_ + +#include "nnet_mult.h" + +namespace nnet { + +struct merge_config { + static const unsigned n_elem = 10; +}; + +struct dot_config { + static const unsigned n_in = 10; + static const unsigned n_out = 1; + + static const unsigned reuse_factor = 1; + + typedef float accum_t; + + template using product = nnet::product::mult; +}; + +struct concat_config { + static const unsigned n_elem1_0 = 10; + static const unsigned n_elem1_1 = 10; + static const unsigned n_elem1_2 = 10; + static const unsigned n_elem2_0 = 10; + static const unsigned n_elem2_1 = 10; + static const unsigned n_elem2_2 = 10; + + static const unsigned axis = -1; +}; + +template +void add(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast(data1[i] + data2[i]); + } +} + +template +void subtract(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast(data1[i] - 
data2[i]); + } +} + +template +void multiply(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast(data1[i] * data2[i]); + } +} + +template +void average(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast((data1[i] + data2[i]) / 2); + } +} + +template +void maximum(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast((data1[i] > data2[i]) ? data1[i] : data2[i]); + } +} + +template +void minimum(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast((data1[i] < data2[i]) ? data1[i] : data2[i]); + } +} + +template +void dot1d(const input1_T &data1, const input2_T &data2, res_T &res) { + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); + + [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; +Product: + #pragma unroll multiplier_limit + for (int i = 0; i < CONFIG_T::n_in; i++) { + mult[i] = CONFIG_T::template product::product( + data1[i], data2[i]); + } + + [[intel::fpga_register]] typename CONFIG_T::accum_t acc = 0; +Accum: + #pragma unroll + for (int i = 0; i < CONFIG_T::n_in; i++) { + acc += mult[i]; + } + + res[0] = static_cast(acc); +} + +template +void concatenate1d(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + res[CONFIG_T::n_elem1_0 + i] = static_cast(data2[i]); + } +} + +template +void concatenate2d_0(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; i++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + i] = static_cast(data2[i]); + } +} + +template +void concatenate2d_1(const input1_T &data1, const input2_T &data2, res_T &res) { + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + j] = + static_cast(data1[i * CONFIG_T::n_elem1_1 + j]); + } + + #pragma unroll + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + j] = + static_cast(data2[i * CONFIG_T::n_elem2_1 + j]); + } + } +} + +template +void concatenate2d(const input1_T &data1, const input2_T &data2, res_T &res) { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate3d_0(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; i++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + i] = + static_cast(data2[i]); + } +} + +template +void concatenate3d_1(const input1_T &data1, const input2_T &data2, res_T &res) 
{ + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { + int res_idx = + i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; + int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; + res[res_idx] = static_cast(data1[data_idx]); + } + } + + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem2_2; k++) { + int res_idx = i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + + (j + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + k; + int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k; + res[res_idx] = static_cast(data2[data_idx]); + } + } + } +} + +template +void concatenate3d_2(const input1_T &data1, const input2_T &data2, res_T &res) { + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { + int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k; + int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; + res[res_idx] = static_cast(data1[data_idx]); + } + + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem2_2; k++) { + int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k + CONFIG_T::n_elem1_2; + int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k; + res[res_idx] = static_cast(data2[data_idx]); + } + } + } +} + +template +void concatenate3d(const input1_T &data1, const input2_T &data2, res_T &res) { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h new file mode 100644 index 0000000000..60028ea52e --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h @@ -0,0 +1,359 @@ +#ifndef NNET_MERGE_STREAM_H_ +#define NNET_MERGE_STREAM_H_ + +namespace nnet { + +template void add_stream() { + // both inputs are the same size + constexpr auto inputSize = std::tuple_size::value_type>{}; + constexpr auto outputSize = std::tuple_size::value_type>{}; + +AddLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / inputSize; i++) { + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + AddPack: + #pragma unroll + for (int j = 0; j < outputSize; j++) { + out_data[j] = static_cast::value_type::value_type>(in_data1[j] + in_data2[j]); + } + + res_pipe::write(out_data); + } +} + +template void subtract_stream() { + // both inputs are the same size + constexpr auto inputSize = std::tuple_size::value_type>{}; + constexpr auto outputSize = std::tuple_size::value_type>{}; + +SubtractLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / inputSize; i++) { + 
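// Each loop iteration reads one vector from each input pipe and writes one result vector; + // initiation_interval(1) asks the compiler to schedule this loop at II = 1, i.e. one + // subtraction pack per cycle once the pipeline fills (a descriptive note, assuming the + // pipes can sustain that rate). +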
[[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + SubtractPack: + #pragma unroll + for (int j = 0; j < outputSize; j++) { + out_data[j] = static_cast::value_type::value_type>(in_data1[j] - in_data2[j]); + } + + res_pipe::write(out_data); + } +} + +template void multiply_stream() { + // both inputs are the same size + constexpr auto inputSize = std::tuple_size::value_type>{}; + constexpr auto outputSize = std::tuple_size::value_type>{}; + +MultLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / inputSize; i++) { + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + MultPack: + #pragma unroll + for (int j = 0; j < outputSize; j++) { + out_data[j] = static_cast::value_type::value_type>(in_data1[j] * in_data2[j]); + } + + res_pipe::write(out_data); + } +} + +template void average_stream() { + // both inputs are the same size + constexpr auto inputSize = std::tuple_size::value_type>{}; + constexpr auto outputSize = std::tuple_size::value_type>{}; + +AvgLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / inputSize; i++) { + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + AvgPack: + #pragma unroll + for (int j = 0; j < outputSize; j++) { + out_data[j] = static_cast::value_type::value_type>( + (in_data1[j] + in_data2[j]) / (typename ExtractPipeType::value_type::value_type)2); + } + + res_pipe::write(out_data); + } +} + +template void maximum_stream() { + // both inputs are the same size + constexpr auto inputSize = std::tuple_size::value_type>{}; + constexpr auto outputSize = std::tuple_size::value_type>{}; + +MaxLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / inputSize; i++) { + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + MaxPack: + #pragma unroll + for (int j = 0; j < outputSize; j++) { + out_data[j] = static_cast::value_type::value_type>( + (in_data1[j] > in_data2[j]) ? in_data1[j] : in_data2[j]); + } + + res_pipe::write(out_data); + } +} + +template void minimum_stream() { + // both inputs are the same size + constexpr auto inputSize = std::tuple_size::value_type>{}; + constexpr auto outputSize = std::tuple_size::value_type>{}; + +MinLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / inputSize; i++) { + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + MinPack: + #pragma unroll + for (int j = 0; j < outputSize; j++) { + out_data[j] = static_cast::value_type::value_type>( + (in_data1[j] < in_data2[j]) ? 
in_data1[j] : in_data2[j]); + } + + res_pipe::write(out_data); + } +} + +template void concatenate1d_stream() { + constexpr auto input1Size = std::tuple_size::value_type>{}; + constexpr auto input2Size = std::tuple_size::value_type>{}; + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + +ConcatLoop1: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem1_0 / input2Size; i++) { + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + ConcatPack1: + #pragma unroll + for (int j = 0; j < input1Size; j++) { + out_data[j + (i * input1Size)] = + static_cast::value_type::value_type>(in_data1[j]); + } + } + +ConcatLoop2: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem2_0 / input2Size; i++) { + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + ConcatPack2: + #pragma unroll + for (int j = 0; j < input2Size; j++) { + out_data[j + (i * input2Size) + (CONFIG_T::n_elem1_0)] = + static_cast::value_type::value_type>(in_data2[j]); + } + } + res_pipe::write(out_data); +} + +template void concatenate2d_0_stream() { + constexpr auto input1Size = std::tuple_size::value_type>{}; + constexpr auto input2Size = std::tuple_size::value_type>{}; + +ConcatLoopHeight1: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1Size; k++) { + out_data[k] = static_cast::value_type::value_type>(in_data1[k]); + } + + res_pipe::write(out_data); + } + +ConcatLoopHeight2: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2Size; k++) { + out_data[k] = static_cast::value_type::value_type>(in_data2[k]); + } + + res_pipe::write(out_data); + } +} + +template void concatenate2d_1_stream() { + constexpr auto input1Size = std::tuple_size::value_type>{}; + constexpr auto input2Size = std::tuple_size::value_type>{}; + +ConcatLoopHeight: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1Size; k++) { + out_data[k] = static_cast::value_type::value_type>(in_data1[k]); + } + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2Size; k++) { + out_data[input1Size + k] = static_cast::value_type::value_type>(in_data2[k]); + } + + res_pipe::write(out_data); + } +} + +template void concatenate2d_stream() { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1_stream(); + } else { + concatenate2d_0_stream(); + } +} + +template void concatenate3d_0_stream() { + constexpr auto input1Size = std::tuple_size::value_type>{}; + constexpr auto input2Size = std::tuple_size::value_type>{}; + +ConcatLoopHeight1: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] typename 
ExtractPipeType::value_type out_data; + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1Size; k++) { + out_data[k] = static_cast::value_type::value_type>(in_data1[k]); + } + + res_pipe::write(out_data); + } + } + +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + ConcatLoopWidth2: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2Size; k++) { + out_data[k] = static_cast::value_type::value_type>(in_data2[k]); + } + + res_pipe::write(out_data); + } + } +} + +template void concatenate3d_1_stream() { + constexpr auto input1Size = std::tuple_size::value_type>{}; + constexpr auto input2Size = std::tuple_size::value_type>{}; + +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1Size; k++) { + out_data[k] = static_cast::value_type::value_type>(in_data1[k]); + } + + res_pipe::write(out_data); + } + ConcatLoopWidth2: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2Size; k++) { + out_data[k] = static_cast::value_type::value_type>(in_data2[k]); + } + + res_pipe::write(out_data); + } + } +} + +template void concatenate3d_2_stream() { + constexpr auto input1Size = std::tuple_size::value_type>{}; + constexpr auto input2Size = std::tuple_size::value_type>{}; + +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1Size; k++) { + out_data[k] = static_cast::value_type::value_type>(in_data1[k]); + } + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2Size; k++) { + out_data[input1Size + k] = + static_cast::value_type::value_type>(in_data2[k]); + } + + res_pipe::write(out_data); + } + } +} + +template void concatenate3d_stream() { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2_stream(); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1_stream(); + } else { + concatenate3d_0_stream(); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h new file mode 100644 index 0000000000..c7dfc2d7c5 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h @@ -0,0 +1,113 @@ +#ifndef NNET_MULT_H_ +#define NNET_MULT_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include + +namespace nnet { + +// Different methods to perform the product of input and weight, depending on their types. 
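+// A layer config selects one of these classes at compile time through its product alias, e.g.
+// template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
+// (as in dense_config in nnet_dense.h); each multiply is then issued as
+// CONFIG_T::template product<x_T, w_T>::product(a, w). The dispatch is purely static: the
+// binary and ternary specialisations below reduce the multiply to sign selection, and
+// weight_exponential reduces it to a shift, so those weight encodings should not need
+// hardware multipliers.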
+namespace product { + +class Product { + public: + static void limit(unsigned multiplier_limit) {} +}; + +template class both_binary : public Product { + public: + inline static x_T product(x_T a, w_T w) { + // specialisation for 1-bit weights and incoming data + return a == w; + } +}; + +template class weight_binary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 1-bit weights, arbitrary data + if (w == 0) + return -a; + else + return a; + } +}; + +template class data_binary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-w) { + // Specialisation for 1-bit data, arbitrary weight + if (a == 0) + return -w; + else + return w; + } +}; + +template class weight_ternary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 2-bit weights, arbitrary data + if (w == 0) + return 0; + else if (w == -1) + return -a; + else + return a; // if(w == 1) + } +}; + +template class mult : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(a * w) { + // 'Normal' product + return a * w; + } + static void limit(unsigned multiplier_limit) { + // TODO: Implement for Quartus + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation > Vivado-only, replace with Intel HLS + // pragma + } +}; + +template class weight_exponential : public Product { + public: + using r_T = ac_fixed<2 * (w_T::second_type::width + x_T::width), (w_T::second_type::width + x_T::width), true>; + inline static r_T product(x_T a, w_T w) { + // Shift product for exponential weights + // Shift by the exponent. Negative weights shift right + r_T y = static_cast(a) << w.second; + + // Negate or not depending on weight sign + return w.first == 1 ? 
y : static_cast(-y); + } +}; +} // namespace product + +// TO-DO: These may need extra variants if ac_int types are used in more places +template +inline typename std::enable_if>::value && + std::is_same>::value, + ac_int>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast>(((x - CONFIG_T::n_in / 2) * 2).to_ac_int()); +} + +template +inline typename std::enable_if>::value && + !std::is_same>::value, + res_T>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast(x); +} + +template +inline typename std::enable_if<(!std::is_same>::value), res_T>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast(x); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h new file mode 100644 index 0000000000..e8e3d6509b --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h @@ -0,0 +1,104 @@ +#ifndef NNET_PADDING_H_ +#define NNET_PADDING_H_ + +namespace nnet { + +struct padding1d_config { + static const unsigned in_width = 10; + static const unsigned out_width = 10; + static const unsigned n_chan = 10; + + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template void zeropad1d_cl(const data_T &data, res_T &res) { + + auto resIter = res.begin(); + auto dataIter = data.cbegin(); + + for (int i = 0; i < CONFIG_T::pad_left; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(resIter++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(resIter++) = static_cast(*(dataIter++)); + } + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(resIter++) = 0; + } + } +} + +struct padding2d_config { + static const unsigned in_height = 10; + static const unsigned in_width = 10; + + static const unsigned out_height = 10; + static const unsigned out_width = 10; + + static const unsigned n_chan = 10; + + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template void zeropad2d_cl(const data_T &data, res_T &res) { + + auto resIter = res.begin(); + auto dataIter = data.cbegin(); + + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(resIter++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(resIter++) = 0; + } + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(resIter++) = static_cast(*(dataIter++)); + } + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(resIter++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(resIter++) = 0; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h new file mode 100644 index 0000000000..adb2efee29 --- /dev/null +++ 
b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h @@ -0,0 +1,81 @@ +#ifndef NNET_PADDING_STREAM_H_ +#define NNET_PADDING_STREAM_H_ + +namespace nnet { + +template inline void fill_zero() { + [[intel::fpga_register]] typename ExtractPipeType::value_type res_part; + #pragma unroll + for (int i = 0; i < CONFIG_T::n_chan; i++) { + res_part[i] = 0; + } + res_pipe::write(res_part); +} + +template inline void fill_data() { + [[intel::fpga_register]] auto data_part = data_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type res_part; + #pragma unroll + for (int i = 0; i < CONFIG_T::n_chan; i++) { + res_part[i] = data_part[i]; + } + res_pipe::write(res_part); +} + +template void zeropad1d_cl_stream() { +PadLeft: + for (int i = 0; i < CONFIG_T::pad_left; i++) { + fill_zero(); + } + +CopyMain: + for (int i = 0; i < CONFIG_T::in_width; i++) { + fill_data(); + } + +PadRight: + for (int i = 0; i < CONFIG_T::pad_right; i++) { + fill_zero(); + } +} + +template void zeropad2d_cl_stream() { +PadTop: + [[intel::loop_coalesce(2)]] for (int i = 0; i < CONFIG_T::pad_top; i++) { + PadTopWidth: + for (int j = 0; j < CONFIG_T::out_width; j++) { + fill_zero(); + } + } + +PadMain: + [[intel::loop_coalesce(2)]] for (int i = 0; i < CONFIG_T::in_height; i++) { + + PadLeft: + for (int j = 0; j < CONFIG_T::pad_left; j++) { + fill_zero(); + } + + CopyMain: + for (int j = 0; j < CONFIG_T::in_width; j++) { + fill_data(); + } + + PadRight: + for (int j = 0; j < CONFIG_T::pad_right; j++) { + fill_zero(); + } + } + +PadBottom: + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + PadBottomWidth: + for (int j = 0; j < CONFIG_T::out_width; j++) { + fill_zero(); + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h new file mode 100644 index 0000000000..d4ae915335 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h @@ -0,0 +1,257 @@ +#ifndef NNET_POOLING_H_ +#define NNET_POOLING_H_ + +#include "nnet_common.h" + +namespace nnet { + +// Returns the maximum value from an array of size N +template accum_t max(T x[N]) { + [[intel::fpga_register]] T y = x[0]; + + // Due to loop dependencies, pipelining & unrolling is not possible + // Explicitly disabling pipelining significantly reduces resource usage + [[intel::disable_loop_pipelining]] for (int i = 1; i < N; i++) { + if (x[i] > y) + y = x[i]; + } + + return y; +} + +// Returns the mean value of an array of size N +template accum_t avg(T x[N], unsigned length) { + [[intel::fpga_register]] accum_t y = 0; + + // Due to loop dependencies, pipelining & unrolling is not possible + // Explicitly disabling pipelining significantly reduces resource usage + [[intel::disable_loop_pipelining]] for (int i = 0; i < N; i++) { y += x[i]; } + + y /= length; + return y; +} + +// Enumeration for pooling functions +enum Pool_Op { Max, Average }; +template accum_t pool_op(T x[N], unsigned length) { + switch (op) { + case Max: + return max(x); + case Average: + return avg(x, length); + } +} + +template accum_t pool_op(T (&x)[N]) { + return pool_op(x, N); +} + +/* + * In TensorFlow, pooling ignores the values in the padded cells + * For Avg pooling, return 0 (the divisor is modified to the area overlapping the unpadded image.) + * For Max pooling, return the most negative value for the type. 
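+ * For example, assuming a signed ac_fixed type, the Max branch of pad_val() below zeroes the
+ * value and then sets its top (sign) bit, which yields the most negative representable number,
+ * so any real pixel wins the comparison; for an unsigned type the same bit pattern would be a
+ * large positive value, so a signed data type is assumed here.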
+ */ +template inline T pad_val() { + switch (op) { + case Max: { + T x = 0; + x[x.width - 1] = 1; + return x; + } + case Average: + return 0; + } +} + +struct pooling1d_config { + // Pooling parameters + static const unsigned pool_width = 2; + static const unsigned stride_width = 2; + + // I/O sizes + static const unsigned n_in = 10; + static const unsigned n_out = (n_in - pool_width) / stride_width + 1; + static const unsigned n_filt = 4; + + // Padding + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const bool count_pad = false; + + // Pooling function + static const Pool_Op pool_op = Max; +}; + +template void pooling1d_cl(const data_T &data, res_T &res) { + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; + +FiltLoop: + #pragma unroll + [[intel::disable_loop_pipelining]] for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { + InputWidthLoop: + #pragma unroll + [[intel::disable_loop_pipelining]] for (int inp_col = 0; inp_col < restricted_padded_width; + inp_col += CONFIG_T::stride_width) { + [[intel::fpga_register]] typename data_T::value_type pool[CONFIG_T::pool_width]; + + // Keep track of number of pixels in image vs padding region; needed for rescaling Average Pooling + [[intel::fpga_register]] unsigned img_overlap = 0; + + PoolWidthLoop: + #pragma unroll + [[intel::disable_loop_pipelining]] for (int pool_col = 0; pool_col < CONFIG_T::stride_width; pool_col++) { + if (inp_col + pool_col < CONFIG_T::pad_left || + inp_col + pool_col >= (full_padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[pool_col] = pad_val(); + if (CONFIG_T::count_pad) + img_overlap++; + } else { + // Current element is from input image + pool[pool_col] = data[(inp_col + pool_col - CONFIG_T::pad_left) * CONFIG_T::n_filt + filt]; + img_overlap++; + } + } + + // Pooling operation + res[(inp_col / CONFIG_T::stride_width) * CONFIG_T::n_filt + filt] = static_cast( + pool_op( + pool, img_overlap)); + } + } +} + +template void global_pooling1d_cl(const data_T &data, res_T &res) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + +FiltLoop: + #pragma unroll + [[intel::disable_loop_pipelining]] for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { + [[intel::fpga_register]] typename data_T::value_type pool[CONFIG_T::n_in]; + + InputWidthLoop: + #pragma unroll + [[intel::disable_loop_pipelining]] for (int col = 0; col < CONFIG_T::n_in; col++) { + pool[col] = data[col * CONFIG_T::n_filt + filt]; + } + + res[filt] = static_cast( + pool_op(pool)); + } +} + +struct pooling2d_config { + // Pooling parameters + static const unsigned stride_height = 2; + static const unsigned stride_width = 2; + static const unsigned pool_height = 2; + static const unsigned pool_width = 2; + + // I/O sizes + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_filt = 4; + + static const unsigned out_height = (in_height - pool_height) / stride_height + 1; + static const unsigned out_width = (in_width - pool_width) / stride_width + 1; + + // Padding + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const bool count_pad = false; 
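+ // count_pad == true counts padded cells in the average-pooling divisor (divide by the full
+ // window area); count_pad == false divides by the overlap with the real image only, e.g. a
+ // 2x2 average window sitting on a corner pixel divides by 1 rather than 4 (see img_overlap
+ // in pooling2d_cl below).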
+ + // Pooling function + static const Pool_Op pool_op = Max; +}; + +template void pooling2d_cl(const data_T &data, res_T &res) { + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; + static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; + +FiltLoop: + #pragma unroll + [[intel::disable_loop_pipelining]] for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { + InputHeightLoop: + #pragma unroll + [[intel::disable_loop_pipelining]] for (int inp_col = 0; inp_col < restricted_padded_height; + inp_col += CONFIG_T::stride_height) { + InputWidthLoop: + #pragma unroll + [[intel::disable_loop_pipelining]] for (int inp_width = 0; inp_width < restricted_padded_width; + inp_width += CONFIG_T::stride_width) { + [[intel::fpga_register]] typename data_T::value_type pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + + // Keep track of number of pixels in image vs padding region; needed for rescaling Average Pooling + [[intel::fpga_register]] unsigned img_overlap = 0; + + PoolHeightLoop: + #pragma unroll + [[intel::disable_loop_pipelining]] for (int pool_col = 0; pool_col < CONFIG_T::stride_height; pool_col++) { + PoolWidthLoop: + #pragma unroll + [[intel::disable_loop_pipelining]] for (int pool_row = 0; pool_row < CONFIG_T::stride_width; + pool_row++) { + if (inp_col + pool_col < CONFIG_T::pad_top || + inp_col + pool_col >= (full_padded_height - CONFIG_T::pad_bottom) || + inp_width + pool_row < CONFIG_T::pad_left || + inp_width + pool_row >= (full_padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[pool_col * CONFIG_T::stride_width + pool_row] = + pad_val(); + if (CONFIG_T::count_pad) + img_overlap++; + } else { + // Current element is from input image + pool[pool_col * CONFIG_T::stride_width + pool_row] = + data[(inp_col + pool_col - CONFIG_T::pad_top) * CONFIG_T::in_width * CONFIG_T::n_filt + + (inp_width + pool_row - CONFIG_T::pad_left) * CONFIG_T::n_filt + filt]; + img_overlap++; + } + } + } + + // Pooling operation + res[(inp_col / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + + (inp_width / CONFIG_T::stride_width) * CONFIG_T::n_filt + filt] = + static_cast( + pool_op(pool, img_overlap)); + } + } + } +} + +template void global_pooling2d_cl(const data_T &data, res_T &res) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height); + +FiltLoop: + #pragma unroll + [[intel::disable_loop_pipelining]] for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { + [[intel::fpga_register]] typename data_T::value_type pool[CONFIG_T::in_height * CONFIG_T::in_width]; + + InputLoop: + #pragma unroll + [[intel::disable_loop_pipelining]] for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width; i++) { + pool[i] = data[i * CONFIG_T::n_filt + filt]; + } + + res[filt] = static_cast( + pool_op(pool)); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h new 
file mode 100644 index 0000000000..9c30aab67d --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h @@ -0,0 +1,322 @@ +#ifndef NNET_POOLING_STREAM_H_ +#define NNET_POOLING_STREAM_H_ + +#include "nnet_conv1d_stream.h" +#include "nnet_conv2d_stream.h" +#include "nnet_pooling.h" +#include "nnet_types.h" + +namespace nnet { + +/* + * void compute_pool_buffer_1d(in_element, res_stream, line_buffer, kernel_window) + * + * Args: + * in_element - current elements from input image, data_T type is usually nnet::array, size of the array corresponds to the number of channels + * res_stream - output stream, passed by reference to allow direct writing + * line_buffer - chained array of shift registers, one for each row of the pool and channel + * kernel_window - array of values from the input currently being pooled + * + * Function executes 4 steps: + * (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last elements + * (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from the line buffer + * (3) Pooling - performs the required pooling operation on the current window + * (4) Counter housekeeping - keeps track of the current pixel position and stride + * + */ +template +void compute_pool_buffer_1d(const data_T &in_elem, + nnet::shift_reg line_buffer[CONFIG_T::n_filt], + data_window_T &kernel_window, int &pX, int &sX) { + + using res_T = typename ExtractPipeType::value_type; + + // Thresholds + constexpr int lShiftX = CONFIG_T::pool_width - 1; + + // Step 1 - Shift line buffer + [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::n_filt]; + nnet::shift_line_buffer_1d(in_elem, line_buffer, shift_buffer); + + // Step 2 - Kernel shift + nnet::kernel_shift_1d(shift_buffer, kernel_window); + + // Check to see if we have a full pool window + if ((sX - lShiftX) == 0 && pX > (lShiftX - 1)) { + [[intel::fpga_register]] res_T res_pack; + + FiltLoop: + #pragma unroll + for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { + [[intel::fpga_register]] typename data_T::value_type pool_window[CONFIG_T::pool_width]; + + // Retrieve data for current channel + PoolLoop: + #pragma unroll + for (int i = 0; i < CONFIG_T::pool_width; i++) { + pool_window[i] = kernel_window[i * CONFIG_T::n_filt + filter]; + } + + // Step 3 - Pooling + res_pack[filter] = static_cast( + pool_op( + pool_window)); + } + + // Write result to output stream + res_pipe::write(res_pack); + } + + // Reached end of image + if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) { + pX = 0; + sX = 0; + // Move to the right + } else { + pX++; + sX = ((sX - lShiftX) == 0) ? 
(sX - CONFIG_T::stride_width + 1) : (sX + 1); + } +} + +template void pooling1d_cl_stream() { + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + using data_arr_T = typename ExtractPipeType::value_type; + using data_element_T = typename data_arr_T::value_type; + using data_window_T = array; + + // Line buffer and kernel window + [[intel::fpga_register]] nnet::shift_reg line_buffer[CONFIG_T::n_filt]; + [[intel::fpga_register]] data_window_T kernel_window; + + // move former static variables outside the function calls + // X position pixel + int pX = 0; + // X strides + int sX = 0; + +// Read input image +ReadInputWidth: + for (int col = 0; col < CONFIG_T::in_width; col++) { + compute_pool_buffer_1d(data_pipe::read(), line_buffer, kernel_window, + pX, sX); + } +} + +/* + * void compute_pool_buffer_2d(in_element, res_stream, line_buffer, kernel_window) + * + * Args: + * in_element - current elements from input image, data_T type is usually nnet::array, size of the array corresponds to the number of channels + * res_stream - output stream, passed by reference to allow direct writing + * line_buffer - chained array of shift registers, one for each row of the pool and channel + * kernel_window - array of values from the input currently being pooled + * + * Function executes 4 steps: + * (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last elements + * (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from the line buffer + * (3) Pooling - performs the required pooling operation on the current window + * (4) Counter housekeeping - keeps track of the current pixel position and stride + * + */ +template +void compute_pool_buffer_2d(const data_T &in_elem, + nnet::shift_reg + line_buffer[CONFIG_T::pool_height - 1][CONFIG_T::n_filt], + data_window_T &kernel_window, int &pX, int &pY, int &sX, int &sY) { + + using res_T = typename ExtractPipeType::value_type; + + // Thresholds + static constexpr int lShiftX = CONFIG_T::pool_width - 1; + static constexpr int lShiftY = CONFIG_T::pool_height - 1; + + // Step 1 - Shift line buffer + [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::pool_height][CONFIG_T::n_filt]; + nnet::shift_line_buffer_2d(in_elem, line_buffer, shift_buffer); + + // Step 2 - Kernel shift + nnet::kernel_shift_2d(shift_buffer, kernel_window); + + // Check to see if we have a full pool window + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > (lShiftY - 1) && pX > (lShiftX - 1)) { + [[intel::fpga_register]] res_T res_pack; + + FiltLoop: + #pragma unroll + for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { + [[intel::fpga_register]] typename data_T::value_type pool_window[CONFIG_T::pool_height * CONFIG_T::pool_width]; + + // Retrieve data for current channel + PoolLoop: + #pragma unroll + for (int i = 0; i < CONFIG_T::pool_height * CONFIG_T::pool_width; i++) { + pool_window[i] = kernel_window[i * CONFIG_T::n_filt + filter]; + } + + // Step 3 - Pooling + res_pack[filter] = static_cast( + pool_op(pool_window)); + } + + // Write result to output stream + res_pipe::write(res_pack); + } + + // Reached end of image + if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right) && + (pY + 1) == (CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom)) { + pX = 0; + sX = 0; + pY = 0; + sY = 0; + // Reached end of row + } else if ((pX + 1) 
== (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) { + pX = 0; + sX = 0; + pY++; + sY = ((sY - lShiftY) == 0) ? (sY - CONFIG_T::stride_height + 1) : (sY + 1); + // Same row, same column, therefore, move to the right + } else { + pX++; + sX = ((sX - lShiftX) == 0) ? (sX - CONFIG_T::stride_width + 1) : (sX + 1); + } +} + +template void pooling2d_cl_stream() { + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + + using data_arr_T = typename ExtractPipeType::value_type; + using data_element_T = typename data_arr_T::value_type; + using data_window_T = array; + + // Line buffer and kernel window + [[intel::fpga_register]] nnet::shift_reg + line_buffer[MAX(CONFIG_T::pool_height - 1, 1)][CONFIG_T::n_filt]; + [[intel::fpga_register]] data_window_T kernel_window; + + // former static variables + // X, Y position pixels + int pX = 0; + int pY = 0; + + // X, Y strides + int sX = 0; + int sY = 0; + +ReadInputHeight: + [[intel::loop_coalesce(2)]] for (int row = 0; row < CONFIG_T::in_height; row++) { + // Read input image + ReadInputWidth: + for (int col = 0; col < CONFIG_T::in_width; col++) { + compute_pool_buffer_2d(data_pipe::read(), line_buffer, + kernel_window, pX, pY, sX, sY); + } + } +} + +/* + * A function used with Global Pooling + * Updates the output pooling value + * Max : Return the maximum between the previous maximum and current input + * Avg : Returns the cumulative sum + */ +template inline T_y reduce_global_pool(T_y y, T_x x) { + if (op == Max) { + return (x > y) ? (T_y)x : y; + } else { + return (T_y)(x + y); + } +} + +/* + * A function used with Global Pooling + * For every filter, it updates the value by summing the current input (Average) or updating the maximum value (Max) + */ +template void compute_global_pool(const data_T &in_elem, res_T &data_input) { + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_filt; i++) { + data_input[i] = reduce_global_pool( + data_input[i], in_elem[i]); + } +} + +template void global_pooling1d_cl_stream() { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + using data_T = typename ExtractPipeType::value_type; + using res_T = typename ExtractPipeType::value_type; + + using accum_arr_t = array; + + [[intel::fpga_register]] accum_arr_t data_input; + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_filt; i++) { + data_input[i] = pad_val(); + } + + for (int i = 0; i < CONFIG_T::n_in; i++) { + compute_global_pool(data_pipe::read(), data_input); + } + + [[intel::fpga_register]] res_T res_pack; + if (CONFIG_T::pool_op == Average) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_filt; i++) { + res_pack[i] = static_cast(data_input[i] / CONFIG_T::n_in); + } + } else { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_filt; i++) { + res_pack[i] = static_cast(data_input[i]); + } + } + + res_pipe::write(res_pack); +} + +template void global_pooling2d_cl_stream() { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + + using data_T = typename ExtractPipeType::value_type; + using res_T = typename ExtractPipeType::value_type; + + using accum_arr_t = array; + + [[intel::fpga_register]] accum_arr_t data_input; + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_filt; i++) { + data_input[i] = pad_val(); + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int 
j = 0; j < CONFIG_T::in_width; j++) { + compute_global_pool(data_pipe::read(), data_input); + } + } + + [[intel::fpga_register]] res_T res_pack; + if (CONFIG_T::pool_op == Average) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_filt; i++) { + res_pack[i] = + static_cast(data_input[i] / (CONFIG_T::in_width * CONFIG_T::in_height)); + } + } else { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_filt; i++) { + res_pack[i] = static_cast(data_input[i]); + } + } + + res_pipe::write(res_pack); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h new file mode 100644 index 0000000000..5fec90d1aa --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h @@ -0,0 +1,18 @@ +#ifndef NNET_PRINTF_H_ +#define NNET_PRINTF_H_ + +#ifdef __SYCL_DEVICE_ONLY__ +#define CL_CONSTANT __attribute__((opencl_constant)) +#else +#define CL_CONSTANT +#endif + +using namespace sycl; + +#define PRINTF(format, ...) \ + { \ + static const CL_CONSTANT char _format[] = format; \ + ext::oneapi::experimental::printf(_format, ##__VA_ARGS__); \ + } + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h new file mode 100644 index 0000000000..4c20f28d1c --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h @@ -0,0 +1,566 @@ +#ifndef NNET_RECURRENT_H_ +#define NNET_RECURRENT_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" +#include "nnet_recurrent_activation.h" + +namespace nnet { + +//---------------------- +// Utils +//---------------------- + +template +void multiply_W(const data_T &input, res_T &out, const weight_t &weight) { +MULTIPLY_W_LOOP_I: + #pragma unroll + for (int i = 0; i < N_OUT; i++) { + out[i] = 0; + + MULTIPLY_W_LOOP_J: + #pragma unroll + for (int j = 0; j < N_IN; j++) { + out[i] += input[j] * weight[i * N_IN + j]; + } + } +} + +template +void multiply_U(const data_T &input, res_T &out, const weight_t &weight) { +MULTIPLY_U_LOOP_I: + #pragma unroll + for (int i = 0; i < N_OUT; i++) { + out[i] = 0; + + MULTIPLY_U_LOOP_J: + #pragma unroll + for (int j = 0; j < N_OUT; j++) { + out[i] += input[j] * weight[i * N_OUT + j]; + } + } +} + +template +void add_bias(const data_T &inputs, res_T &out, const bias_t &bias) { +ADD_BIAS_LOOP: + #pragma unroll + for (int i = 0; i < N; i++) { + out[i] = inputs[i] + bias[i]; + } +} + +template +void multiply_vectors(const data1_T &in1, const data2_T &in2, res_T &out) { +MULTIPLY_VECT_LOOP: + #pragma unroll + for (int i = 0; i < N; i++) { + out[i] = in1[i] * in2[i]; + } +} + +template +void add_vectors(const data1_T &in1, const data2_T &in2, res_T &out) { +ADD_VECTOR_LOOP: + #pragma unroll + for (int i = 0; i < N; i++) { + out[i] = in1[i] + in2[i]; + } +} + +//---------------------- +// GRU +//---------------------- + +struct gru_config { + // Internal data type definitions + typedef float weight_t; + typedef float bias_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 1; + static const unsigned n_out = 1; + static const unsigned n_units = 1; + static const unsigned n_timesteps = 1; + static const unsigned n_outputs = 1; + static const bool return_sequences = false; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + + // Activation + template using activation_recr = 
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h
new file mode 100644
index 0000000000..4c20f28d1c
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h
@@ -0,0 +1,566 @@
+#ifndef NNET_RECURRENT_H_
+#define NNET_RECURRENT_H_
+
+#include "nnet_common.h"
+#include "nnet_dense.h"
+#include "nnet_recurrent_activation.h"
+
+namespace nnet {
+
+//----------------------
+// Utils
+//----------------------
+
+template <class data_T, class res_T, class weight_t, int N_IN, int N_OUT>
+void multiply_W(const data_T &input, res_T &out, const weight_t &weight) {
+MULTIPLY_W_LOOP_I:
+    #pragma unroll
+    for (int i = 0; i < N_OUT; i++) {
+        out[i] = 0;
+
+    MULTIPLY_W_LOOP_J:
+        #pragma unroll
+        for (int j = 0; j < N_IN; j++) {
+            out[i] += input[j] * weight[i * N_IN + j];
+        }
+    }
+}
+
+template <class data_T, class res_T, class weight_t, int N_OUT>
+void multiply_U(const data_T &input, res_T &out, const weight_t &weight) {
+MULTIPLY_U_LOOP_I:
+    #pragma unroll
+    for (int i = 0; i < N_OUT; i++) {
+        out[i] = 0;
+
+    MULTIPLY_U_LOOP_J:
+        #pragma unroll
+        for (int j = 0; j < N_OUT; j++) {
+            out[i] += input[j] * weight[i * N_OUT + j];
+        }
+    }
+}
+
+template <class data_T, class res_T, class bias_t, int N>
+void add_bias(const data_T &inputs, res_T &out, const bias_t &bias) {
+ADD_BIAS_LOOP:
+    #pragma unroll
+    for (int i = 0; i < N; i++) {
+        out[i] = inputs[i] + bias[i];
+    }
+}
+
+template <class data1_T, class data2_T, class res_T, int N>
+void multiply_vectors(const data1_T &in1, const data2_T &in2, res_T &out) {
+MULTIPLY_VECT_LOOP:
+    #pragma unroll
+    for (int i = 0; i < N; i++) {
+        out[i] = in1[i] * in2[i];
+    }
+}
+
+template <class data1_T, class data2_T, class res_T, int N>
+void add_vectors(const data1_T &in1, const data2_T &in2, res_T &out) {
+ADD_VECTOR_LOOP:
+    #pragma unroll
+    for (int i = 0; i < N; i++) {
+        out[i] = in1[i] + in2[i];
+    }
+}
+
+//----------------------
+// GRU
+//----------------------
+
+struct gru_config {
+    // Internal data type definitions
+    typedef float weight_t;
+    typedef float bias_t;
+    typedef float accum_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 1;
+    static const unsigned n_out = 1;
+    static const unsigned n_units = 1;
+    static const unsigned n_timesteps = 1;
+    static const unsigned n_outputs = 1;
+    static const bool return_sequences = false;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+
+    // Activation
+    template <class x_T, class y_T, class config_T> using activation_recr = nnet::activation::relu<x_T, y_T, config_T>;
+
+    template <class x_T, class y_T, class config_T> using activation = nnet::activation::relu<x_T, y_T, config_T>;
+};
+
+template <class data_T, class h_T, typename CONFIG_T>
+void gru_cell(const data_T &x, h_T &h, const typename CONFIG_T::weight_t &weights,
+              const typename CONFIG_T::recurrent_weight_t &recurrent_weights, const typename CONFIG_T::bias_t &bias,
+              const typename CONFIG_T::recurrent_bias_t &recurrent_bias) {
+    static constexpr int recurrent_unroll_factor = CONFIG_T::n_units / CONFIG_T::reuse_factor;
+
+    // A matrix containing the values of the matrix product between the input (x) and the weights (weights), for the
+    // update, reset and candidate state gates, for each of the units
+    using accum_array_T = array<typename CONFIG_T::accum_t, 3 * CONFIG_T::n_units>;
+
+    [[intel::fpga_register]] accum_array_T mat_mul_x_w;
+    nnet::dense_resource<data_T, accum_array_T, typename CONFIG_T::mult_config_x>(x, mat_mul_x_w, weights, bias);
+
+    // A matrix containing the values of the matrix product between the previous state (h) and the recurrent weights
+    // (recurrent_weights), for the update, reset and candidate state gates, for each of the units
+    [[intel::fpga_register]] accum_array_T mat_mul_h_wr;
+    nnet::dense_resource<h_T, accum_array_T, typename CONFIG_T::mult_config_h>(h, mat_mul_h_wr, recurrent_weights,
+                                                                               recurrent_bias);
+
+    // A vector containing both the values of z(t) and r(t) for every state
+    using z_activ_array_T = array<typename CONFIG_T::accum_t, 2 * CONFIG_T::n_units>;
+    [[intel::fpga_register]] z_activ_array_T z_r;
+
+    // Add the individual vectors from the multiplication of mat_mul_x_w = Wx*x(t) and mat_mul_h_wr = Wh*h(t-1)
+    // Unrolled fully, no DSPs used
+    #pragma unroll
+    for (int i = 0; i < (2 * CONFIG_T::n_units); i++) {
+        z_r[i] = mat_mul_x_w[i] + mat_mul_h_wr[i];
+    }
+
+    // Activation on z(t) and r(t)
+    [[intel::fpga_register]] z_activ_array_T z_r_act;
+    CONFIG_T::template activation_recr<z_activ_array_T, z_activ_array_T,
+                                       typename CONFIG_T::ACT_CONFIG_RECURRENT_T>::activation(z_r, z_r_act);
+
+    // A matrix containing the values of the Hadamard product between r(t) = z_r_act[n_units:2*n_units] and h(t-1) = h
+    using h_activ_array_T = array<typename CONFIG_T::accum_t, CONFIG_T::n_units>;
+    [[intel::fpga_register]] h_activ_array_T hadamard_r_h;
+    #pragma unroll recurrent_unroll_factor
+    for (int i = 0; i < (CONFIG_T::n_units); i++) {
+        hadamard_r_h[i] = z_r_act[i + CONFIG_T::n_units] * mat_mul_h_wr[i + 2 * CONFIG_T::n_units];
+    }
+
+    // The candidate state; X * W_{hx} + hadamard(r(t), h_(t-1)) * W_{hh} + b_{h}
+    [[intel::fpga_register]] h_activ_array_T h_cand;
+    // Addition - can unroll fully; no DSPs used here
+    #pragma unroll
+    for (int i = 0; i < (CONFIG_T::n_units); i++) {
+        h_cand[i] = mat_mul_x_w[i + 2 * CONFIG_T::n_units] + hadamard_r_h[i];
+    }
+
+    // Activation on the candidate state
+    [[intel::fpga_register]] h_activ_array_T h_cand_act;
+    CONFIG_T::template activation<h_activ_array_T, h_activ_array_T, typename CONFIG_T::ACT_CONFIG_T>::activation(
+        h_cand, h_cand_act);
+
+    // Update state
+    #pragma unroll recurrent_unroll_factor
+    for (int i = 0; i < (CONFIG_T::n_units); i++) {
+        h[i] = static_cast<typename h_T::value_type>(h_cand_act[i] * (1 - z_r_act[i]) + h[i] * z_r_act[i]);
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void gru(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+         const typename CONFIG_T::recurrent_weight_t &recurrent_weights, const typename CONFIG_T::bias_t &bias,
+         const typename CONFIG_T::recurrent_bias_t &recurrent_bias) {
+
+    using h_T = array<typename res_T::value_type, CONFIG_T::n_units>;
+    [[intel::fpga_register]] data_T x;
+    [[intel::fpga_register]] h_T h;
+
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_units; i++) {
+        h[i] = 0;
+    }
+
+    // Loop dependency - cannot pipeline
+    [[intel::disable_loop_pipelining]] for (int t = 0; t < CONFIG_T::n_timesteps; t++) {
+        // Get data at the current time step
+        #pragma unroll
+        for (int j = 0; j < CONFIG_T::n_in; j++) {
+            x[j] = data[j + t * CONFIG_T::n_in];
+        }
+
+        nnet::gru_cell<data_T, h_T, CONFIG_T>(x, h, weights, recurrent_weights, bias, recurrent_bias);
+
+        if (CONFIG_T::return_sequences) {
+            #pragma unroll
+            for (int i = 0; i < CONFIG_T::n_units; i++) {
+                res[CONFIG_T::n_units * t + i] = h[i];
+            }
+        }
+    }
+
+    if (!CONFIG_T::return_sequences) {
+        #pragma unroll
+        for (int i = 0; i < (CONFIG_T::n_units); i++) {
+            res[i] = h[i];
+        }
+    }
+}
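The gru_cell above computes the standard GRU update h(t) = (1 - z(t)) * h_cand(t) + z(t) * h(t-1), with the reset gate applied to the recurrent contribution before the candidate activation. A scalar float reference for a single unit, useful for checking the fixed-point kernel; it assumes sigmoid gates and a tanh candidate activation, which generated configs typically select (the relu defaults in gru_config are placeholders):

#include <cmath>

// Scalar reference for one GRU unit; the six pre-activation dot products
// correspond to the z/r/candidate thirds of mat_mul_x_w and mat_mul_h_wr.
float gru_unit_step(float x_w_z, float h_wr_z, float x_w_r, float h_wr_r,
                    float x_w_c, float h_wr_c, float h_prev) {
    auto sigmoid = [](float v) { return 1.0f / (1.0f + std::exp(-v)); };
    float z = sigmoid(x_w_z + h_wr_z);            // update gate
    float r = sigmoid(x_w_r + h_wr_r);            // reset gate
    float h_cand = std::tanh(x_w_c + r * h_wr_c); // hadamard_r_h feeds the candidate
    return h_cand * (1.0f - z) + h_prev * z;      // state update
}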
+
+//----------------------
+// SimpleRNN
+//----------------------
+
+struct simpleRNN_config {
+    // Internal data type definitions
+    typedef float weight_t;
+    typedef float bias_t;
+    typedef float accum_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 1;
+    static const unsigned n_out = 1;
+    static const unsigned n_outputs = 1;
+    static const unsigned n_timesteps = 1;
+    static const bool return_sequences = false;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+
+    // Activation
+    template <class x_T, class y_T, class config_T> using activation_recr = nnet::activation::relu<x_T, y_T, config_T>;
+
+    template <class x_T, class y_T, class config_T> using activation = nnet::activation::relu<x_T, y_T, config_T>;
+};
+
+template <class in_T, class h_T, typename CONFIG_T>
+void simple_rnn_cell(const in_T &inputs, h_T &hidden_state, h_T &hidden_state_o, const typename CONFIG_T::weight_t &kernel,
+                     const typename CONFIG_T::recurrent_weight_t &rec_kernel, const typename CONFIG_T::bias_t &bias) {
+
+    using accum_array_T = array<typename CONFIG_T::accum_t, CONFIG_T::n_out>;
+
+    // Weight multiplication
+    [[intel::fpga_register]] accum_array_T afterW;
+    multiply_W<in_T, accum_array_T, typename CONFIG_T::weight_t, CONFIG_T::n_in, CONFIG_T::n_out>(inputs, afterW, kernel);
+
+    // Bias addition
+    [[intel::fpga_register]] accum_array_T afterBias;
+    add_bias<accum_array_T, accum_array_T, typename CONFIG_T::bias_t, CONFIG_T::n_out>(afterW, afterBias, bias);
+
+    // Hidden state
+    [[intel::fpga_register]] accum_array_T hiddenCand;
+    multiply_U<h_T, accum_array_T, typename CONFIG_T::recurrent_weight_t, CONFIG_T::n_out>(hidden_state, hiddenCand,
+                                                                                           rec_kernel);
+
+    // Vector addition
+    [[intel::fpga_register]] accum_array_T afterAdd;
+    add_vectors<accum_array_T, accum_array_T, accum_array_T, CONFIG_T::n_out>(afterBias, hiddenCand, afterAdd);
+
+    // Activation
+    CONFIG_T::template activation<accum_array_T, h_T, typename CONFIG_T::ACT_CONFIG_T>::activation(afterAdd,
+                                                                                                   hidden_state_o);
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void simple_rnn(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &kernel,
+                const typename CONFIG_T::recurrent_weight_t &rec_kernel, const typename CONFIG_T::bias_t &bias) {
+
+    using in_T = array<typename data_T::value_type, CONFIG_T::n_in>;
+    using h_T = array<typename res_T::value_type, CONFIG_T::n_out>;
+
+    [[intel::fpga_register]] h_T hidden_state[CONFIG_T::n_timesteps + 1];
+    [[intel::fpga_register]] h_T hidden_state_temp;
+    [[intel::fpga_register]] h_T h;
+    [[intel::fpga_register]] in_T in;
+
+// Set the initial hidden state (output) to zero
+INIT_LOOP:
+    #pragma unroll
+    for (int x = 0; x < CONFIG_T::n_out; x++) {
+        hidden_state[0][x] = 0;
+    }
+
+    [[intel::disable_loop_pipelining]] for (int i = 0; i < CONFIG_T::n_timesteps; i++) {
+
+        // Data at the current time step
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_in; x++) {
+            in[x] = data[x + i * CONFIG_T::n_in];
+        }
+
+        // Hidden state at the current time step
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_out; x++) {
+            hidden_state_temp[x] = hidden_state[i][x];
+        }
+
+        // Do SimpleRNN
+        simple_rnn_cell<in_T, h_T, CONFIG_T>(in, hidden_state_temp, h, kernel, rec_kernel, bias);
+
+        // Write result
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_out; x++) {
+            hidden_state[i + 1][x] = h[x];
+        }
+    }
+
+    if (CONFIG_T::return_sequences == 0) {
+        // Output when return_sequences is false
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_out; x++) {
+            res[x] = hidden_state[CONFIG_T::n_timesteps][x];
+        }
+    } else {
+        // Output when return_sequences is true
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_timesteps; x++) {
+            #pragma unroll
+            for (int j = 0; j < CONFIG_T::n_out; j++) {
+                res[x * CONFIG_T::n_out + j] = hidden_state[x + 1][j];
+            }
+        }
+    }
+}
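User code can specialize these defaults by hiding members of simpleRNN_config. A sketch of a hand-written config that swaps the state activation to tanh; the sizes and the derived-struct approach are illustrative, since the backend normally emits a complete config instead:

struct my_rnn_config : nnet::simpleRNN_config {
    static const unsigned n_in = 8;
    static const unsigned n_out = 16;
    static const unsigned n_timesteps = 10;
    static const bool return_sequences = false;

    // Rebind the state activation; the signature matches the wrappers
    // in nnet_recurrent_activation.h.
    template <class x_T, class y_T, class config_T>
    using activation = nnet::activation::tanh<x_T, y_T, config_T>;
};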
+
+//----------------------
+// LSTM
+//----------------------
+
+struct lstm_config {
+    // Internal data type definitions
+    typedef float weight_t;
+    typedef float bias_t;
+    typedef float accum_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 1;
+    static const unsigned n_out = 1;
+    static const unsigned n_outputs = 1;
+
+    static const unsigned n_timesteps = 1;
+    static const bool return_sequences = false;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+
+    // Activation
+    template <class x_T, class y_T, class config_T> using activation_recr = nnet::activation::relu<x_T, y_T, config_T>;
+
+    template <class x_T, class y_T, class config_T> using activation = nnet::activation::relu<x_T, y_T, config_T>;
+};
+
+template <class in_T, class h_T, typename CONFIG_T>
+void lstm_cell(const in_T &inputs, h_T &hidden_state, h_T &hidden_state_o, h_T &cell_state, h_T &cell_state_o,
+               const typename CONFIG_T::weight_i_t &WI, const typename CONFIG_T::weight_f_t &WF,
+               const typename CONFIG_T::weight_c_t &WC, const typename CONFIG_T::weight_o_t &WO,
+               const typename CONFIG_T::recurrent_weight_i_t &RWI, const typename CONFIG_T::recurrent_weight_f_t &RWF,
+               const typename CONFIG_T::recurrent_weight_c_t &RWC, const typename CONFIG_T::recurrent_weight_o_t &RWO,
+               const typename CONFIG_T::bias_i_t &BI, const typename CONFIG_T::bias_f_t &BF,
+               const typename CONFIG_T::bias_c_t &BC, const typename CONFIG_T::bias_o_t &BO) {
+
+    using accum_array_T = array<typename CONFIG_T::accum_t, CONFIG_T::n_out>;
+
+    // Internals definitions
+    [[intel::fpga_register]] accum_array_T i_afterW;
+    [[intel::fpga_register]] accum_array_T i_afterBias;
+    [[intel::fpga_register]] accum_array_T c_afterW;
+    [[intel::fpga_register]] accum_array_T c_afterBias;
+    [[intel::fpga_register]] accum_array_T o_afterW;
+    [[intel::fpga_register]] accum_array_T o_afterBias;
+    [[intel::fpga_register]] accum_array_T f_afterW;
+    [[intel::fpga_register]] accum_array_T f_afterBias;
+
+    // Hidden state gate candidates, intermediate variables
+    [[intel::fpga_register]] accum_array_T i_hiddenCand;
+    [[intel::fpga_register]] accum_array_T f_hiddenCand;
+    [[intel::fpga_register]] accum_array_T c_hiddenCand;
+    [[intel::fpga_register]] accum_array_T o_hiddenCand;
+
+    // After addition, intermediate variables
+    [[intel::fpga_register]] accum_array_T i_afterAdd;
+    [[intel::fpga_register]] accum_array_T f_afterAdd;
+    [[intel::fpga_register]] accum_array_T c_afterAdd;
+    [[intel::fpga_register]] accum_array_T o_afterAdd;
+
+    // Gate outputs
+    [[intel::fpga_register]] accum_array_T gate_i;
+    [[intel::fpga_register]] accum_array_T gate_f;
+    [[intel::fpga_register]] accum_array_T gate_c;
+    [[intel::fpga_register]] accum_array_T gate_o;
+    [[intel::fpga_register]] accum_array_T gate_ic;
+    [[intel::fpga_register]] accum_array_T gate_forget;
+    [[intel::fpga_register]] accum_array_T h;
+
+    // Intermediate variables for the cell calculation
+    [[intel::fpga_register]] accum_array_T cell_act_multp;
+    [[intel::fpga_register]] accum_array_T cell_act_add;
+
+    //-----------Gate I Calculations
+    // Weight multiplication
+    multiply_W<in_T, accum_array_T, typename CONFIG_T::weight_i_t, CONFIG_T::n_in, CONFIG_T::n_out>(inputs, i_afterW, WI);
+
+    // Bias addition
+    add_bias<accum_array_T, accum_array_T, typename CONFIG_T::bias_i_t, CONFIG_T::n_out>(i_afterW, i_afterBias, BI);
+
+    // Hidden Candidate
+    multiply_U<h_T, accum_array_T, typename CONFIG_T::recurrent_weight_i_t, CONFIG_T::n_out>(hidden_state, i_hiddenCand,
+                                                                                             RWI);
+
+    // Vector addition
+    add_vectors<accum_array_T, accum_array_T, accum_array_T, CONFIG_T::n_out>(i_afterBias, i_hiddenCand, i_afterAdd);
+
+    // Activation
+    CONFIG_T::template activation_recr<accum_array_T, accum_array_T,
+                                       typename CONFIG_T::ACT_CONFIG_RECURRENT_T>::activation(i_afterAdd, gate_i);
+
+    //-----------Gate F Calculations
+    // Weight multiplication
+    multiply_W<in_T, accum_array_T, typename CONFIG_T::weight_f_t, CONFIG_T::n_in, CONFIG_T::n_out>(inputs, f_afterW, WF);
+
+    // Bias addition
+    add_bias<accum_array_T, accum_array_T, typename CONFIG_T::bias_f_t, CONFIG_T::n_out>(f_afterW, f_afterBias, BF);
+
+    // Hidden Candidate
+    multiply_U<h_T, accum_array_T, typename CONFIG_T::recurrent_weight_f_t, CONFIG_T::n_out>(hidden_state, f_hiddenCand,
+                                                                                             RWF);
+
+    // Vector addition
+    add_vectors<accum_array_T, accum_array_T, accum_array_T, CONFIG_T::n_out>(f_afterBias, f_hiddenCand, f_afterAdd);
+
+    // Activation
+    CONFIG_T::template activation_recr<accum_array_T, accum_array_T,
+                                       typename CONFIG_T::ACT_CONFIG_RECURRENT_T>::activation(f_afterAdd, gate_f);
+
+    //-----------Gate C Calculations
+    // Weight multiplication
+    multiply_W<in_T, accum_array_T, typename CONFIG_T::weight_c_t, CONFIG_T::n_in, CONFIG_T::n_out>(inputs, c_afterW, WC);
+
+    // Bias addition
+    add_bias<accum_array_T, accum_array_T, typename CONFIG_T::bias_c_t, CONFIG_T::n_out>(c_afterW, c_afterBias, BC);
+
+    // Hidden Candidate
+    multiply_U<h_T, accum_array_T, typename CONFIG_T::recurrent_weight_c_t, CONFIG_T::n_out>(hidden_state, c_hiddenCand,
+                                                                                             RWC);
+
+    // Vector addition
+    add_vectors<accum_array_T, accum_array_T, accum_array_T, CONFIG_T::n_out>(c_afterBias, c_hiddenCand, c_afterAdd);
+
+    // Activation
+    CONFIG_T::template activation<accum_array_T, accum_array_T, typename CONFIG_T::ACT_CONFIG_T>::activation(c_afterAdd,
+                                                                                                             gate_c);
+
+    //-----------Gate I and C Multiply
+    // Vector multiplication
+    multiply_vectors<accum_array_T, accum_array_T, accum_array_T, CONFIG_T::n_out>(gate_i, gate_c, gate_ic);
+
+    //-----------Gate O Calculations
+    // Weight multiplication
+    multiply_W<in_T, accum_array_T, typename CONFIG_T::weight_o_t, CONFIG_T::n_in, CONFIG_T::n_out>(inputs, o_afterW, WO);
+
+    // Bias addition
+    add_bias<accum_array_T, accum_array_T, typename CONFIG_T::bias_o_t, CONFIG_T::n_out>(o_afterW, o_afterBias, BO);
+
+    // Hidden Candidate
+    multiply_U<h_T, accum_array_T, typename CONFIG_T::recurrent_weight_o_t, CONFIG_T::n_out>(hidden_state, o_hiddenCand,
+                                                                                             RWO);
+
+    // Vector addition
+    add_vectors<accum_array_T, accum_array_T, accum_array_T, CONFIG_T::n_out>(o_afterBias, o_hiddenCand, o_afterAdd);
+
+    // Activation
+    CONFIG_T::template activation_recr<accum_array_T, accum_array_T,
+                                       typename CONFIG_T::ACT_CONFIG_RECURRENT_T>::activation(o_afterAdd, gate_o);
+
+    //-----------Cell State Calculation
+    // Vector multiplication
+    multiply_vectors<accum_array_T, h_T, accum_array_T, CONFIG_T::n_out>(gate_f, cell_state, cell_act_multp);
+
+    // Vector addition
+    add_vectors<accum_array_T, accum_array_T, accum_array_T, CONFIG_T::n_out>(gate_ic, cell_act_multp, cell_act_add);
+
+    //-----------Hidden State (Output) Calculation
+    // Activation on the new cell state
+    CONFIG_T::template activation<accum_array_T, accum_array_T, typename CONFIG_T::ACT_CONFIG_T>::activation(
+        cell_act_add, gate_forget);
+
+    // Vector multiplication
+    multiply_vectors<accum_array_T, accum_array_T, accum_array_T, CONFIG_T::n_out>(gate_o, gate_forget, h);
+
+OUTPUT_WRITE_LOOP:
+    #pragma unroll
+    for (int x = (CONFIG_T::n_out - 1); x >= 0; x--) {
+        hidden_state_o[x] = h[x];
+        cell_state_o[x] = cell_act_add[x];
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void lstm(const data_T &data, res_T &res, const typename CONFIG_T::weight_i_t &WI, const typename CONFIG_T::weight_f_t &WF,
+          const typename CONFIG_T::weight_c_t &WC, const typename CONFIG_T::weight_o_t &WO,
+          const typename CONFIG_T::recurrent_weight_i_t &RWI, const typename CONFIG_T::recurrent_weight_f_t &RWF,
+          const typename CONFIG_T::recurrent_weight_c_t &RWC, const typename CONFIG_T::recurrent_weight_o_t &RWO,
+          const typename CONFIG_T::bias_i_t &BI, const typename CONFIG_T::bias_f_t &BF,
+          const typename CONFIG_T::bias_c_t &BC, const typename CONFIG_T::bias_o_t &BO) {
+
+    // Note: currently this does not support a recurrent bias
+
+    using in_T = array<typename data_T::value_type, CONFIG_T::n_in>;
+    using h_T = array<typename res_T::value_type, CONFIG_T::n_out>;
+
+    [[intel::fpga_register]] h_T hidden_state[CONFIG_T::n_timesteps + 1];
+    [[intel::fpga_register]] h_T hidden_state_temp;
+    [[intel::fpga_register]] h_T cell_state[CONFIG_T::n_timesteps + 1];
+    [[intel::fpga_register]] h_T cell_state_temp;
+    [[intel::fpga_register]] h_T h;
+    [[intel::fpga_register]] h_T c;
+    [[intel::fpga_register]] in_T in;
+
+// Set the initial hidden state and cell state (output) to zero
+INIT_LOOP:
+    #pragma unroll
+    for (int x = 0; x < CONFIG_T::n_out; x++) {
+        hidden_state[0][x] = 0;
+        cell_state[0][x] = 0;
+    }
+
+    // Input dimension
+    [[intel::disable_loop_pipelining]] for (int i = 0; i < CONFIG_T::n_timesteps; i++) {
+        // Data at the current time step
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_in; x++) {
+            in[x] = data[x + i * CONFIG_T::n_in];
+        }
+
+        // Hidden state at the current time step
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_out; x++) {
+            hidden_state_temp[x] = hidden_state[i][x];
+            cell_state_temp[x] = cell_state[i][x];
+        }
+
+        // Do LSTM
+        lstm_cell<in_T, h_T, CONFIG_T>(in, hidden_state_temp, h, cell_state_temp, c, WI, WF, WC, WO, RWI, RWF, RWC, RWO,
+                                       BI, BF, BC, BO);
+
+        // Write result
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_out; x++) {
+            hidden_state[i + 1][x] = h[x];
+            cell_state[i + 1][x] = c[x];
+        }
+    }
+
+    if (CONFIG_T::return_sequences == 0) {
+        // Output when return_sequences is false
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_out; x++) {
+            res[x] = hidden_state[CONFIG_T::n_timesteps][x];
+        }
+    } else {
+        // Output when return_sequences is true
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_timesteps; x++) {
+            #pragma unroll
+            for (int j = 0; j < CONFIG_T::n_out; j++) {
+                res[x * CONFIG_T::n_out + j] = hidden_state[x + 1][j];
+            }
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
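As with the GRU, a scalar float reference helps validate lstm_cell. It assumes sigmoid on the i/f/o gates (activation_recr) and tanh on the candidate and on the new cell state (activation), which is what generated configs typically select:

#include <cmath>

// Scalar reference for one LSTM unit. The four pre-activations are the
// per-gate W*x + U*h + b sums computed by multiply_W/multiply_U/add_bias.
void lstm_unit_step(float i_pre, float f_pre, float c_pre, float o_pre,
                    float &h, float &c) {
    auto sigmoid = [](float v) { return 1.0f / (1.0f + std::exp(-v)); };
    float gate_i = sigmoid(i_pre);
    float gate_f = sigmoid(f_pre);
    float gate_c = std::tanh(c_pre);  // candidate
    float gate_o = sigmoid(o_pre);
    c = gate_f * c + gate_i * gate_c; // cell_act_add in the kernel
    h = gate_o * std::tanh(c);        // multiply_vectors(gate_o, gate_forget, h)
}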
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h
new file mode 100644
index 0000000000..893fd027c1
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h
@@ -0,0 +1,47 @@
+#ifndef NNET_RECR_ACTIVATION_H_
+#define NNET_RECR_ACTIVATION_H_
+
+#include "nnet_activation.h"
+#include "nnet_common.h"
+
+namespace nnet {
+
+namespace activation {
+
+template <class data_T, class res_T, typename CONFIG_T> class Activation {
+  public:
+    // *************************************************
+    //       Blank Activation
+    // *************************************************
+    static void activation(const data_T &data, res_T &res) {}
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class relu : public Activation<data_T, res_T, CONFIG_T> {
+  public:
+    // *************************************************
+    //       ReLU Activation
+    // *************************************************
+    static void activation(const data_T &data, res_T &res) { nnet::relu<data_T, res_T, CONFIG_T>(data, res); }
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class sigmoid : public Activation<data_T, res_T, CONFIG_T> {
+  public:
+    // *************************************************
+    //       Sigmoid Activation
+    // *************************************************
+    static void activation(const data_T &data, res_T &res) { nnet::sigmoid<data_T, res_T, CONFIG_T>(data, res); }
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class tanh : public Activation<data_T, res_T, CONFIG_T> {
+  public:
+    // *************************************************
+    //       TanH Activation
+    // *************************************************
+    static void activation(const data_T &data, res_T &res) { nnet::dense_tanh<data_T, res_T, CONFIG_T>(data, res); }
+};
+
+} // namespace activation
+
+} // namespace nnet
+
+#endif
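New recurrent activations follow the same wrapper pattern: derive from Activation and forward to the elementwise function. A sketch of a pass-through wrapper; nnet::linear is assumed to exist in nnet_activation.h with the usual (data, res) signature:

// Sketch: a "linear" (identity) wrapper in nnet::activation.
template <class data_T, class res_T, typename CONFIG_T> class linear : public Activation<data_T, res_T, CONFIG_T> {
  public:
    static void activation(const data_T &data, res_T &res) { nnet::linear<data_T, res_T, CONFIG_T>(data, res); }
};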
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h
new file mode 100644
index 0000000000..7429419cda
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h
@@ -0,0 +1,68 @@
+#ifndef NNET_RECURRENT_STREAM_H_
+#define NNET_RECURRENT_STREAM_H_
+
+#include "nnet_common.h"
+#include "nnet_dense.h"
+#include "nnet_recurrent_activation.h"
+
+namespace nnet {
+template <class data_pipe, class res_pipe, typename CONFIG_T>
+void gru_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::recurrent_weight_t recurrent_weights,
+                typename CONFIG_T::bias_t bias, typename CONFIG_T::recurrent_bias_t recurrent_bias) {
+
+    using data_T = typename ExtractPipeType<data_pipe>::value_type;
+    using res_T = typename ExtractPipeType<res_pipe>::value_type;
+    using h_T = array<typename res_T::value_type, CONFIG_T::n_units>;
+
+    constexpr auto datasize = std::tuple_size<data_T>{};
+    constexpr auto ressize = std::tuple_size<res_T>{};
+
+    [[intel::fpga_register]] h_T h;
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_units; i++) {
+        h[i] = 0;
+    }
+
+    [[intel::fpga_register]] data_T x;
+
+DataPropagation:
+    for (int i_in = 0; i_in < CONFIG_T::n_timesteps * CONFIG_T::n_in / datasize; i_in++) {
+        auto data_pack = data_pipe::read();
+
+    DataPack:
+        #pragma unroll
+        for (int i_pack = 0; i_pack < datasize; i_pack++) {
+            x[i_pack] = data_pack[i_pack];
+        }
+
+        nnet::gru_cell<data_T, h_T, CONFIG_T>(x, h, weights, recurrent_weights, bias, recurrent_bias);
+
+        if (CONFIG_T::return_sequences) {
+            res_T res_pack;
+
+        ResPackRetSeq:
+            #pragma unroll
+            for (int i_pack = 0; i_pack < ressize; i_pack++) {
+                res_pack[i_pack] = h[i_pack];
+            }
+
+            res_pipe::write(res_pack);
+        }
+    }
+
+    if (!CONFIG_T::return_sequences) {
+        res_T res_pack;
+
+    ResPackNoRetSeq:
+        #pragma unroll
+        for (int i_pack = 0; i_pack < ressize; i_pack++) {
+            res_pack[i_pack] = h[i_pack];
+        }
+
+        res_pipe::write(res_pack);
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h
new file mode 100644
index 0000000000..c461e337da
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h
@@ -0,0 +1,36 @@
+#ifndef NNET_IMAGE_H_
+#define NNET_IMAGE_H_
+
+namespace nnet {
+
+struct resize_config {
+    static const unsigned height = 10;
+    static const unsigned width = 10;
+
+    static const unsigned new_height = 10;
+    static const unsigned new_width = 10;
+
+    static const unsigned n_chan = 10;
+};
+
+template <class data_T, class res_T, typename CONFIG_T> void resize_nearest(const data_T &image, res_T &resized) {
+    int y_ratio = (int)((CONFIG_T::height << 16) / CONFIG_T::new_height) + 1;
+    int x_ratio = (int)((CONFIG_T::width << 16) / CONFIG_T::new_width) + 1;
+
+    for (int i = 0; i < CONFIG_T::new_height; i++) {
+        for (int j = 0; j < CONFIG_T::new_width; j++) {
+            int x = ((j * x_ratio) >> 16);
+            int y = ((i * y_ratio) >> 16);
+
+            #pragma unroll
+            for (int k = 0; k < CONFIG_T::n_chan; k++) {
+                resized[(i * CONFIG_T::new_width * CONFIG_T::n_chan) + j * CONFIG_T::n_chan + k] =
+                    image[(y * CONFIG_T::width * CONFIG_T::n_chan) + x * CONFIG_T::n_chan + k];
+            }
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h
new file mode 100644
index 0000000000..9a37f098e4
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h
@@ -0,0 +1,58 @@
+#ifndef NNET_IMAGE_STREAM_H_
+#define NNET_IMAGE_STREAM_H_
+
+#include "nnet_common.h"
+
+namespace nnet {
+
+template <class data_pipe, class res_pipe, typename CONFIG_T> void resize_nearest_stream() {
+    assert(CONFIG_T::new_height % CONFIG_T::height == 0);
+    assert(CONFIG_T::new_width % CONFIG_T::width == 0);
+
+    using data_T = typename ExtractPipeType<data_pipe>::value_type;
+
+    constexpr unsigned ratio_height = CONFIG_T::new_height / CONFIG_T::height;
+    constexpr unsigned ratio_width = CONFIG_T::new_width / CONFIG_T::width;
+
+ImageHeight:
+    for (unsigned h = 0; h < CONFIG_T::height; h++) {
+        [[intel::fpga_register]] data_T data_in_row[CONFIG_T::width];
+
+    ImageWidth:
+        for (unsigned i = 0; i < CONFIG_T::width; i++) {
+            [[intel::fpga_register]] auto in_data = data_pipe::read();
+
+        ImageChan:
+            #pragma unroll
+            for (unsigned j = 0; j < CONFIG_T::n_chan; j++) {
+                data_in_row[i][j] = in_data[j];
+            }
+        }
+
+    ResizeHeight:
+        for (unsigned i = 0; i < ratio_height; i++) {
+
+        ImageWidth2:
+            for (unsigned l = 0; l < CONFIG_T::width; l++) {
+
+            ResizeWidth:
+                for (unsigned j = 0; j < ratio_width; j++) {
+
+                    [[intel::fpga_register]] data_T out_data;
+
+                ResizeChan:
+                    #pragma unroll
+                    for (unsigned k = 0; k < CONFIG_T::n_chan; k++) {
+                        out_data[k] = data_in_row[l][k];
+                    }
+
+                    res_pipe::write(out_data);
+                }
+            }
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
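resize_nearest maps each output pixel to a source pixel using a 16.16 fixed-point ratio; the +1 bias keeps the scaled index from undershooting after truncation. A small host-side check of that index math for a 10 to 25 upscale (standalone sanity test, not firmware):

#include <cstdio>

int main() {
    const int width = 10, new_width = 25;
    const int x_ratio = ((width << 16) / new_width) + 1; // same formula as resize_nearest
    for (int j = 0; j < new_width; j++) {
        std::printf("out %2d <- in %d\n", j, (j * x_ratio) >> 16);
    }
    return 0;
}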
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h
new file mode 100644
index 0000000000..6e5e86a581
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h
@@ -0,0 +1,126 @@
+#ifndef NNET_CLONE_H
+#define NNET_CLONE_H
+
+#include "nnet_common.h"
+
+namespace nnet {
+
+struct broadcast_config {
+    static const unsigned in_height = 10;
+    static const unsigned in_width = 10;
+    static const unsigned n_chan = 1;
+    static const unsigned n_dupl = 2;
+};
+
+template <class data_pipe, class res1_pipe, class res2_pipe, int N> void clone_stream() {
+    using data_T = typename ExtractPipeType<data_pipe>::value_type;
+    using res1_T = typename ExtractPipeType<res1_pipe>::value_type;
+    using res2_T = typename ExtractPipeType<res2_pipe>::value_type;
+    constexpr auto datasize = std::tuple_size<data_T>{};
+CloneLoop:
+    [[intel::initiation_interval(1)]] for (int i = 0; i < N / datasize; i++) {
+        data_T in_data = data_pipe::read();
+        res1_T out_data1;
+        res2_T out_data2;
+
+    ClonePack:
+        #pragma unroll
+        for (int j = 0; j < datasize; j++) {
+            out_data1[j] = in_data[j];
+            out_data2[j] = in_data[j];
+        }
+
+        res1_pipe::write(out_data1);
+        res2_pipe::write(out_data2);
+    }
+}
+
+template <class data_pipe, class res1_pipe, class res2_pipe, class res3_pipe, int N> void clone_stream() {
+    using data_T = typename ExtractPipeType<data_pipe>::value_type;
+    using res1_T = typename ExtractPipeType<res1_pipe>::value_type;
+    using res2_T = typename ExtractPipeType<res2_pipe>::value_type;
+    using res3_T = typename ExtractPipeType<res3_pipe>::value_type;
+    constexpr auto datasize = std::tuple_size<data_T>{};
+CloneLoop:
+    [[intel::initiation_interval(1)]] for (int i = 0; i < N / datasize; i++) {
+        data_T in_data = data_pipe::read();
+        res1_T out_data1;
+        res2_T out_data2;
+        res3_T out_data3;
+
+    ClonePack:
+        #pragma unroll
+        for (int j = 0; j < datasize; j++) {
+            out_data1[j] = in_data[j];
+            out_data2[j] = in_data[j];
+            out_data3[j] = in_data[j];
+        }
+
+        res1_pipe::write(out_data1);
+        res2_pipe::write(out_data2);
+        res3_pipe::write(out_data3);
+    }
+}
+
+template <class data_pipe, class res_pipe, int N> void repack_stream() {
+    using data_T = typename ExtractPipeType<data_pipe>::value_type;
+    using res_T = typename ExtractPipeType<res_pipe>::value_type;
+    constexpr auto datasize = std::tuple_size<data_T>{};
+    constexpr auto ressize = std::tuple_size<res_T>{};
+
+    if constexpr (datasize == ressize) {
+        [[intel::initiation_interval(1)]] for (int i = 0; i < N / datasize; i++) {
+
+            [[intel::fpga_memory]] auto in_data = data_pipe::read();
+            [[intel::fpga_memory]] res_T out_data;
+
+            #pragma unroll
+            for (int j = 0; j < datasize; j++) {
+                out_data[j] = in_data[j];
+            }
+
+            res_pipe::write(out_data);
+        }
+    } else if constexpr (datasize > ressize) {
+        constexpr unsigned pack_diff = datasize / ressize;
+
+        for (int i = 0; i < N / datasize; i++) {
+
+            [[intel::fpga_memory]] auto in_data = data_pipe::read();
+            [[intel::fpga_memory]] res_T out_data;
+
+            [[intel::initiation_interval(1)]] for (int j = 0; j < pack_diff; j++) {
+
+                #pragma unroll
+                for (int k = 0; k < ressize; k++) {
+                    out_data[k] = in_data[j * ressize + k];
+                }
+                res_pipe::write(out_data);
+            }
+        }
+    } else { // datasize < ressize
+        [[intel::fpga_memory]] res_T out_data;
+        constexpr unsigned pack_diff = ressize / datasize;
+        unsigned pack_cnt = 0;
+        [[intel::initiation_interval(1)]] for (int i = 0; i < N / datasize; i++) {
+
+            [[intel::fpga_memory]] auto in_data = data_pipe::read();
+
+            #pragma unroll
+            for (int j = 0; j < datasize; j++) {
+                out_data[pack_cnt * datasize + j] = in_data[j];
+            }
+
+            if (pack_cnt == pack_diff - 1) {
+                res_pipe::write(out_data);
+                pack_cnt = 0;
+            } else {
+                pack_cnt++;
+            }
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
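repack_stream preserves element order while changing the pack width. A host-side analogue of the datasize > ressize branch, shown only to document the packing order (the concrete types and sizes are illustrative):

#include <array>
#include <vector>

// Splits one 4-wide beat into two 2-wide beats, mirroring pack_diff = 2.
std::vector<std::array<int, 2>> repack_4_to_2(const std::array<int, 4> &in) {
    std::vector<std::array<int, 2>> out;
    for (int j = 0; j < 2; j++) {
        out.push_back({in[j * 2 + 0], in[j * 2 + 1]});
    }
    return out; // {a0, a1, a2, a3} -> {a0, a1}, {a2, a3}
}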
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h
new file mode 100644
index 0000000000..2c4991a13b
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h
@@ -0,0 +1,48 @@
+#ifndef NNET_TRANSPOSE_H_
+#define NNET_TRANSPOSE_H_
+
+namespace nnet {
+
+struct transpose_config {
+    static const unsigned height = 10;
+    static const unsigned width = 10;
+    static const unsigned depth = 10;
+    static constexpr unsigned perm[3] = {2, 0, 1};
+};
+
+template <class data_T, class res_T, typename CONFIG_T> void transpose_2d(const data_T &data, res_T &res) {
+    for (int i = 0; i < CONFIG_T::height; i++) {
+        #pragma unroll
+        for (int j = 0; j < CONFIG_T::width; j++) {
+            res[j * CONFIG_T::height + i] = static_cast<typename res_T::value_type>(data[i * CONFIG_T::width + j]);
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T> void transpose_3d(const data_T &data, res_T &res) {
+    static constexpr unsigned dim_data[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width};
+    static constexpr unsigned dim_res[3] = {dim_data[CONFIG_T::perm[0]], dim_data[CONFIG_T::perm[1]],
+                                            dim_data[CONFIG_T::perm[2]]};
+
+    int index_data[3] = {0}, index_res[3] = {0};
+
+    for (index_data[0] = 0; index_data[0] < dim_data[0]; index_data[0]++) {
+        #pragma unroll
+        for (index_data[1] = 0; index_data[1] < dim_data[1]; index_data[1]++) {
+            #pragma unroll
+            for (index_data[2] = 0; index_data[2] < dim_data[2]; index_data[2]++) {
+                index_res[0] = index_data[CONFIG_T::perm[0]];
+                index_res[1] = index_data[CONFIG_T::perm[1]];
+                index_res[2] = index_data[CONFIG_T::perm[2]];
+
+                res[index_res[0] * dim_res[1] * dim_res[2] + index_res[1] * dim_res[2] + index_res[2]] =
+                    static_cast<typename res_T::value_type>(
+                        data[index_data[0] * dim_data[1] * dim_data[2] + index_data[1] * dim_data[2] + index_data[2]]);
+            }
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h
new file mode 100644
index 0000000000..e15f63c139
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h
@@ -0,0 +1,39 @@
+#ifndef NNET_TRANSPOSE_STREAM_H_
+#define NNET_TRANSPOSE_STREAM_H_
+
+namespace nnet {
+
+template <class data_pipe, class res_pipe, typename CONFIG_T> void transpose_2d_stream() {
+
+    using data_T = typename ExtractPipeType<data_pipe>::value_type;
+    using res_T = typename ExtractPipeType<res_pipe>::value_type;
+
+    constexpr auto data_size = std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{};
+    constexpr auto res_size = std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{};
+
+    [[intel::fpga_register]] typename data_T::value_type data_array[CONFIG_T::height * CONFIG_T::width];
+
+    for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / data_size; i++) {
+        [[intel::fpga_register]] data_T in_data = data_pipe::read();
+
+        #pragma unroll
+        for (int j = 0; j < data_size; j++) {
+            data_array[i * data_size + j] = typename data_T::value_type(in_data[j]);
+        }
+    }
+
+    for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / res_size; i++) {
+        [[intel::fpga_register]] res_T out_data;
+
+        #pragma unroll
+        for (int j = 0; j < res_size; j++) {
+            out_data[j] = typename res_T::value_type(data_array[j * data_size + i]);
+        }
+
+        res_pipe::write(out_data);
+    }
+}
+
+} // namespace nnet
+
+#endif
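In transpose_3d, output axis i takes its index (and extent) from input axis perm[i]. A shape sketch under that convention; the derived-struct style is illustrative, since generated configs are normally emitted whole:

struct my_t3d_config : nnet::transpose_config {
    static const unsigned depth = 2, height = 3, width = 4; // input dims {2, 3, 4}
    static constexpr unsigned perm[3] = {2, 0, 1};          // output dims {4, 2, 3}
};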
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h
new file mode 100644
index 0000000000..8cf883c1d5
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h
@@ -0,0 +1,71 @@
+#ifndef NNET_TYPES_H_
+#define NNET_TYPES_H_
+
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <iostream>
+
+namespace nnet {
+
+// Define the array type that flows through the pipes
+template <typename T, std::size_t N> using array = std::array<T, N>;
+
+// T should be an array
+template <typename T> constexpr T zero_array() {
+    T ar;
+    #pragma unroll
+    for (auto &a : ar) {
+        a = 0;
+    }
+    return ar;
+}
+
+// This is a helper to extract the value_type of a pipe
+template <class T> struct ExtractPipeType { typedef T value_type; };
+
+template