// -*- Metal -*-
//===-- MetalTensorOpsMatMul2dImpl ----------------------------------------===//
// Copyright (c) 2025 Apple Inc. All rights reserved
//===----------------------------------------------------------------------===//
//
// Forward declarations for the matmul2d tensor-ops implementation.
// Every `__tensorops_impl_*` symbol below carries EXTERNALLY_DEFINED_ATTR
// (section "air.externally_defined"), i.e. it is resolved by the Metal
// compiler/runtime rather than defined in this header.

#ifndef __MetalTensorOpsMatMul2dImpl__
#define __MetalTensorOpsMatMul2dImpl__

#if defined(__METAL_VERSION__) && defined(__HAVE_TENSOR__)

// NOTE(review): the namespace is spelled "__mutmul2d_detail" (not
// "__matmul2d_detail"). Looks like a typo, but renaming would break any code
// that refers to the qualified name — confirm toolchain-wide before changing.
namespace __mutmul2d_detail
{

#ifndef EXTERNALLY_DEFINED_ATTR
#define EXTERNALLY_DEFINED_ATTR                                                \
  __attribute__((section("air.externally_defined")))
#endif

#define TENSOROPS_EXPORT [[gnu::visibility("default")]]
#define INLINE __attribute__((__always_inline__))

// Local aliases for the public descriptor / reduction-operation types.
using __matmul2d_descriptor = matmul2d_descriptor;
using __reduction_operation = reduction_operation;

// Identifies which matmul operand a cooperative-tensor call operates on.
enum class __matmul2d_cooperative_operand_index
{
  left,
  right,
  destination,
};

// Field-wise equality of two matmul2d descriptors: shape (m, n, k), both
// transpose flags, relaxed-precision flag, and matmul mode.
// constexpr so it can participate in compile-time dispatch.
constexpr bool matmul2d_descriptor_is_equal(matmul2d_descriptor a,
                                            matmul2d_descriptor b)
{
  return a.m == b.m && a.n == b.n && a.k == b.k &&
         a.transpose_left == b.transpose_left &&
         a.transpose_right == b.transpose_right &&
         a.relaxed_precision == b.relaxed_precision &&
         a.matmul_mode == b.matmul_mode;
}

//===-- Cooperative-tensor introspection (per matmul operand) -------------===//
// The trailing __tensor_ops_datatype triples are presumably the
// left/right/destination element types — TODO confirm against the
// implementation.

extern "C" EXTERNALLY_DEFINED_ATTR size_t
__tensorops_impl_matmul2d_op_cooperative_tensor_data_size(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor descriptor,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR uint16_t
__tensorops_impl_matmul2d_op_cooperative_tensor_num_elements(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor descriptor,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR thread void *
__tensorops_impl_matmul2d_op_cooperative_tensor_get_element_pointer(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor descriptor,
    __tensor_ops_detail::__thread_void_t, uint16_t,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype);

extern "C" EXTERNALLY_DEFINED_ATTR thread uint16_t
__tensorops_impl_matmul2d_op_cooperative_tensor_get_element_index(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor descriptor,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_get_coordinate(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor descriptor,
    __tensor_ops_detail::__const_thread_void_t, uint16_t,
    __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__tensor_ops_datatype, int,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_init(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR bool
__tensorops_impl_matmul2d_op_cooperative_tensor_is_valid_element(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__const_thread_void_t, uint16_t,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

// Copy between two cooperative tensors; takes both descriptors and six
// datatype parameters (presumably left/right/destination for each side —
// TODO confirm).
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_copy(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __matmul2d_descriptor, __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR bool
__tensorops_impl_matmul2d_op_cooperative_tensor_is_compatible_as_input(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

//===-- Cooperative-tensor loads (per element type) -----------------------===//
// One load entry point per (address space, element type) pair; "dv"/"tg"
// presumably denote device vs. threadgroup source memory — TODO confirm.
// All share one signature: (operand, descriptor, dst cooperative storage,
// src tensor, src tensor-descriptor type, int, three datatypes, int).

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_load_dv_f16(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_load_tg_f16(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_load_dv_i32(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_load_tg_i32(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_load_dv_i8(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_load_tg_i8(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_load_dv_ui8(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_load_tg_ui8(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_load_dv_f32(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_load_tg_f32(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_load_dv_b16(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_load_tg_b16(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

//===-- Cooperative-tensor stores (per element type) ----------------------===//
// Mirror of the loads; note i8/ui8 variants exist here and for loads, but not
// for the reduction-destination load/store group further below.

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_store_dv_f16(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_store_tg_f16(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_store_dv_i32(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_store_tg_i32(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_store_dv_i8(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_store_tg_i8(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_store_dv_ui8(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_store_tg_ui8(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_store_dv_f32(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_store_tg_f32(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_store_dv_b16(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_tensor_store_tg_b16(
    __matmul2d_cooperative_operand_index, __matmul2d_descriptor,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int threads);

//===-- Reduction-destination cooperative tensor --------------------------===//
// Same introspection surface as above, but for the destination of a matmul
// followed by a reduction (no operand-index parameter; an extra int appears
// in most signatures — purpose not visible here, TODO confirm).

extern "C" EXTERNALLY_DEFINED_ATTR size_t
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_data_size(
    __matmul2d_descriptor, int, __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR uint16_t
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_num_elements(
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t, int,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR thread void *
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_get_element_pointer(
    __matmul2d_descriptor, __tensor_ops_detail::__thread_void_t, uint16_t,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype);

extern "C" EXTERNALLY_DEFINED_ATTR thread uint16_t
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_get_element_index(
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_get_coordinate(
    __matmul2d_descriptor, int, __tensor_ops_detail::__const_thread_void_t,
    uint16_t, __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__tensor_ops_datatype, int,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_init(
    __tensor_ops_detail::__thread_void_t, __matmul2d_descriptor, int,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR bool
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_is_valid_element(
    __matmul2d_descriptor descriptor,
    __tensor_ops_detail::__const_thread_void_t, int, uint16_t,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype, int);

extern "C" EXTERNALLY_DEFINED_ATTR uint16_t
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_map_index(
    __tensor_ops_detail::__const_thread_void_t, __matmul2d_descriptor,
    __tensor_ops_detail::__const_thread_void_t, __matmul2d_descriptor, int,
    int, uint16_t, __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype);

extern "C" EXTERNALLY_DEFINED_ATTR bool
__tensorops_impl_matmul2d_op_cooperative_destination_is_iterator_compatible(
    __matmul2d_descriptor, __matmul2d_descriptor,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype);

//===-- Reduction-destination loads/stores (f16 / i32 / f32 / b16 only) ---===//

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_dv_f16(
    __matmul2d_descriptor, __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int, int,
    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
    __tensor_ops_detail::__tensor_ops_datatype rightDataType);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_tg_f16(
    __matmul2d_descriptor, __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int, int,
    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
    __tensor_ops_detail::__tensor_ops_datatype rightDataType);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_dv_i32(
    __matmul2d_descriptor, __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int, int,
    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
    __tensor_ops_detail::__tensor_ops_datatype rightDataType);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_tg_i32(
    __matmul2d_descriptor, __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int, int,
    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
    __tensor_ops_detail::__tensor_ops_datatype rightDataType);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_dv_f32(
    __matmul2d_descriptor, __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int, int,
    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
    __tensor_ops_detail::__tensor_ops_datatype rightDataType);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_tg_f32(
    __matmul2d_descriptor, __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int, int,
    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
    __tensor_ops_detail::__tensor_ops_datatype rightDataType);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_dv_b16(
    __matmul2d_descriptor, __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int, int,
    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
    __tensor_ops_detail::__tensor_ops_datatype rightDataType);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_tg_b16(
    __matmul2d_descriptor, __tensor_ops_detail::__thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int, int,
    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
    __tensor_ops_detail::__tensor_ops_datatype rightDataType);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_dv_f16(
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int, int,
    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
    __tensor_ops_detail::__tensor_ops_datatype rightDataType);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_tg_f16(
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int, int,
    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
    __tensor_ops_detail::__tensor_ops_datatype rightDataType);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_dv_i32(
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int, int,
    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
    __tensor_ops_detail::__tensor_ops_datatype rightDataType);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_tg_i32(
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int, int,
    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
    __tensor_ops_detail::__tensor_ops_datatype rightDataType);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_dv_f32(
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int, int,
    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
    __tensor_ops_detail::__tensor_ops_datatype rightDataType);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_tg_f32(
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int, int,
    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
    __tensor_ops_detail::__tensor_ops_datatype rightDataType);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_dv_b16(
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int, int,
    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
    __tensor_ops_detail::__tensor_ops_datatype rightDataType);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_tg_b16(
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type, int, int,
    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
    __tensor_ops_detail::__tensor_ops_datatype rightDataType);

//===-- Row/column reductions over the cooperative destination ------------===//
// One entry point per accumulator scalar type (half/float/int/bfloat),
// parameterized by the __reduction_operation to apply.

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_f16(
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__thread_void_t, half, __reduction_operation,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_f32(
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__thread_void_t, float, __reduction_operation,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_i32(
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__thread_void_t, int, __reduction_operation,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_b16(
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__thread_void_t, bfloat, __reduction_operation,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_f16(
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__thread_void_t, half, __reduction_operation,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_f32(
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__thread_void_t, float, __reduction_operation,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_i32(
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__thread_void_t, int, __reduction_operation,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_b16(
    __matmul2d_descriptor, __tensor_ops_detail::__const_thread_void_t,
    __tensor_ops_detail::__thread_void_t, bfloat, __reduction_operation,
    __tensor_ops_detail::__tensor_ops_datatype,
    __tensor_ops_detail::__tensor_ops_datatype);

//===-- matmul2d run entry points (f16) -----------------------------------===//
// Naming: run[_cooperative]_<left>_<right>_<dest>.  "dv"/"tg" operands carry
// a tensor-descriptor-type parameter; a bare "f16" operand has no descriptor
// parameter and is presumably a cooperative tensor — TODO confirm.

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f16(
    thread matmul2d_descriptor & desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f16(
    thread matmul2d_descriptor & desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f16_f16(
    thread matmul2d_descriptor & desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f16(
    thread matmul2d_descriptor & desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f16(
    thread matmul2d_descriptor & desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f16_f16(
    thread matmul2d_descriptor & desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_dv_f16(
    thread matmul2d_descriptor & desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right, thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_tg_f16(
    thread matmul2d_descriptor & desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right, thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_f16(
    thread matmul2d_descriptor & desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right, thread void *destination, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f16(
    thread matmul2d_descriptor & desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f16(
    thread matmul2d_descriptor & desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f16_f16(
    thread matmul2d_descriptor & desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f16(
    thread matmul2d_descriptor & desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f16(
    thread matmul2d_descriptor & desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f16_f16(
    thread matmul2d_descriptor & desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_dv_f16(
    thread matmul2d_descriptor & desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right, thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_tg_f16(
    thread matmul2d_descriptor & desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right, thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_f16(
    thread matmul2d_descriptor & desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right, thread void *destination, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_dv_f16(
    thread matmul2d_descriptor & desc, thread void *left, thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_tg_f16(
    thread matmul2d_descriptor & desc, thread void *left, thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_f16(
    thread matmul2d_descriptor & desc, thread void *left, thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_dv_f16(
    thread matmul2d_descriptor & desc, thread void *left, thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_tg_f16(
    thread matmul2d_descriptor & desc, thread void *left, thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_f16(
    thread matmul2d_descriptor & desc, thread void *left, thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_f16_f16_dv_f16(
    thread matmul2d_descriptor & desc, thread void *left, thread void *right,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_f16_f16_tg_f16(
    thread matmul2d_descriptor & desc, thread void *left, thread void *right,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

// (The next declaration continues past this chunk of the file.)
extern "C" EXTERNALLY_DEFINED_ATTR void
// Final f16/f16 fully-cooperative variant (no descriptor parameters), followed by the
// f16-left x i8-right combinations (f16 destination) in the same
// dv / tg / descriptor-less-cooperative pattern as the group above.
__tensorops_impl_matmul2d_op_run_cooperative_f16_f16_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i8_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_tg_f16(thread matmul2d_descriptor & desc, thread void *left,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i8_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i8_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i8_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_f16(thread matmul2d_descriptor & desc, thread
void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void
// f16-left x ui8-right combinations (f16 destination), same dv / tg /
// descriptor-less-cooperative pattern; the i8-left x f16-right group begins at the
// very end of this run.
__tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_ui8_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_ui8_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_tg_f16(thread
matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_ui8_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_ui8_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left,
// i8-left x f16-right combinations (f16 destination), same dv / tg /
// descriptor-less-cooperative pattern; the ui8-left x f16-right group begins at the end
// of this run and continues past this point in the file.
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread
void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination,
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" 
EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); 
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f32(thread matmul2d_descriptor & 
desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_dv_f32(thread 
matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread 
void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR 
void __tensorops_impl_matmul2d_op_run_tg_f16_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); 
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_f32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_f32(thread 
matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); 
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, 
thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int 
threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
// Externally-defined (air.externally_defined) matmul2d "run" entry points, one per
// combination of operand placement and element type. Naming convention, as evidenced
// by the parameter lists below:
//   __tensorops_impl_matmul2d_op_run[_cooperative]_<left>_<right>_<destination>
// where each operand segment is [dv_|tg_]<elemtype> (elemtype in f32/f16/i8/ui8 here).
// An operand that carries a dv_/tg_ placement prefix in the name also takes a matching
// __tensor_ops_tensor_descriptor_type parameter (leftDescType/rightDescType/
// destinationDescType); an operand with no placement prefix (only in the
// "_cooperative_" variants) takes just its thread void* pointer and no descriptor-type
// parameter — consistent with that operand being a cooperative tensor.
// NOTE(review): dv/tg presumably abbreviate device/threadgroup address spaces — the
// prefixes are not defined in this chunk; confirm against the Metal tensor-ops spec.
// All variants take the matmul2d_descriptor by thread reference plus a trailing
// `int threads` execution-width argument. Bodies are supplied by the Metal runtime;
// these declarations only pin the ABI, so their signatures must not be altered.
__tensorops_impl_matmul2d_op_run_tg_f32_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void 
*destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f32_f32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, 
thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_f32_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_i8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void 
*destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_i8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_i8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_tg_f32_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_i8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_i8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_i8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_i8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_i8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_i8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_dv_f32_tg_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void 
*right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_ui8_f32(thread matmul2d_descriptor & desc, thread void 
*left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR 
void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_ui8_f32(thread matmul2d_descriptor & desc, thread void 
*left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern 
"C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); 
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread 
void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, 
// --- matmul2d run variants: ui8 left, f16 right, f32 destination ---------
// Externally-defined entry points; same visible parameter scheme (dv_/tg_
// operands carry a __tensor_ops_tensor_descriptor_type, "cooperative"
// operands do not). The i8/f32 group starts near the end of this span.
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void 
*right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, 
// --- matmul2d run variants: i8 left, f32 right, f32 destination ----------
// Externally-defined entry points; dv_/tg_ operands carry a
// __tensor_ops_tensor_descriptor_type parameter, "cooperative" operands do
// not. The ui8/f32 group starts near the end of this span.
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" 
EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern 
"C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f32_tg_f32(thread matmul2d_descriptor & desc, 
// --- matmul2d run variants: ui8 left, f32 right, f32 destination ---------
// Externally-defined entry points; dv_/tg_ operands carry a
// __tensor_ops_tensor_descriptor_type parameter, "cooperative" operands do
// not. The final declaration in this span continues in the next section of
// the file.
thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f32_dv_f32(thread 
matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread 
void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_i8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_i8_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_i8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_i8_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_i8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_i8_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_i8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_i8_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_i8_dv_i32(thread 
matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_i8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_i8_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_i8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_i8_i32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i8_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i8_i32(thread matmul2d_descriptor 
& desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_i8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_i8_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_ui8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_ui8_i32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_ui8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_ui8_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_ui8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_ui8_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_ui8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_ui8_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_ui8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_ui8_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_ui8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_ui8_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); 
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui8_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui8_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, 
thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_ui8_tg_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_ui8_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int 
threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
// ---------------------------------------------------------------------------
// Forward declarations of the matmul2d "run" entry points. All of these are
// EXTERNALLY_DEFINED_ATTR (section "air.externally_defined"): the bodies live
// in the runtime/compiler, and these exact symbol names and signatures form
// the ABI contract — do not rename or reorder parameters.
//
// Naming scheme observable in the declarations below:
//   __tensorops_impl_matmul2d_op_run[_cooperative]_<L>_<R>_<D>
// where each operand slot <L>/<R>/<D> is either "<space>_<type>" or a bare
// "<type>":
//   * space prefix "dv" / "tg": presumably the device vs. threadgroup
//     address space of that operand's tensor — TODO confirm against the
//     runtime implementation.
//   * type suffix "b16" / "f32" / "i8": the operand's element type
//     (b16 is presumably a 16-bit float type such as bfloat16 — confirm).
//   * a slot with NO space prefix appears only in "_cooperative_" variants:
//     that operand is passed as a cooperative tensor, and — consistently in
//     every variant below — its matching `...DescType` parameter is omitted
//     from the signature.
//
// Common parameter shape: the matmul2d descriptor (by thread reference), the
// left / right / destination operand pointers (each followed by its
// __tensor_ops_tensor_descriptor_type tag when the operand is an addressed,
// non-cooperative tensor), and a trailing `int threads` count.
//
// NOTE(review): the first declaration below is continued from the previous
// chunk of this file, and the last one continues past it.
// ---------------------------------------------------------------------------
__tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void 
*destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_f32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, 
thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_b16_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread 
void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f32_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread 
void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f32_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void 
*destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f32_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_b16(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_b16(thread matmul2d_descriptor & desc, thread void *left, 
thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR 
void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); 
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void 
*right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" 
EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern 
"C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread 
void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_dv_f32(thread matmul2d_descriptor & 
desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR 
void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); 
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_f16(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_f16(thread 
matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); 
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, 
thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
// ---------------------------------------------------------------------------
// Forward declarations of externally-defined matmul2d kernel entry points.
// These symbols are resolved at AIR link time (see EXTERNALLY_DEFINED_ATTR /
// the "air.externally_defined" section above); no definition exists in this
// header. Every declaration follows the same shape:
//
//   void __tensorops_impl_matmul2d_op_run[_cooperative]_<variant>(
//       thread matmul2d_descriptor &desc,     // operation descriptor
//       thread void *left, [leftDescType,]    // operand pointer, plus a
//       thread void *right, [rightDescType,]  //   descriptor-type tag only
//       thread void *destination,             //   for operands that carry a
//       [destinationDescType,]                //   tensor descriptor
//       int threads);                         // participating thread count
//
// NOTE(review): the <variant> suffix tokens (e.g. dv_f16_tg_b16_dv_f32)
// appear to encode, per operand, an address space (dv / tg — presumably
// device / threadgroup) and an element type (f16 / b16 / f32 / ui8); an
// operand token with no address-space prefix corresponds to a
// cooperative-tensor operand, which is also exactly the operand for which no
// descriptor-type parameter is passed. This reading is inferred from the
// naming pattern only — confirm against the implementation.
// ---------------------------------------------------------------------------
rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int 
threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_b16_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_b16_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void 
*destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_b16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_b16_f16(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, 
thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_b16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_b16_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_f16_dv_b16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_b16_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_b16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_b16_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_f16_b16_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread 
void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread 
void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void 
*destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_ui8_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR 
void __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_ui8_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_ui8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_ui8_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread 
void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_ui8_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_ui8_b16(thread matmul2d_descriptor & desc, thread void 
*left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_ui8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_ui8_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR 
void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui8_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui8_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_ui8_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_ui8_b16(thread matmul2d_descriptor & desc, thread void 
*left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); 
// ---------------------------------------------------------------------------
// Externally-defined (runtime-provided) entry points for the 2-D matmul "run"
// primitives.  Each symbol name encodes the left / right / destination
// operand configuration, in that order:
//   * an operand token with a dv_ or tg_ prefix comes with an explicit
//     __tensor_ops_tensor_descriptor_type parameter (dv/tg presumably select
//     device vs. threadgroup storage -- TODO confirm against the runtime);
//   * in *_cooperative_* symbols, the operand WITHOUT a dv_/tg_ prefix is the
//     cooperative-tensor operand and carries no descriptor parameter (this
//     pairing of "missing prefix" with "missing descriptor argument" is
//     consistent across every declaration below).
// The type tokens (b16, ui8, f32) look like bfloat16 / uint8 / float32
// element types -- NOTE(review): inferred from naming, verify.
// `threads` is an int execution-width argument passed through unchanged;
// its exact semantics are defined by the external implementation.
// ---------------------------------------------------------------------------

// --- b16 left x ui8 right -> f32 destination (continues the group started
// --- above this section) ---------------------------------------------------
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_ui8_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_ui8_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);

// --- ui8 left x b16 right -> b16 destination -------------------------------
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_b16_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_b16_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_b16_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_b16_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_b16_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);

// --- ui8 left x b16 right -> f32 destination -------------------------------
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_b16_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_b16_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_b16_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i4_tg_f16(thread matmul2d_descriptor & 
// ---------------------------------------------------------------------------
// NOTE(review): The declarations below are generated boilerplate enumerating
// every supported matmul2d run variant. Symbol naming scheme, decoded from
// the parameter lists that follow:
//
//   __tensorops_impl_matmul2d_op_run[_cooperative]_<L>_<lty>_<R>_<rty>[_<D>]_<dty>
//
//   * <lty>/<rty>/<dty> are the element types of the left/right/destination
//     operands (f16, f32, i8, ui8, i4, ui4, i32 in this section).
//   * <L>, <R>, <D> are "dv" or "tg" — presumably the Metal address space of
//     that operand's backing tensor (device vs threadgroup; TODO confirm
//     against the implementation). Each operand carrying such a tag is
//     accompanied by a __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
//     argument describing its tensor descriptor.
//   * A missing address-space tag (which occurs only in "_cooperative_"
//     variants) means that operand is passed as a cooperative tensor: note
//     its descriptor-type argument is omitted from the parameter list.
//   * Every variant takes the matmul2d_descriptor by thread-space reference,
//     type-erased (thread void *) operand pointers, and a trailing
//     "int threads" execution-width argument.
// ---------------------------------------------------------------------------
desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i4_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i4_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i4_dv_f16(thread matmul2d_descriptor 
& desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i4_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i4_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i4_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i4_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, 
thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" 
EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i4_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i4_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_ui4_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_ui4_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void 
*right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_ui4_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_ui4_f16(thread matmul2d_descriptor & desc, thread void 
*left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui4_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui4_dv_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui4_tg_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR 
void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui4_f16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui4_tg_f32(thread 
matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_tg_f16_tg_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_i4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_i4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_i4_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_i4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_i4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_i4_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_i4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_i4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_i4_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_i4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_i4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_i4_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, 
thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i4_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i4_i32(thread 
matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_ui4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_ui4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_ui4_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_ui4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_ui4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_ui4_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_ui4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_ui4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_ui4_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_ui4_dv_i32(thread matmul2d_descriptor & desc, thread void 
*left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_ui4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_ui4_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui4_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui4_dv_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui4_tg_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui4_i32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" 
EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i4_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i4_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern 
"C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i4_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i4_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); 
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i4_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i4_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, 
thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_ui4_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, 
thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_ui4_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_ui4_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_ui4_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui4_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui4_dv_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui4_tg_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui4_b16(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i4_f32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i4_dv_f32(thread matmul2d_descriptor & desc, thread 
void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i4_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i4_f32(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui4_dv_f32(thread 
matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" 
EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui4_tg_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_ui4_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui4_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads); 
// ---------------------------------------------------------------------------
// Link-time-resolved entry points for matmul2d `run` implementations.
// EXTERNALLY_DEFINED_ATTR (see top of file) marks them "air.externally_defined"
// so the concrete kernels are supplied when the pipeline is linked.
// Suffix scheme observed across these declarations:
//   <left>_<right>_<destination>, each operand tagged "<space>_<elemtype>";
//   space tags are presumably th = thread, tg = threadgroup, dv = device
//   (consistent with the `thread` qualifiers used here — TODO confirm), and
//   elemtype is the element type (f16, f32, i8, ui8, ui4, b16).
// Cooperative variants drop the descriptor-type argument for operands held in
// a cooperative tensor and take the participating thread count instead.
// ---------------------------------------------------------------------------

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_tg_b16_tg_ui4_tg_f32(
    thread matmul2d_descriptor &desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_ui4_f32(
    thread matmul2d_descriptor &desc, thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui4_dv_f32(
    thread matmul2d_descriptor &desc, thread void *left, thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui4_tg_f32(
    thread matmul2d_descriptor &desc, thread void *left, thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui4_f32(
    thread matmul2d_descriptor &desc, thread void *left, thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination, int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui4_dv_f32(
    thread matmul2d_descriptor &desc, thread void *left, thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui4_tg_f32(
    thread matmul2d_descriptor &desc, thread void *left, thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
        destinationDescType,
    int threads);

extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui4_f32(
    thread matmul2d_descriptor &desc, thread void *left, thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination, int threads);

// Single-thread variants all share one signature: each operand is an opaque
// pointer plus a runtime tensor-descriptor type tag. Factored through a
// block-local macro (removed immediately below) so each external symbol is
// spelled exactly once.
#define __MM2D_DECLARE_ST_RUN(NAME)                                           \
  extern "C" EXTERNALLY_DEFINED_ATTR void NAME(                               \
      thread matmul2d_descriptor &desc, thread void *left,                    \
      __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,  \
      thread void *right,                                                     \
      __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, \
      thread void *destination,                                               \
      __tensor_ops_detail::__tensor_ops_tensor_descriptor_type               \
          destinationDescType)

// f16 x f16 -> f16
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f16);
// f16 x i8 -> f16
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f16);
// f16 x ui8 -> f16
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_ui8_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_ui8_dv_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_ui8_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_ui8_dv_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_ui8_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_ui8_dv_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_ui8_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_ui8_dv_f16);
// i8 x f16 -> f16
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f16);
// ui8 x f16 -> f16
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f16_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f16_dv_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f16_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f16_dv_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f16_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f16_dv_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f16_th_f16);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f16_dv_f16);
// f16 x f16 -> f32
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f32);

#undef __MM2D_DECLARE_ST_RUN
// ---------------------------------------------------------------------------
// Further link-time-resolved single-thread matmul2d `run` entry points
// (EXTERNALLY_DEFINED_ATTR). Suffix scheme: <left>_<right>_<destination>,
// each operand tagged "<space>_<elemtype>"; space tags are presumably
// th = thread and dv = device (consistent with the `thread` qualifiers used
// in these declarations — TODO confirm); elemtype is f16, f32, i8, or ui8.
// All declarations here share one signature: an opaque pointer plus a
// runtime tensor-descriptor type tag per operand. Factored through a
// block-local macro (removed at the end) so each external symbol name is
// spelled exactly once.
// ---------------------------------------------------------------------------
#define __MM2D_DECLARE_ST_RUN(NAME)                                           \
  extern "C" EXTERNALLY_DEFINED_ATTR void NAME(                               \
      thread matmul2d_descriptor &desc, thread void *left,                    \
      __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,  \
      thread void *right,                                                     \
      __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, \
      thread void *destination,                                               \
      __tensor_ops_detail::__tensor_ops_tensor_descriptor_type               \
          destinationDescType)

// f16 x f16 -> f32 (device-resident left operand)
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f32);
// f16 x f32 -> f32
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_dv_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_dv_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_dv_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_dv_f32);
// f16 x i8 -> f32
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f32);
// f16 x ui8 -> f32
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_ui8_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_ui8_dv_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_ui8_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_ui8_dv_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_ui8_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_ui8_dv_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_ui8_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_ui8_dv_f32);
// f32 x f16 -> f32
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_dv_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_dv_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_dv_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_dv_f32);
// f32 x f32 -> f32
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_dv_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_dv_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_dv_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_dv_f32);
// f32 x i8 -> f32
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_th_f32);
__MM2D_DECLARE_ST_RUN(__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_dv_f32);

#undef __MM2D_DECLARE_ST_RUN

// NOTE(review): the original source is truncated mid-declaration at this
// point; the bare `extern` below joins the `"C" ...` continuation that
// immediately follows this span.
extern
"C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" 
EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, 
thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f32_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f32_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" 
EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_i8_th_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_i8_th_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_i8_th_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_th_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_ui8_th_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_ui8_th_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_ui8_th_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_ui8_dv_i32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);

// Forward declarations of the externally-defined (air.externally_defined)
// single-thread matmul2d entry points. Every entry point shares the exact
// same signature; only the symbol suffix varies, encoding
// <left>_<right>_<destination> where each operand is tagged with its address
// space (th = thread, dv = device) and element type (i8/ui8/f16/b16/f32/i32).
// A single declaration macro replaces the hand-expanded boilerplate so the
// shared signature exists in exactly one place; it is #undef'd immediately
// after use so it cannot leak into the rest of the header.
#define __TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(SUFFIX)                    \
  extern "C" EXTERNALLY_DEFINED_ATTR void                                      \
  __tensorops_impl_matmul2d_op_run_single_thread_##SUFFIX(                     \
      thread matmul2d_descriptor & desc, thread void *left,                    \
      __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,   \
      thread void *right,                                                      \
      __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,  \
      thread void *destination,                                                \
      __tensor_ops_detail::__tensor_ops_tensor_descriptor_type                 \
          destinationDescType)

// ui8 x ui8 -> i32
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_ui8_dv_ui8_th_i32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_ui8_dv_ui8_dv_i32);
// b16 x b16 -> b16
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_th_b16_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_th_b16_dv_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_dv_b16_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_dv_b16_dv_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_th_b16_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_th_b16_dv_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_dv_b16_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_dv_b16_dv_b16);
// b16 x b16 -> f32
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_th_b16_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_th_b16_dv_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_dv_b16_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_dv_b16_dv_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_th_b16_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_th_b16_dv_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_dv_b16_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_dv_b16_dv_f32);
// b16 x f32 -> f32
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_th_f32_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_th_f32_dv_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_dv_f32_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_dv_f32_dv_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_th_f32_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_th_f32_dv_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_dv_f32_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_dv_f32_dv_f32);
// b16 x i8 -> b16
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_th_i8_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_th_i8_dv_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_dv_i8_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_dv_i8_dv_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_th_i8_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_th_i8_dv_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_dv_i8_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_dv_i8_dv_b16);
// b16 x i8 -> f32
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_th_i8_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_th_i8_dv_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_dv_i8_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_dv_i8_dv_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_th_i8_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_th_i8_dv_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_dv_i8_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_dv_i8_dv_f32);
// f32 x b16 -> f32
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_f32_th_b16_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_f32_th_b16_dv_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_f32_dv_b16_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_f32_dv_b16_dv_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_f32_th_b16_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_f32_th_b16_dv_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_f32_dv_b16_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_f32_dv_b16_dv_f32);
// i8 x b16 -> b16
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_i8_th_b16_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_i8_th_b16_dv_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_i8_dv_b16_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_i8_dv_b16_dv_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_i8_th_b16_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_i8_th_b16_dv_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_i8_dv_b16_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_i8_dv_b16_dv_b16);
// i8 x b16 -> f32
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_i8_th_b16_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_i8_th_b16_dv_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_i8_dv_b16_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_i8_dv_b16_dv_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_i8_th_b16_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_i8_th_b16_dv_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_i8_dv_b16_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_i8_dv_b16_dv_f32);
// b16 x f16 -> b16
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_th_f16_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_th_f16_dv_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_dv_f16_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_dv_f16_dv_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_th_f16_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_th_f16_dv_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_dv_f16_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_dv_f16_dv_b16);
// b16 x f16 -> f16
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_th_f16_th_f16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_th_f16_dv_f16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_dv_f16_th_f16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_dv_f16_dv_f16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_th_f16_th_f16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_th_f16_dv_f16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_dv_f16_th_f16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_dv_f16_dv_f16);
// b16 x f16 -> f32
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_th_f16_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_th_f16_dv_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_dv_f16_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_b16_dv_f16_dv_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_th_f16_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_th_f16_dv_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_dv_f16_th_f32);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_b16_dv_f16_dv_f32);
// f16 x b16 -> b16
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_f16_th_b16_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_f16_th_b16_dv_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_f16_dv_b16_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(th_f16_dv_b16_dv_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_f16_th_b16_th_b16);
__TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL(dv_f16_th_b16_dv_b16);

#undef __TENSOROPS_MATMUL2D_RUN_SINGLE_THREAD_DECL

extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_b16(thread matmul2d_descriptor & desc, thread void
*left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); 
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_ui8_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_ui8_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_ui8_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void 
*left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_ui8_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_ui8_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); 
extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_ui8_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_ui8_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void 
__tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_b16_th_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_b16_dv_b16(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type 
rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_b16_th_f32(thread matmul2d_descriptor & desc, thread void 
*left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_b16_th_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_b16_dv_f32(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType); template <__matmul2d_descriptor descriptor, __matmul2d_cooperative_operand_index operand_index, typename scope, typename left_element_type, typename right_element_type, typename destination_element_type, typename coord_type, typename... 
args> struct __operand_layout { static_assert(__tensor_ops_detail::__is_same_v || __tensor_ops_detail::__is_same_v || #if __HAVE_INT4B_FORMAT_TYPE__ __tensor_ops_detail::__is_same_v || __tensor_ops_detail::__is_same_v || #endif __tensor_ops_detail::__is_same_v || #if __HAVE_BFLOAT__ __tensor_ops_detail::__is_same_v || #endif __tensor_ops_detail::__is_same_v, "cooperative tensor source data type can only be one of " "uint8_t/int8_t/uint4b_format/int4b_format/float/half/bfloat"); static_assert(__tensor_ops_detail::__is_same_v || __tensor_ops_detail::__is_same_v || #if __HAVE_INT4B_FORMAT_TYPE__ __tensor_ops_detail::__is_same_v || __tensor_ops_detail::__is_same_v || #endif __tensor_ops_detail::__is_same_v || #if __HAVE_BFLOAT__ __tensor_ops_detail::__is_same_v || #endif __tensor_ops_detail::__is_same_v, "cooperative tensor source data type can only be one of " "uint8_t/int8_t/uint4b_format/int4b_format/float/half/bfloat"); static_assert(__tensor_ops_detail::__is_same_v || __tensor_ops_detail::__is_same_v || #if __HAVE_BFLOAT__ __tensor_ops_detail::__is_same_v || #endif __tensor_ops_detail::__is_same_v, "cooperative tensor destination data type can only be one of " "float/half/bfloat/int32_t"); static constant constexpr __tensor_ops_detail::__rank_t rank = 2; using element_t = metal::conditional_t>; using destination_element_t = destination_element_type; using coord_t = coord_type; using extent_t = metal::dextents; using thread_storage_t = thread void *; using const_thread_storage_t = const thread void *; using index_t = uint16_t; using operand_layout_t = __operand_layout; using cooperative_tensor_t = metal::cooperative_tensor; using scope_t = scope; using left_element_t = left_element_type; using right_element_t = right_element_type; static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v, "scope should be of type __tensorops_scope"); static constexpr constant __matmul2d_cooperative_operand_index __operand_index = operand_index; static constexpr constant 
bool __is_matmul2d_cooperative_tensor_layout = true;
  static constexpr constant __matmul2d_descriptor matmul2d_desc = descriptor;
  // Returns the alignment of the storage allocated in each thread
  // for this cooperative_tensor.
  static constexpr size_t thread_storage_align() { return alignof(element_t); };
  // Copy-constructs from the cooperative_tensor `other`.
  // BUG FIX: the element copy previously ran in the wrong direction
  // (`other_e[i] = this_e[i]`), which left the newly constructed tensor
  // uninitialized and clobbered the source. Elements must flow
  // other -> this_.
  static void copy_construct(thread void *this_, thread void *other)
  {
    thread element_t *this_e = (thread element_t *)(this_);
    thread element_t *other_e = (thread element_t *)(other);
    for (size_t i = 0, e = get_capacity(this_); i != e; ++i)
    {
      this_e[i] = other_e[i];
    }
  };
  // Move-constructs from the cooperative_tensor `other`.
  // BUG FIX: the previous body only aliased a local pointer
  // (`other_e = this_e`) and copied nothing, leaving the destination
  // storage uninitialized. element_t is a plain value type here, so a
  // move is an element-wise copy; `other` is left valid (unchanged).
  static void move_construct(thread void *this_, thread void *other)
  {
    thread element_t *this_e = (thread element_t *)(this_);
    thread element_t *other_e = (thread element_t *)(other);
    for (size_t i = 0, e = get_capacity(this_); i != e; ++i)
    {
      this_e[i] = other_e[i];
    }
  };
  // Copy-assigns from the cooperative_tensor `other`.
  // BUG FIX: direction was reversed exactly as in copy_construct.
  static void copy_assign(thread void *this_, thread void *other)
  {
    thread element_t *this_e = (thread element_t *)(this_);
    thread element_t *other_e = (thread element_t *)(other);
    for (size_t i = 0, e = get_capacity(this_); i != e; ++i)
    {
      this_e[i] = other_e[i];
    }
  };
  // Move-assigns from the cooperative_tensor `other`.
  // BUG FIX: previously a no-op (pointer alias only); performs the
  // element-wise copy, mirroring move_construct above.
  static void move_assign(thread void *this_, thread void *other)
  {
    thread element_t *this_e = (thread element_t *)(this_);
    thread element_t *other_e = (thread element_t *)(other);
    for (size_t i = 0, e = get_capacity(this_); i != e; ++i)
    {
      this_e[i] = other_e[i];
    }
  };
  // Destroys the per-thread object.
// destroy() is trivial — per-thread element storage needs no teardown here.
// thread_storage_size(): asks the externally-defined runtime entry point
// (__tensorops_impl_..._data_size) how many bytes each thread must reserve for
// this operand, given the left/right/destination datatypes and thread count.
// load(): copies a device- or threadgroup-resident metal::tensor into the
// per-thread cooperative storage. It checks the source element type and rank
// (1 or 2) at compile time, then dispatches on element type
// (f16/i32/f32/b16/i8/ui8) and on the source address space to the matching
// externally-defined _load_dv_* / _load_tg_* entry point; any other address
// space or element type is rejected with a static_assert.
static void destroy(thread void *) {}; static size_t thread_storage_size() { metal::execution_threads t = scope(); int threads = t.size(); __tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype destinationDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; return __tensorops_impl_matmul2d_op_cooperative_tensor_data_size( operand_index, descriptor, leftDataType, rightDataType, destinationDataType, threads); } template static void load(thread_storage_t storage, const thread metal::tensor &sourceT) { using elem_t = __tensor_ops_detail::__remove_addrspace_t; static_assert(__tensor_ops_detail::__is_same_v, "Source tensor datatype does not match cooperative tensor"); static_assert(Extents::rank() == 1 || Extents::rank() == 2, "Source tensor must be rank 1 or 2"); int sourceRank = Extents::rank(); metal::execution_threads t = scope(); int threads = t.size(); __matmul2d_descriptor desc = descriptor; using tensorType = metal::tensor; using sourcePtrType = typename tensorType::data_handle_type; __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type< tensorType>(); const thread void *source = (const thread void *)(&sourceT); __tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype destinationDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_cooperative_tensor_load_dv_f16( operand_index, 
// (continuation of the f16 load call, then the i32 and f32 dispatch arms)
desc, storage, source, sourceDescType, sourceRank, leftDataType, rightDataType, destinationDataType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< sourcePtrType>) __tensorops_impl_matmul2d_op_cooperative_tensor_load_tg_f16( operand_index, desc, storage, source, sourceDescType, sourceRank, leftDataType, rightDataType, destinationDataType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_cooperative_tensor_load_dv_i32( operand_index, desc, storage, source, sourceDescType, sourceRank, leftDataType, rightDataType, destinationDataType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< sourcePtrType>) __tensorops_impl_matmul2d_op_cooperative_tensor_load_tg_i32( operand_index, desc, storage, source, sourceDescType, sourceRank, leftDataType, rightDataType, destinationDataType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_cooperative_tensor_load_dv_f32( operand_index, desc, storage, source, sourceDescType, sourceRank, leftDataType, rightDataType, destinationDataType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< sourcePtrType>) __tensorops_impl_matmul2d_op_cooperative_tensor_load_tg_f32( operand_index, desc, storage, source, sourceDescType, sourceRank, leftDataType, rightDataType, destinationDataType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_cooperative_tensor_load_dv_b16( operand_index, 
// (continuation: b16 load call completes, then the i8 and ui8 dispatch arms,
// closing with a static_assert for unsupported element types)
desc, storage, source, sourceDescType, sourceRank, leftDataType, rightDataType, destinationDataType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< sourcePtrType>) __tensorops_impl_matmul2d_op_cooperative_tensor_load_tg_b16( operand_index, desc, storage, source, sourceDescType, sourceRank, leftDataType, rightDataType, destinationDataType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_cooperative_tensor_load_dv_i8( operand_index, desc, storage, source, sourceDescType, sourceRank, leftDataType, rightDataType, destinationDataType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< sourcePtrType>) __tensorops_impl_matmul2d_op_cooperative_tensor_load_tg_i8( operand_index, desc, storage, source, sourceDescType, sourceRank, leftDataType, rightDataType, destinationDataType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_cooperative_tensor_load_dv_ui8( operand_index, desc, storage, source, sourceDescType, sourceRank, leftDataType, rightDataType, destinationDataType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< sourcePtrType>) __tensorops_impl_matmul2d_op_cooperative_tensor_load_tg_ui8( operand_index, desc, storage, source, sourceDescType, sourceRank, leftDataType, rightDataType, destinationDataType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported type"); }; template static void store(const_thread_storage_t storage, const thread metal::tensor &destinationT) { using elem_t 
// store(): mirror of load() — writes the per-thread cooperative storage back
// into a device- or threadgroup-resident metal::tensor, dispatching on the
// destination element type (f16/i32/f32/b16/i8/ui8) and address space to the
// matching externally-defined _store_dv_* / _store_tg_* entry point.
// Note store passes no rank argument (unlike load); the destination rank is
// constrained to 1 or `rank` (2) by the static_assert.
= __tensor_ops_detail::__remove_addrspace_t; static_assert(__tensor_ops_detail::__is_same_v, "Tensor datatype does not match cooperative tensor"); static_assert(Extents::rank() == 1 || Extents::rank() == rank, "Tensor must be rank 1 or 2"); __matmul2d_descriptor desc = descriptor; metal::execution_threads t = scope(); int threads = t.size(); using tensorType = metal::tensor; using destinationPtrType = typename tensorType::data_handle_type; __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type< tensorType>(); const thread void *destination = (const thread void *)(&destinationT); __tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype destinationDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_tensor_store_dv_f16( operand_index, desc, storage, destination, destinationDescType, leftDataType, rightDataType, destinationDataType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_tensor_store_tg_f16( operand_index, desc, storage, destination, destinationDescType, leftDataType, rightDataType, destinationDataType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_tensor_store_dv_i32( operand_index, desc, storage, destination, destinationDescType, leftDataType, 
// (continuation: i32 store completes, then the f32 and b16 dispatch arms)
rightDataType, destinationDataType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_tensor_store_tg_i32( operand_index, desc, storage, destination, destinationDescType, leftDataType, rightDataType, destinationDataType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_tensor_store_dv_f32( operand_index, desc, storage, destination, destinationDescType, leftDataType, rightDataType, destinationDataType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_tensor_store_tg_f32( operand_index, desc, storage, destination, destinationDescType, leftDataType, rightDataType, destinationDataType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_tensor_store_dv_b16( operand_index, desc, storage, destination, destinationDescType, leftDataType, rightDataType, destinationDataType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_tensor_store_tg_b16( operand_index, desc, storage, destination, destinationDescType, leftDataType, rightDataType, destinationDataType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_tensor_store_dv_i8( 
// (continuation: i8 and ui8 store arms, then get_capacity(), which queries the
// runtime for the number of elements this thread's slice holds)
operand_index, desc, storage, destination, destinationDescType, leftDataType, rightDataType, destinationDataType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_tensor_store_tg_i8( operand_index, desc, storage, destination, destinationDescType, leftDataType, rightDataType, destinationDataType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_tensor_store_dv_ui8( operand_index, desc, storage, destination, destinationDescType, leftDataType, rightDataType, destinationDataType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_tensor_store_tg_ui8( operand_index, desc, storage, destination, destinationDescType, leftDataType, rightDataType, destinationDataType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported type"); }; static uint16_t get_capacity(const_thread_storage_t storage) { metal::execution_threads t = scope(); int threads = t.size(); __tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; return __tensorops_impl_matmul2d_op_cooperative_tensor_num_elements( operand_index, descriptor, storage, leftDataType, rightDataType, threads); } static thread element_t *get_element_pointer(const_thread_storage_t storage, index_t idx) { __tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; 
// Element-access helpers, all thin wrappers over externally-defined runtime
// entry points: get_element_pointer() maps a linear per-thread index to the
// element's address; get_element_index() is its inverse; is_valid_element()
// reports whether the slot at `idx` holds a live element for this thread.
__tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype destinationDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; return (thread element_t *) __tensorops_impl_matmul2d_op_cooperative_tensor_get_element_pointer( operand_index, descriptor, (thread_storage_t)storage, idx, leftDataType, rightDataType, destinationDataType); } static index_t get_element_index(const_thread_storage_t storage, const thread element_t *element) { __tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype destinationDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; return (index_t) __tensorops_impl_matmul2d_op_cooperative_tensor_get_element_index( operand_index, descriptor, (thread_storage_t)storage, element, leftDataType, rightDataType, destinationDataType); } static bool is_valid_element(const_thread_storage_t storage, index_t idx) { metal::execution_threads t = scope(); int threads = t.size(); __tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype destinationDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; return __tensorops_impl_matmul2d_op_cooperative_tensor_is_valid_element( operand_index, descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx, leftDataType, rightDataType, destinationDataType, threads); } template static metal::array get_multidimensional_index(const_thread_storage_t storage, index_t idx) { metal::execution_threads t = scope(); int threads = t.size(); 
// get_multidimensional_index(): converts a linear element index into a 2-D
// (row, column) coordinate. The coordinate element type is dispatched at
// compile time — ushort/short/uint/int each call the runtime with the matching
// __tensor_ops_datatype tag; any other type hits the static_assert.
__tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype destinationDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; if constexpr (__tensor_ops_detail::__is_same_v) { ushort coords[2]; __tensorops_impl_matmul2d_op_cooperative_tensor_get_coordinate( operand_index, descriptor, (__tensor_ops_detail::__const_thread_void_t)storage, idx, coords, __tensor_ops_detail::__tensor_ops_datatype_uint16, threads, leftDataType, rightDataType, destinationDataType); return {coords[0], coords[1]}; } else if constexpr (__tensor_ops_detail::__is_same_v) { short coords[2]; __tensorops_impl_matmul2d_op_cooperative_tensor_get_coordinate( operand_index, descriptor, (__tensor_ops_detail::__const_thread_void_t)storage, idx, coords, __tensor_ops_detail::__tensor_ops_datatype_int16, threads, leftDataType, rightDataType, destinationDataType); return {coords[0], coords[1]}; } else if constexpr (__tensor_ops_detail::__is_same_v) { uint coords[2]; __tensorops_impl_matmul2d_op_cooperative_tensor_get_coordinate( operand_index, descriptor, (__tensor_ops_detail::__const_thread_void_t)storage, idx, coords, __tensor_ops_detail::__tensor_ops_datatype_uint32, threads, leftDataType, rightDataType, destinationDataType); return {coords[0], coords[1]}; } else if constexpr (__tensor_ops_detail::__is_same_v) { int coords[2]; __tensorops_impl_matmul2d_op_cooperative_tensor_get_coordinate( operand_index, descriptor, (__tensor_ops_detail::__const_thread_void_t)storage, idx, coords, __tensor_ops_detail::__tensor_ops_datatype_int32, threads, leftDataType, rightDataType, destinationDataType); return {coords[0], coords[1]}; } else { static_assert(__tensor_ops_detail::__assert_false_v, "unsupported coordinate data type"); } } static void construct(thread_storage_t storage) { 
// construct(): initializes the per-thread cooperative storage through the
// runtime (_cooperative_tensor_init), completing __operand_layout. The alias
// templates that follow bind __operand_layout into a concrete
// metal::cooperative_tensor type per operand, and
// __get_destination_cooperative_tensor() simply default-constructs the
// destination-operand cooperative tensor after checking the execution scope.
metal::execution_threads t = scope(); int threads = t.size(); __tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype destinationDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensorops_impl_matmul2d_op_cooperative_tensor_init( operand_index, descriptor, (__tensor_ops_detail::__thread_void_t)storage, leftDataType, rightDataType, destinationDataType, threads); } }; template <__matmul2d_descriptor descriptor, __matmul2d_cooperative_operand_index operand_index, typename scope, typename left_element_type, typename right_element_type, typename element_type, typename coord_type, typename... args> using __cooperative_tensor_t = typename __operand_layout::cooperative_tensor_t; template <__matmul2d_descriptor descriptor, typename scope, typename left_operand, typename right_operand, typename element_type, typename coord_type, typename... args> using __cooperative_tensor_destination_t = __cooperative_tensor_t>::element_type>, typename __tensor_ops_detail::__remove_addrspace_t>::element_type>, element_type, coord_type, args...>; template <__matmul2d_descriptor descriptor, typename scope, typename left_operand, typename right_operand, typename element_type, typename coord_type, typename... args> __cooperative_tensor_destination_t __get_destination_cooperative_tensor() { static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v, "scope should be of type __tensorops_scope"); return __cooperative_tensor_destination_t(); } template <__matmul2d_descriptor descriptor, typename scope, typename left_element_type, typename right_element_type, typename element_type, typename coord_type, typename... 
// Left-input cooperative tensor helpers. The no-argument getter
// default-constructs a left-operand cooperative tensor (single-SIMD-group
// scope, int coordinates, non-format element type). The overload taking a
// destination-layout cooperative tensor converts a previous matmul2d result
// into a left input: it statically checks layout/operand kind, scope, rank,
// index and element types, that the inner dimension k is not dynamic, that the
// source extents match the op's m/k (transpose-aware), and that the left
// operand is not transposed, then copies elements through the runtime
// _cooperative_tensor_copy with a hard-coded execution width of 32 threads
// (one SIMD group).
args> using __cooperative_tensor_left_input_t = __cooperative_tensor_t; template <__matmul2d_descriptor descriptor, typename scope, typename left_element_type, typename right_element_type, typename element_type, typename coord_type, typename... args> __cooperative_tensor_left_input_t __get_left_input_cooperative_tensor() { static_assert(__tensor_ops_detail::__is_same_v, "Input cooperative tensors require a single SIMD group"); static_assert(__tensor_ops_detail::__is_same_v, "coord_type must be int"); #if __HAVE_INT4B_FORMAT_TYPE__ static_assert(!metal::is_numeric_format_v, "Input cooperative tensor element type cannot be a format type"); #endif return __cooperative_tensor_left_input_t(); } template __cooperative_tensor_left_input_t __get_left_input_cooperative_tensor(const thread metal::cooperative_tensor & src) { static_assert(__tensor_ops_detail::__is_same_v, "Input cooperative tensors require a single SIMD group"); static_assert(src_layout::__is_matmul2d_cooperative_tensor_layout, "Source must be matmul2d cooperative destination tensor"); static_assert(src_layout::__operand_index == __matmul2d_cooperative_operand_index::destination, "Source must be matmul2d cooperative destination tensor"); static_assert(__tensor_ops_detail::__is_same_v, "Input cooperative tensors require a single SIMD group"); static_assert(__tensor_ops_detail::__is_same_v, "Input cooperative tensors require a single SIMD group"); static_assert(src_extents::rank() == 2, "Source rank must be 2"); static_assert(__tensor_ops_detail::__is_same_v, "src_extents::index_type must be int"); static_assert(__tensor_ops_detail::__is_same_v, "coord_type must be int"); static_assert(__tensor_ops_detail::__is_same_v, "Source cooperative tensor element type must match matmul2d left input element type"); constexpr __matmul2d_descriptor dstDesc = descriptor; constexpr __matmul2d_descriptor srcDesc = src_layout::matmul2d_desc; static_assert(dstDesc.k != static_cast(metal::dynamic_extent) && dstDesc.k != 
dynamic_length_v, "Inner dimension cannot be dynamic with input cooperative tensors"); static_assert(dstDesc.transpose_left ? (srcDesc.n == dstDesc.m) : (srcDesc.m == dstDesc.m), "Source height must match matmul2d op height"); static_assert(dstDesc.transpose_left ? (srcDesc.m == dstDesc.k) : (srcDesc.n == dstDesc.k), "Source width must match matmul2d op inner dimension"); static_assert(!dstDesc.transpose_left, "Input cooperative tensor cannot be transposed"); auto dst = __cooperative_tensor_left_input_t(); thread void *dstStorage = (thread void *)&dst[__tensor_ops_detail::__tensor_ops_reserved_index]; const thread void *srcStorage = (const thread void *)&src[__tensor_ops_detail::__tensor_ops_reserved_index]; __tensor_ops_detail::__tensor_ops_datatype dstLeftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype dstRightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype dstElementDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype srcLeftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype srcRightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype srcElementDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensorops_impl_matmul2d_op_cooperative_tensor_copy( __matmul2d_cooperative_operand_index::left, dstDesc, srcDesc, dstStorage, srcStorage, dstLeftDataType, dstRightDataType, dstElementDataType, srcLeftDataType, srcRightDataType, srcElementDataType, 32); return dst; } template <__matmul2d_descriptor descriptor, typename scope, typename left_element_type, typename right_element_type, typename element_type, typename coord_type, typename... 
// Right-input counterparts of the left-input helpers above: a default
// constructor-style getter (single SIMD group, int coordinates, non-format
// element type), and a converter from a destination-layout cooperative tensor
// that validates the source against the op's k/n extents (transpose-aware),
// requires a static inner dimension and no right-transpose, then copies
// through the runtime _cooperative_tensor_copy with a 32-thread width.
args> using __cooperative_tensor_right_input_t = __cooperative_tensor_t; template <__matmul2d_descriptor descriptor, typename scope, typename left_element_type, typename right_element_type, typename element_type, typename coord_type, typename... args> __cooperative_tensor_right_input_t __get_right_input_cooperative_tensor() { static_assert(__tensor_ops_detail::__is_same_v, "Input cooperative tensors require a single SIMD group"); static_assert(__tensor_ops_detail::__is_same_v, "coord_type must be int"); #if __HAVE_INT4B_FORMAT_TYPE__ static_assert(!metal::is_numeric_format_v, "Input cooperative tensor element type cannot be a format type"); #endif return __cooperative_tensor_right_input_t(); } template __cooperative_tensor_right_input_t __get_right_input_cooperative_tensor(const thread metal::cooperative_tensor & src) { static_assert(src_layout::__is_matmul2d_cooperative_tensor_layout, "Source must be matmul2d cooperative destination tensor"); static_assert(src_layout::__operand_index == __matmul2d_cooperative_operand_index::destination, "Source must be matmul2d cooperative destination tensor"); static_assert(__tensor_ops_detail::__is_same_v, "Input cooperative tensors require a single SIMD group"); static_assert(__tensor_ops_detail::__is_same_v, "Input cooperative tensors require a single SIMD group"); static_assert(src_extents::rank() == 2, "Source rank must be 2"); static_assert(__tensor_ops_detail::__is_same_v, "Source cooperative tensor element type must match matmul2d right input element type"); static_assert(__tensor_ops_detail::__is_same_v, "src_extents::index_type must be int"); static_assert(__tensor_ops_detail::__is_same_v, "coord_type must be int"); constexpr __matmul2d_descriptor dstDesc = descriptor; constexpr __matmul2d_descriptor srcDesc = src_layout::matmul2d_desc; static_assert(dstDesc.k != static_cast(metal::dynamic_extent) && dstDesc.k != dynamic_length_v, "Inner dimension cannot be dynamic with input cooperative tensors"); 
static_assert(dstDesc.transpose_right ? (srcDesc.n == dstDesc.k) : (srcDesc.m == dstDesc.k), "Source height must match matmul2d op inner dimension"); static_assert(dstDesc.transpose_right ? (srcDesc.m == dstDesc.n) : (srcDesc.n == dstDesc.n), "Source width must match matmul2d op width"); static_assert(!dstDesc.transpose_right, "Input cooperative tensor cannot be transposed"); auto dst = __cooperative_tensor_right_input_t(); thread void *dstStorage = (thread void *)&dst[__tensor_ops_detail::__tensor_ops_reserved_index]; const thread void *srcStorage = (const thread void *)&src[__tensor_ops_detail::__tensor_ops_reserved_index]; __tensor_ops_detail::__tensor_ops_datatype dstLeftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype dstRightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype dstElementDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype srcLeftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype srcRightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype srcElementDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensorops_impl_matmul2d_op_cooperative_tensor_copy( __matmul2d_cooperative_operand_index::right, dstDesc, srcDesc, dstStorage, srcStorage, dstLeftDataType, dstRightDataType, dstElementDataType, srcLeftDataType, srcRightDataType, srcElementDataType, 32); return dst; } template inline bool __is_compatible_as_left_input( const thread metal::cooperative_tensor & src) { static_assert(src_layout::__is_matmul2d_cooperative_tensor_layout, "Source must be matmul2d cooperative destination tensor"); static_assert(src_layout::__operand_index == __matmul2d_cooperative_operand_index::destination, "Source must be matmul2d cooperative 
destination tensor"); static_assert(__tensor_ops_detail::__is_same_v, "Input cooperative tensors require a single SIMD group"); static_assert(__tensor_ops_detail::__is_same_v, "Input cooperative tensors require a single SIMD group"); static_assert(src_extents::rank() == 2, "Source rank must be 2"); static_assert(__tensor_ops_detail::__is_same_v, "Source cooperative tensor element type must match matmul2d left input element type"); static_assert(__tensor_ops_detail::__is_same_v, "src_extents::index_type must be int"); constexpr __matmul2d_descriptor dstDesc = descriptor; constexpr __matmul2d_descriptor srcDesc = src_layout::matmul2d_desc; static_assert(dstDesc.k != static_cast(metal::dynamic_extent) && dstDesc.k != dynamic_length_v, "Inner dimension cannot be dynamic with input cooperative tensors"); static_assert(dstDesc.transpose_left ? (srcDesc.n == dstDesc.m) : (srcDesc.m == dstDesc.m), "Source height must match matmul2d op height"); static_assert(dstDesc.transpose_left ? (srcDesc.m == dstDesc.k) : (srcDesc.n == dstDesc.k), "Source width must match matmul2d op inner dimension"); static_assert(!dstDesc.transpose_left, "Input cooperative tensor cannot be transposed"); const thread void *srcStorage = (const thread void *)&src[__tensor_ops_detail::__tensor_ops_reserved_index]; __tensor_ops_detail::__tensor_ops_datatype dstLeftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype dstRightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype dstElementDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype srcLeftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype srcRightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype srcElementDataType = 
// Runtime compatibility probes: unlike the __get_*_input_cooperative_tensor
// converters above, these perform the same compile-time shape/type checks but
// then ask the runtime (_is_compatible_as_input) for a bool instead of
// copying, again with a hard-coded 32-thread (single SIMD group) width.
__tensor_ops_detail::__type_to_tensor_ops_datatype::value; return __tensorops_impl_matmul2d_op_cooperative_tensor_is_compatible_as_input( __matmul2d_cooperative_operand_index::left, dstDesc, srcDesc, srcStorage, dstLeftDataType, dstRightDataType, dstElementDataType, srcLeftDataType, srcRightDataType, srcElementDataType, 32); } template inline bool __is_compatible_as_right_input( const thread metal::cooperative_tensor & src) { static_assert(src_layout::__is_matmul2d_cooperative_tensor_layout, "Source must be matmul2d cooperative destination tensor"); static_assert(src_layout::__operand_index == __matmul2d_cooperative_operand_index::destination, "Source must be matmul2d cooperative destination tensor"); static_assert(__tensor_ops_detail::__is_same_v, "Input cooperative tensors require a single SIMD group"); static_assert(__tensor_ops_detail::__is_same_v, "Input cooperative tensors require a single SIMD group"); static_assert(src_extents::rank() == 2, "Source rank must be 2"); static_assert(__tensor_ops_detail::__is_same_v, "Source cooperative tensor element type must match matmul2d right input element type"); static_assert(__tensor_ops_detail::__is_same_v, "src_extents::index_type must be int"); constexpr __matmul2d_descriptor dstDesc = descriptor; constexpr __matmul2d_descriptor srcDesc = src_layout::matmul2d_desc; static_assert(dstDesc.k != static_cast(metal::dynamic_extent) && dstDesc.k != dynamic_length_v, "Inner dimension cannot be dynamic with input cooperative tensors"); static_assert(dstDesc.transpose_right ? (srcDesc.n == dstDesc.k) : (srcDesc.m == dstDesc.k), "Source height must match matmul2d op inner dimension"); static_assert(dstDesc.transpose_right ? 
// (right-operand variant finishes its transpose-aware extent checks, then
// returns the runtime's compatibility verdict for the right operand)
(srcDesc.m == dstDesc.n) : (srcDesc.n == dstDesc.n), "Source width must match matmul2d op width"); static_assert(!dstDesc.transpose_right, "Input cooperative tensor cannot be transposed"); const thread void *srcStorage = (const thread void *)&src[__tensor_ops_detail::__tensor_ops_reserved_index]; __tensor_ops_detail::__tensor_ops_datatype dstLeftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype dstRightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype dstElementDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype srcLeftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype srcRightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype srcElementDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; return __tensorops_impl_matmul2d_op_cooperative_tensor_is_compatible_as_input( __matmul2d_cooperative_operand_index::right, dstDesc, srcDesc, srcStorage, dstLeftDataType, dstRightDataType, dstElementDataType, srcLeftDataType, srcRightDataType, srcElementDataType, 32); } template <__matmul2d_descriptor descriptor, int reduction_dim, typename scope, typename left_operand, typename right_operand, typename element_type, typename coord_type, typename... 
args> struct __reduction_operand_layout { static_assert(__tensor_ops_detail::__is_same_v || __tensor_ops_detail::__is_same_v || __tensor_ops_detail::__is_same_v || __tensor_ops_detail::__is_same_v, "cooperative tensor data type can only be one of " "float/half/bfloat/int32_t"); static constant constexpr __tensor_ops_detail::__rank_t rank = 1; using element_t = element_type; using coord_t = coord_type; using extent_t = metal::dextents; using thread_storage_t = thread void *; using const_thread_storage_t = const thread void *; using index_t = uint16_t; using operand_layout_t = __reduction_operand_layout; using cooperative_tensor_t = metal::cooperative_tensor; using scope_t = scope; using left_t = __tensor_ops_detail::__remove_addrspace_t<__tensor_ops_detail::__remove_reference_t>; using right_t = __tensor_ops_detail::__remove_addrspace_t<__tensor_ops_detail::__remove_reference_t>; using left_elem_t = typename left_t::element_type; using right_elem_t = typename right_t::element_type; using left_value_t = __tensor_ops_detail::__remove_addrspace_t; using right_value_t = __tensor_ops_detail::__remove_addrspace_t; static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v, "scope should be of type __tensorops_scope"); static_assert(reduction_dim == 0 || reduction_dim == 1, "Reduction dimension must be 0 or 1"); static constexpr constant bool is_matmul2d_reduction_cooperative_destination_layout = true; static constexpr constant int __reduction_dim = reduction_dim; static constexpr constant __matmul2d_descriptor matmul2d_desc = descriptor; // Returns the alignment of the storage allocated in each thread // for this cooperative_tensor. static constexpr size_t thread_storage_align() { return alignof(element_t); }; // Copy-constructs from the cooperative_tensor `other`. 
static void copy_construct(thread void *this_, thread void *other) { thread element_t *this_e = (thread element_t *)(this_); thread element_t *other_e = (thread element_t *)(other); for (size_t i = 0, e = get_capacity(this_); i != e; ++i) { other_e[i] = this_e[i]; } }; // Move-constructs from the cooperative_tensor `other`. static void move_construct(thread void *this_, thread void *other) { thread element_t *this_e = (thread element_t *)(this_); thread element_t *other_e = this_e; }; // Copy-assigns from the cooperative_tensor `other`. static void copy_assign(thread void *this_, thread void *other) { thread element_t *this_e = (thread element_t *)(this_); thread element_t *other_e = (thread element_t *)(other); for (size_t i = 0, e = get_capacity(this_); i != e; ++i) { other_e[i] = this_e[i]; } }; // Move-assigns from the cooperative_tensor `other`. static void move_assign(thread void *this_, thread void *other) { thread element_t *this_e = (thread element_t *)(this_); thread element_t *other_e = this_e; }; // Destroys the per-thread object. 
// destroy(): no-op -- the per-thread element storage needs no teardown here.
// thread_storage_size(): asks the externally-defined runtime how many bytes
// each thread must reserve, given the descriptor, reduction dim, resolved
// datatype enums, and the execution scope's thread count.
// load(): copies a rank-1 source tensor into per-thread storage, dispatching
// by element type and by the source's address space (device/threadgroup).
static void destroy(thread void *) {}; static size_t thread_storage_size() { metal::execution_threads t = scope(); int threads = t.size(); __tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype elementDataType = __tensor_ops_detail::__element_type_to_tensor_ops_datatype(); return __tensorops_impl_matmul2d_op_cooperative_reduction_destination_data_size( descriptor, reduction_dim, leftDataType, rightDataType, elementDataType, threads); } template static void load(thread_storage_t storage, const thread metal::tensor &sourceT) { using elem_t = __tensor_ops_detail::__remove_addrspace_t; static_assert(__tensor_ops_detail::__is_same_v, "Source tensor datatype does not match cooperative tensor"); static_assert(Extents::rank() == 1, "Source tensor must be rank 1"); metal::execution_threads t = scope(); int threads = t.size(); __matmul2d_descriptor desc = descriptor; using tensorType = metal::tensor; using sourcePtrType = typename tensorType::data_handle_type; __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type< tensorType>(); __tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; const thread void *source = (const thread void *)(&sourceT); if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_dv_f16( desc, storage, source, sourceDescType, reduction_dim, threads, leftDataType, rightDataType); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
// f16 threadgroup branch of load(), then the i32/f32/b16 load variants --
// each branches on the source tensor's address space.
sourcePtrType>) __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_tg_f16( desc, storage, source, sourceDescType, reduction_dim, threads, leftDataType, rightDataType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_dv_i32( desc, storage, source, sourceDescType, reduction_dim, threads, leftDataType, rightDataType); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< sourcePtrType>) __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_tg_i32( desc, storage, source, sourceDescType, reduction_dim, threads, leftDataType, rightDataType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_dv_f32( desc, storage, source, sourceDescType, reduction_dim, threads, leftDataType, rightDataType); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< sourcePtrType>) __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_tg_f32( desc, storage, source, sourceDescType, reduction_dim, threads, leftDataType, rightDataType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_dv_b16( desc, storage, source, sourceDescType, reduction_dim, threads, leftDataType, rightDataType); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< sourcePtrType>)
// Tail of the b16 load. store() mirrors load(): rank-1 destination tensor,
// dispatch by element type (f16/i32/f32/b16) and address space.
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_tg_b16( desc, storage, source, sourceDescType, reduction_dim, threads, leftDataType, rightDataType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported type"); }; template static void store(const_thread_storage_t storage, const thread metal::tensor &destinationT) { using elem_t = __tensor_ops_detail::__remove_addrspace_t; static_assert(__tensor_ops_detail::__is_same_v, "Tensor datatype does not match cooperative tensor"); static_assert(Extents::rank() == 1, "Tensor must be rank 1"); __matmul2d_descriptor desc = descriptor; metal::execution_threads t = scope(); int threads = t.size(); using tensorType = metal::tensor; using destinationPtrType = typename tensorType::data_handle_type; __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type< tensorType>(); __tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; const thread void *destination = (const thread void *)(&destinationT); if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_dv_f16( desc, storage, destination, destinationDescType, reduction_dim, threads, leftDataType, rightDataType); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_tg_f16( desc, storage, destination, destinationDescType, reduction_dim, threads, leftDataType, rightDataType); else static_assert(__tensor_ops_detail::__assert_false_v,
// Remaining store() branches: i32, f32, b16 device/threadgroup variants.
"Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_dv_i32( desc, storage, destination, destinationDescType, reduction_dim, threads, leftDataType, rightDataType); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_tg_i32( desc, storage, destination, destinationDescType, reduction_dim, threads, leftDataType, rightDataType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_dv_f32( desc, storage, destination, destinationDescType, reduction_dim, threads, leftDataType, rightDataType); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_tg_f32( desc, storage, destination, destinationDescType, reduction_dim, threads, leftDataType, rightDataType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_dv_b16( desc, storage, destination, destinationDescType, reduction_dim, threads, leftDataType, rightDataType); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v< destinationPtrType>) __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_tg_b16( desc, storage, destination, destinationDescType, reduction_dim, threads, leftDataType,
// Tail of the b16 threadgroup store. The accessors below all forward to
// externally-defined entry points with datatype enums resolved from the
// layout's template parameters.
rightDataType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported type"); }; static uint16_t get_capacity(const_thread_storage_t storage) { metal::execution_threads t = scope(); int threads = t.size(); __tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; return __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_num_elements( descriptor, storage, reduction_dim, leftDataType, rightDataType, threads); } static thread element_t *get_element_pointer(const_thread_storage_t storage, index_t idx) { __tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype dataType = __tensor_ops_detail::__element_type_to_tensor_ops_datatype(); return (thread element_t *) __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_get_element_pointer( descriptor, (thread_storage_t)storage, idx, leftDataType, rightDataType, dataType); } static index_t get_element_index(const_thread_storage_t storage, const thread element_type *element) { __tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype dataType = __tensor_ops_detail::__element_type_to_tensor_ops_datatype(); return (index_t) __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_get_element_index( descriptor, (thread_storage_t)storage, element, leftDataType,
// Tail of get_element_index; is_valid_element() and the coordinate query
// follow.
rightDataType, dataType); } static bool is_valid_element(const_thread_storage_t storage, index_t idx) { metal::execution_threads t = scope(); int threads = t.size(); __tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype dataType = __tensor_ops_detail::__element_type_to_tensor_ops_datatype(); return __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_is_valid_element( descriptor, (__tensor_ops_detail::__thread_void_t)storage, reduction_dim, idx, leftDataType, rightDataType, dataType, threads); } template static metal::array get_multidimensional_index(const_thread_storage_t storage, index_t idx) { metal::execution_threads t = scope(); int threads = t.size(); __tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype elementDataType = __tensor_ops_detail::__element_type_to_tensor_ops_datatype(); if constexpr (__tensor_ops_detail::__is_same_v) { ushort coords[1]; __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_get_coordinate( descriptor, reduction_dim, (__tensor_ops_detail::__thread_void_t)storage, idx, coords, __tensor_ops_detail::__tensor_ops_datatype_uint16, threads, leftDataType, rightDataType, elementDataType); return { coords[0] }; } else if constexpr (__tensor_ops_detail::__is_same_v) { short coords[1]; __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_get_coordinate( descriptor, reduction_dim, (__tensor_ops_detail::__thread_void_t)storage, idx, coords, __tensor_ops_detail::__tensor_ops_datatype_int16, threads, leftDataType, rightDataType, elementDataType); return {
// get_multidimensional_index branches on the coordinate type
// (ushort/short/uint/int) to pick the datatype enum passed to the runtime;
// construct() then initializes per-thread storage via the runtime, and
// map_index() begins.
coords[0] }; } else if constexpr (__tensor_ops_detail::__is_same_v) { uint coords[1]; __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_get_coordinate( descriptor, reduction_dim, (__tensor_ops_detail::__thread_void_t)storage, idx, coords, __tensor_ops_detail::__tensor_ops_datatype_uint32, threads, leftDataType, rightDataType, elementDataType); ; return { coords[0] }; } else if constexpr (__tensor_ops_detail::__is_same_v) { int coords[1]; __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_get_coordinate( descriptor, reduction_dim, (__tensor_ops_detail::__thread_void_t)storage, idx, coords, __tensor_ops_detail::__tensor_ops_datatype_int32, threads, leftDataType, rightDataType, elementDataType); return { coords[0] }; } else { static_assert(__tensor_ops_detail::__assert_false_v, "unsupported coordinate data type"); } } static void construct(thread_storage_t storage) { metal::execution_threads t = scope(); int threads = t.size(); __tensor_ops_detail::__tensor_ops_datatype elementDataType = __tensor_ops_detail::__element_type_to_tensor_ops_datatype(); __tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_init( (__tensor_ops_detail::__thread_void_t)storage, descriptor, reduction_dim, leftDataType, rightDataType, elementDataType, threads); } template static uint16_t map_index(const thread void *from_storage, uint16_t from_idx, const thread void *to_storage) { using sourceLayout = typename FromIterator::layout; using destLayout = typename ToIterator::layout; static_assert(sourceLayout::__is_matmul2d_cooperative_tensor_layout, "Source must be a matmul2d destination cooperative tensor"); static_assert(sourceLayout::__operand_index == __matmul2d_cooperative_operand_index::destination, "Source
// map_index (begun above) continues: static checks that the source is a
// matmul2d destination cooperative tensor, that both scopes are a single
// SIMD group, and that M/N agree per reduction_dim, before calling the
// runtime mapping entry point. The row-reduction alias follows the struct.
must be a matmul2d destination cooperative tensor"); static_assert(destLayout::is_matmul2d_reduction_cooperative_destination_layout, "Destination must be a matmul2d reduction destination cooperative tensor"); static_assert(__tensor_ops_detail::__is_same_v, "map_index requires a single SIMD group"); static_assert(__tensor_ops_detail::__is_same_v, "map_index requires a single SIMD group"); metal::execution_threads t = scope(); int threads = t.size(); constexpr __matmul2d_descriptor sourceDesc = sourceLayout::matmul2d_desc; constexpr __matmul2d_descriptor destDesc = destLayout::matmul2d_desc; static_assert(reduction_dim == 0 || sourceDesc.n == destDesc.n, "Source and destination must have matching N dimension if reduction_dim = 1"); static_assert(reduction_dim == 1 || sourceDesc.m == destDesc.m, "Source and destination must have matching M dimension if reduction_dim = 0"); static_assert(__tensor_ops_detail::__is_same_v, "Source and destination element types must match"); __tensor_ops_detail::__tensor_ops_datatype srcLeftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype srcRightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; return __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_map_index( from_storage, sourceDesc, to_storage, destDesc, reduction_dim, threads, from_idx, srcLeftDataType, srcRightDataType); } }; template <__matmul2d_descriptor descriptor, typename scope, typename left_operand, typename right_operand, typename element_type, typename coord_type, typename... args> using __cooperative_tensor_row_reduction_destination_t = typename __reduction_operand_layout::cooperative_tensor_t; template <__matmul2d_descriptor descriptor, typename scope, typename left_operand, typename right_operand, typename element_type, typename coord_type, typename...
args> using __cooperative_tensor_column_reduction_destination_t = typename __reduction_operand_layout::cooperative_tensor_t; template <__matmul2d_descriptor descriptor, typename scope, typename left_operand, typename right_operand, typename element_type, typename coord_type, typename... args> __cooperative_tensor_row_reduction_destination_t __get_row_reduction_destination_cooperative_tensor() { static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v, "scope should be of type __tensorops_scope"); return __cooperative_tensor_row_reduction_destination_t(); } template <__matmul2d_descriptor descriptor, typename scope, typename left_operand, typename right_operand, typename element_type, typename coord_type, typename... args> __cooperative_tensor_column_reduction_destination_t __get_column_reduction_destination_cooperative_tensor() { static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v, "scope should be of type __tensorops_scope"); return __cooperative_tensor_column_reduction_destination_t(); } template struct __cooperative_tensor_layout; template struct __cooperative_tensor_layout> { using layout = L; }; template struct __cooperative_tensor_left_elem_type; template struct __cooperative_tensor_left_elem_type> { using type = typename L::left_element_t; }; template struct __cooperative_tensor_right_elem_type; template struct __cooperative_tensor_right_elem_type> { using type = typename L::right_element_t; }; template struct __cooperative_tensor_destination_elem_type; template struct __cooperative_tensor_destination_elem_type> { using type = typename L::destination_element_t; }; template<__matmul2d_cooperative_operand_index operand_index, class T> constexpr bool __is_cooperative_tensor_operand() { if constexpr (__tensor_ops_detail::__is_cooperative_tensor_type_v) { using layout = typename __cooperative_tensor_layout::layout; return layout::__is_matmul2d_cooperative_tensor_layout && layout::__operand_index == operand_index; } else return false; } 
// __run: the matmul2d dispatch entry point for this detail namespace. Each
// operand may be a rank-2 tensor (with int index type) or the matching
// matmul2d cooperative-tensor operand; after validating operand kinds,
// ranks, element types, and descriptor shape constraints per execution
// scope, it calls an externally-defined kernel entry point selected by
// (left, right, destination) element type and address space.
// NOTE(review): the function body continues past the end of this excerpt;
// only the visible portion is documented here.
template <__matmul2d_descriptor descriptor, typename scope, typename left_operand, typename right_operand, typename destination_operand, typename... args> void __run(thread left_operand &leftIn, thread right_operand &rightIn, thread destination_operand &destinationT) { using leftTensorType = __tensor_ops_detail::__remove_addrspace_t< __tensor_ops_detail::__remove_reference_t>; using rightTensorType = __tensor_ops_detail::__remove_addrspace_t< __tensor_ops_detail::__remove_reference_t>; using destinationTensorType = __tensor_ops_detail::__remove_addrspace_t< __tensor_ops_detail::__remove_reference_t>; metal::execution_threads t = scope(); int threads = t.size(); static_assert(__tensor_ops_detail::__is_tensor_type_v || __is_cooperative_tensor_operand<__matmul2d_cooperative_operand_index::left, leftTensorType>(), "Left operand must be a tensor or matmul2d left input cooperative tensor"); static_assert(__tensor_ops_detail::__is_tensor_type_v || __is_cooperative_tensor_operand<__matmul2d_cooperative_operand_index::right, rightTensorType>(), "Right operand must be a tensor or matmul2d right input cooperative tensor"); static_assert(__tensor_ops_detail::__is_tensor_type_v || __is_cooperative_tensor_operand<__matmul2d_cooperative_operand_index::destination, destinationTensorType>(), "Destination operand must be a tensor or matmul2d destination cooperative tensor"); static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v, "scope should be of type __tensorops_scope"); static_assert(__tensor_ops_detail::__get_rank() == 2, "Operand must have rank 2"); static_assert(__tensor_ops_detail::__get_rank() == 2, "Operand must have rank 2"); static_assert(__tensor_ops_detail::__get_rank() == 2, "Operand must have rank 2"); static_assert( __tensor_ops_detail::__is_same_v, "Index type must be int"); static_assert( __tensor_ops_detail::__is_same_v, "Index type must be int"); static_assert(__tensor_ops_detail::__is_same_v< typename destinationTensorType::index_type, int>,
// Extract pointer/value types, then cross-check the element types declared
// on any cooperative-tensor operand against the actual operand value types.
"Index type must be int"); using leftPtrType = typename leftTensorType::data_handle_type; using rightPtrType = typename rightTensorType::data_handle_type; using destinationPtrType = typename destinationTensorType::data_handle_type; using leftValueType = __tensor_ops_detail::__remove_addrspace_t< __tensor_ops_detail::__remove_reference_t< typename leftTensorType::element_type>>; using rightValueType = __tensor_ops_detail::__remove_addrspace_t< __tensor_ops_detail::__remove_reference_t< typename rightTensorType::element_type>>; using destinationValueType = __tensor_ops_detail::__remove_addrspace_t< __tensor_ops_detail::__remove_reference_t< typename destinationTensorType::element_type>>; if constexpr (__tensor_ops_detail::__is_cooperative_tensor_type_v || __tensor_ops_detail::__is_cooperative_tensor_type_v) { static_assert(__tensor_ops_detail::__is_same_v, "Input cooperative tensors require a single SIMD group"); static_assert(descriptor.k != static_cast(metal::dynamic_extent) && descriptor.k != dynamic_length_v, "Inner dimension cannot be dynamic with input cooperative tensors"); } __matmul2d_descriptor desc = descriptor; // Check types declared on cooperative tensors match actual inputs to run() if constexpr (__tensor_ops_detail::__is_cooperative_tensor_type_v) { using _leftType = typename __cooperative_tensor_left_elem_type::type; using _rightType = typename __cooperative_tensor_right_elem_type::type; using _destinationType = typename __cooperative_tensor_destination_elem_type::type; static_assert(__tensor_ops_detail::__is_same_v<_leftType, leftValueType>, "Input types must match cooperative tensor types"); static_assert(__tensor_ops_detail::__is_same_v<_rightType, rightValueType>, "Input types must match cooperative tensor types"); static_assert(__tensor_ops_detail::__is_same_v<_destinationType, destinationValueType>, "Input types must match cooperative tensor types"); } if constexpr (__tensor_ops_detail::__is_cooperative_tensor_type_v) { using _leftType =
// Same declared-vs-actual element-type checks for the right and destination
// cooperative-tensor operands, then shape constraints when both inputs are
// cooperative tensors.
typename __cooperative_tensor_left_elem_type::type; using _rightType = typename __cooperative_tensor_right_elem_type::type; using _destinationType = typename __cooperative_tensor_destination_elem_type::type; static_assert(__tensor_ops_detail::__is_same_v<_leftType, leftValueType>, "Input types must match cooperative tensor types"); static_assert(__tensor_ops_detail::__is_same_v<_rightType, rightValueType>, "Input types must match cooperative tensor types"); static_assert(__tensor_ops_detail::__is_same_v<_destinationType, destinationValueType>, "Input types must match cooperative tensor types"); } if constexpr (__tensor_ops_detail::__is_cooperative_tensor_type_v) { using _leftType = typename __cooperative_tensor_left_elem_type::type; using _rightType = typename __cooperative_tensor_right_elem_type::type; using _destinationType = typename __cooperative_tensor_destination_elem_type::type; static_assert(__tensor_ops_detail::__is_same_v<_leftType, leftValueType>, "Input types must match cooperative tensor types"); static_assert(__tensor_ops_detail::__is_same_v<_rightType, rightValueType>, "Input types must match cooperative tensor types"); static_assert(__tensor_ops_detail::__is_same_v<_destinationType, destinationValueType>, "Input types must match cooperative tensor types"); } if constexpr (__tensor_ops_detail::__is_cooperative_tensor_type_v && __tensor_ops_detail::__is_cooperative_tensor_type_v) { static_assert(descriptor.m == 32 || descriptor.n == 32 || descriptor.k == 32, "At least one of M, N, or K must be 32 if both inputs are cooperative tensors"); static_assert(descriptor.m == 16 || descriptor.m == 32, "M must be 16 or 32 if both inputs are cooperative tensors"); static_assert(descriptor.n == 16 || descriptor.n == 32, "N must be 16 or 32 if both inputs are cooperative tensors"); static_assert(descriptor.k == 16 || descriptor.k == 32, "K must be 16 or 32 if both inputs are cooperative tensors"); } if constexpr (!__tensor_ops_detail::__is_same_v) { // SIMD
// Shape constraints by scope. SIMD-group scopes: M/N must be multiples of
// 8 or 16 (at least one a multiple of 16); K dynamic or a multiple of 16
// (32 for sub-byte element formats when __HAVE_INT4B_FORMAT_TYPE__).
// Single-thread scope relaxes M/N to 1, 2, 4, or multiples of 8.
group(s) scope static_assert((descriptor.m % 8) == 0 || (descriptor.m % 16) == 0, "M must be a multiple of 8 or 16"); static_assert((descriptor.n % 8) == 0 || (descriptor.n % 16) == 0, "N must be a multiple of 8 or 16"); static_assert((descriptor.m % 16) == 0 || (descriptor.n % 16) == 0, "At least one of M or N must be a multiple of 16"); if constexpr (descriptor.k != static_cast(metal::dynamic_extent) && descriptor.k != dynamic_length_v) { #if __HAVE_INT4B_FORMAT_TYPE__ if constexpr (metal::is_same_v || metal::is_same_v || metal::is_same_v || metal::is_same_v) { static_assert((descriptor.k % 32) == 0, "K must be dynamic or a multiple of 32 with sub-byte element types"); } else #endif { static_assert((descriptor.k % 16) == 0, "K must be dynamic or a multiple of 16"); } } } else { // Single thread scope static_assert(descriptor.m == 1 || descriptor.m == 2 || descriptor.m == 4 || (descriptor.m % 8) == 0, "M must be 1, 2, 4, or a multiple of 8 with execution_thread"); static_assert(descriptor.n == 1 || descriptor.n == 2 || descriptor.n == 4 || (descriptor.n % 8) == 0, "N must be 1, 2, 4, or a multiple of 8 with execution_thread"); if constexpr (descriptor.k != static_cast(metal::dynamic_extent) && descriptor.k != dynamic_length_v) static_assert((descriptor.k % 16) == 0, "K must be dynamic or a multiple of 16"); } // single thread if constexpr (__tensor_ops_detail::__is_same_v) { if constexpr (!__tensor_ops_detail::__is_cooperative_tensor_type_v && !__tensor_ops_detail::__is_cooperative_tensor_type_v && !__tensor_ops_detail::__is_cooperative_tensor_type_v) { const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type(); const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type(); const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType =
// Single-thread, non-cooperative path: resolve tensor descriptor types and
// dispatch on (left, right, destination) element type x address space.
__tensor_ops_detail::__tensor_type_to_tensor_descriptor_type(); thread void *left = (thread void *)(&leftIn); thread void *right = (thread void *)(&rightIn); thread void *destination = (thread void *)(&destinationT); if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v &&
// Remaining f16/f16/f16 address-space combinations, then the f16/i8/f16
// block.
__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f16(desc, left, leftDescType, right, rightDescType, destination,
// Remaining f16/i8/f16 combinations.
destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v)
// f16/ui8/f16 combinations.
__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_ui8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_ui8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_ui8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_ui8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_ui8_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_ui8_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_ui8_th_f16(desc, left, leftDescType, right, rightDescType,
// Tail of f16/ui8/f16, then the i8/f16/f16 block (excerpt ends
// mid-dispatch).
destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_ui8_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v)
__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && 
__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, 
destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) 
__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, 
destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) 
__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && 
__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, 
destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && 
__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, 
destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) 
__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) 
__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, 
destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr 
(__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr 
(__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_i8_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_i8_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_i8_dv_i32(desc, left, 
leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_th_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_i8_th_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_i8_th_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_i8_th_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_ui8_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) 
__tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_ui8_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_ui8_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_ui8_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_ui8_th_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_ui8_th_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_ui8_th_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_ui8_th_i32(desc, left, leftDescType, right, rightDescType, 
destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) 
__tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && 
__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, 
destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && 
__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_th_b16(desc, left, leftDescType, right, rightDescType, destination, 
destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) 
__tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) 
__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, 
destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr 
(__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr 
(__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_f16(desc, left, 
leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_f32(desc, left, 
leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr 
(__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f32(desc, left, 
leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr 
(__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_ui8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_ui8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_ui8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_ui8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_ui8_th_b16(desc, left, 
leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_ui8_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_ui8_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_ui8_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_ui8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr 
(__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && 
__tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr 
(__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_dv_ui8_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v && __tensor_ops_detail::__is_thread_addrspace_v) __tensorops_impl_matmul2d_op_run_single_thread_th_ui8_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported type"); } else static_assert( __tensor_ops_detail::__assert_false_v, "Operands cannot be cooperative tensor with execution_thread "); } else { // multiple threads if constexpr (!__tensor_ops_detail::__is_cooperative_tensor_type_v && !__tensor_ops_detail::__is_cooperative_tensor_type_v && !__tensor_ops_detail::__is_cooperative_tensor_type_v) { const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type(); const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType = 
__tensor_ops_detail::__tensor_type_to_tensor_descriptor_type(); const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type(); thread void *left = (thread void *)(&leftIn); thread void *right = (thread void *)(&rightIn); thread void *destination = (thread void *)(&destinationT); if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f16(desc, left, leftDescType, right, rightDescType, 
destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) 
__tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr 
(__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && 
__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_dv_f16(desc, left, 
leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else 
static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f32(desc, 
left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && 
__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_tg_f16_dv_f32(desc, 
left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_tg_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_dv_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_dv_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_tg_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_tg_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_tg_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_tg_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_dv_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_dv_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_tg_f32_tg_f32(desc, left, 
leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_tg_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_tg_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_tg_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_dv_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_dv_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_tg_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_tg_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_dv_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_dv_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_tg_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_tg_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_dv_ui8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_dv_ui8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_tg_ui8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_tg_ui8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr 
(__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_dv_f32(desc, left, 
leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_tg_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_tg_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_dv_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_dv_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_tg_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_tg_f32_tg_f32(desc, 
left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_dv_i8_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_dv_i8_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_tg_i8_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_tg_i8_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_dv_i8_tg_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_dv_i8_tg_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_tg_i8_tg_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_tg_i8_tg_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_dv_ui8_dv_i32(desc, left, 
leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_dv_ui8_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_tg_ui8_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_tg_ui8_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_dv_ui8_tg_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_dv_ui8_tg_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_tg_ui8_tg_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_tg_ui8_tg_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_b16(desc, left, 
leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if 
constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && 
__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_f32(desc, left, 
leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else 
static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_tg_f32(desc, 
left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && 
__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f16(desc, 
left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f32(desc, left, 
leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if 
constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr 
(__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui8_dv_b16(desc, 
left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) 
__tensorops_impl_matmul2d_op_run_tg_b16_tg_ui8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, 
destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_i4_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_i4_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_i4_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_i4_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_i4_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_i4_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_i4_tg_f16(desc, left, 
leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_i4_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_i4_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_i4_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_i4_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_i4_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_i4_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_i4_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_i4_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_i4_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui4_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui4_dv_f16(desc, left, leftDescType, 
right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui4_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui4_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui4_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui4_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui4_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui4_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else 
static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui4_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui4_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui4_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui4_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui4_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) 
__tensorops_impl_matmul2d_op_run_tg_f16_dv_ui4_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui4_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui4_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_dv_i4_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_dv_i4_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_tg_i4_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_tg_i4_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_dv_i4_tg_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_dv_i4_tg_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_i8_tg_i4_tg_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_i8_tg_i4_tg_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) 
__tensorops_impl_matmul2d_op_run_dv_ui8_dv_ui4_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_dv_ui4_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_tg_ui4_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_tg_ui4_dv_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_dv_ui4_tg_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_dv_ui4_tg_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_ui8_tg_ui4_tg_i32(desc, left, leftDescType, right, rightDescType, 
destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_ui8_tg_ui4_tg_i32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_i4_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_i4_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_i4_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_i4_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_i4_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_i4_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_i4_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_i4_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui4_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui4_dv_b16(desc, left, leftDescType, right, rightDescType, destination, 
destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui4_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui4_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui4_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui4_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui4_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui4_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported 
address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_i4_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_i4_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_i4_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_i4_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_i4_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_i4_tg_f32(desc, left, leftDescType, right, 
rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_i4_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_i4_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui4_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui4_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui4_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v 
&& __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui4_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_dv_ui4_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_dv_ui4_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_dv_b16_tg_ui4_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_tg_b16_tg_ui4_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported type"); } else if constexpr (!__tensor_ops_detail::__is_cooperative_tensor_type_v && !__tensor_ops_detail::__is_cooperative_tensor_type_v && __tensor_ops_detail::__is_cooperative_tensor_type_v) { const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type(); const 
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type(); thread void *left = (thread void *)(&leftIn); thread void *right = (thread void *)(&rightIn); thread void *destination = (thread void *)&destinationT[__tensor_ops_detail::__tensor_ops_reserved_index]; if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i8_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i8_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i8_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i8_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_ui8_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_ui8_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_ui8_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_ui8_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } 
else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) 
__tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) 
__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) 
__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_ui8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_ui8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_ui8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_ui8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_f16_f32(desc, 
left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && 
__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_ui8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_ui8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_ui8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); 
else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_ui8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr 
(__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else 
static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_i8_i32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_i8_i32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_i8_i32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_i8_i32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_ui8_i32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_ui8_i32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_ui8_i32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_ui8_i32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) 
__tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) 
__tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_b16(desc, 
left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && 
__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); 
else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v 
&& __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); 
} else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) 
__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) 
__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_ui8_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_ui8_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_ui8_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) 
__tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_ui8_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_ui8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_ui8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_ui8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_ui8_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_b16_b16(desc, 
left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && 
__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i4_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i4_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i4_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i4_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i4_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i4_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) 
__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i4_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i4_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_ui4_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_ui4_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_ui4_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_ui4_f16(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_ui4_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_ui4_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_ui4_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_ui4_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_i4_i32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_i4_i32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_i4_i32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr 
(__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_i4_i32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_ui4_i32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_ui4_i32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_ui4_i32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_ui4_i32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i4_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if 
constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i4_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i4_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i4_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_ui4_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_ui4_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_ui4_b16(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_ui4_b16(desc, left, leftDescType, right, rightDescType, destination, 
threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i4_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i4_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i4_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i4_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_ui4_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_ui4_f32(desc, left, leftDescType, right, rightDescType, 
destination, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_ui4_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_ui4_f32(desc, left, leftDescType, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported type"); } else if constexpr (!__tensor_ops_detail::__is_cooperative_tensor_type_v && __tensor_ops_detail::__is_cooperative_tensor_type_v && !__tensor_ops_detail::__is_cooperative_tensor_type_v) { const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type(); const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type(); thread void *left = (thread void *)(&leftIn); thread void *right = (thread void *)&rightIn[__tensor_ops_detail::__tensor_ops_reserved_index]; thread void *destination = (thread void *)(&destinationT); if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_dv_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_dv_f16(desc, left, leftDescType, right, destination, 
destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_tg_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_tg_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_dv_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_dv_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_tg_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_tg_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && 
__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_dv_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_dv_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_tg_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_tg_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_dv_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_dv_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_tg_f16(desc, left, leftDescType, right, 
destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_tg_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_dv_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_dv_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_tg_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_tg_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_dv_f32(desc, left, leftDescType, right, destination, 
destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f32_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f32_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f32_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f32_tg_f32(desc, left, 
leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_dv_f32(desc, left, leftDescType, right, destination, 
destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f16_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f16_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && 
__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f32_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f32_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f32_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f32_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_i8_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_i8_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_i8_tg_f32(desc, left, leftDescType, right, 
destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_i8_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_ui8_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_ui8_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_ui8_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_ui8_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_dv_f32(desc, left, leftDescType, right, destination, 
destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_tg_f32(desc, left, leftDescType, 
right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f32_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f32_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f32_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f32_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f32_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f32_dv_f32(desc, left, leftDescType, right, destination, 
destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f32_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f32_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_i8_dv_i32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_i8_dv_i32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_i8_tg_i32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_i8_tg_i32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && 
__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_ui8_dv_i32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_ui8_dv_i32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_ui8_tg_i32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_ui8_tg_i32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_dv_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_dv_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_tg_b16(desc, left, leftDescType, right, 
destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_tg_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f32_dv_f32(desc, left, leftDescType, right, destination, 
destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f32_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f32_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f32_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_dv_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_dv_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_tg_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_tg_b16(desc, left, leftDescType, 
right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_b16_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_b16_dv_f32(desc, left, leftDescType, right, destination, 
destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_b16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_b16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_dv_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_dv_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_tg_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_tg_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && 
__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_dv_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_dv_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_tg_b16(desc, left, leftDescType, right, 
destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_tg_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_dv_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_dv_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_tg_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_tg_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_dv_f32(desc, left, leftDescType, right, destination, 
destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_b16_dv_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_b16_dv_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_b16_tg_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_b16_tg_b16(desc, left, 
leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_b16_dv_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_b16_dv_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_b16_tg_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_b16_tg_f16(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_b16_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_b16_dv_f32(desc, left, leftDescType, right, 
destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_b16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_b16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_ui8_dv_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_ui8_dv_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_ui8_tg_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_ui8_tg_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && 
__tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_ui8_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_ui8_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_ui8_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_ui8_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_b16_dv_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_b16_dv_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_b16_tg_b16(desc, left, leftDescType, right, 
destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_b16_tg_b16(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_b16_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_b16_dv_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_b16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_b16_tg_f32(desc, left, leftDescType, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported type"); } else if constexpr (!__tensor_ops_detail::__is_cooperative_tensor_type_v && __tensor_ops_detail::__is_cooperative_tensor_type_v && __tensor_ops_detail::__is_cooperative_tensor_type_v) { const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType = 
__tensor_ops_detail::__tensor_type_to_tensor_descriptor_type(); thread void *left = (thread void *)(&leftIn); thread void *right = (thread void *)&rightIn[__tensor_ops_detail::__tensor_ops_reserved_index]; thread void *destination = (thread void *)&destinationT[__tensor_ops_detail::__tensor_ops_reserved_index]; if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_f16(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_f16(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_f16(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_f16(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_f16(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_f16(desc, left, leftDescType, right, destination, threads); else 
static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_f16(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_f16(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_f16(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_f16(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr 
(__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f32_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f32_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f16_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) 
__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f16_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f32_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f32_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_i8_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_i8_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_ui8_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_ui8_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && 
__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f32_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f32_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f32_f32(desc, left, leftDescType, right, destination, threads); else if constexpr 
(__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f32_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_i8_i32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_i8_i32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_ui8_i32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_ui8_i32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_b16(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_b16(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if 
constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f32_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f32_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_b16(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_b16(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_f32(desc, left, leftDescType, 
right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_b16_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_b16_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_b16(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_b16(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_f32(desc, left, leftDescType, right, destination, threads); else 
static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_b16(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_b16(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_f16(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_f16(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr 
(__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_b16_b16(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_b16_b16(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_b16_f16(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_b16_f16(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_b16_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_b16_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_ui8_b16(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) 
__tensorops_impl_matmul2d_op_run_cooperative_tg_b16_ui8_b16(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_ui8_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_ui8_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_b16_b16(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_b16_b16(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_b16_f32(desc, left, leftDescType, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_b16_f32(desc, left, leftDescType, right, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else 
static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported type"); } else if constexpr (__tensor_ops_detail::__is_cooperative_tensor_type_v && !__tensor_ops_detail::__is_cooperative_tensor_type_v && !__tensor_ops_detail::__is_cooperative_tensor_type_v) { const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type(); const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type(); thread void *left = (thread void *)&leftIn[__tensor_ops_detail::__tensor_ops_reserved_index]; thread void *right = (thread void *)(&rightIn); thread void *destination = (thread void *)(&destinationT); if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_dv_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_dv_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_tg_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_tg_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } 
else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_dv_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_dv_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_tg_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_tg_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_dv_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_dv_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) 
__tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_tg_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_tg_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_dv_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_dv_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_tg_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_tg_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) 
__tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_dv_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_dv_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_tg_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_tg_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && 
__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f32_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f32_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f32_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f32_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && 
__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else 
static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f16_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f16_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f32_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f32_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f32_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f32_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_i8_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_i8_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_i8_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_i8_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr 
(__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_ui8_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_ui8_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_ui8_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_ui8_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); 
else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f32_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f32_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f32_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f32_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f32_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f32_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f32_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f32_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, 
threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i8_dv_i32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i8_dv_i32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i8_tg_i32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i8_tg_i32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui8_dv_i32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui8_dv_i32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui8_tg_i32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui8_tg_i32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_dv_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_dv_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_tg_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_tg_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr 
(__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f32_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f32_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f32_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, 
threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f32_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_dv_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_dv_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_tg_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_tg_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if 
constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_b16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_b16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_b16_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_b16_tg_f32(desc, left, right, rightDescType, destination, 
destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_dv_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_dv_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_tg_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_tg_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); 
else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_dv_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_dv_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_tg_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_tg_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr 
(__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_dv_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_dv_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_tg_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_tg_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, 
threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_b16_dv_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_b16_dv_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_b16_tg_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_b16_tg_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_b16_dv_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if 
constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_b16_dv_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_b16_tg_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_b16_tg_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_b16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_b16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_b16_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_b16_tg_f32(desc, left, right, rightDescType, destination, 
destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui8_dv_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui8_dv_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui8_tg_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui8_tg_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui8_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui8_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, 
threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui8_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui8_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_b16_dv_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_b16_dv_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_b16_tg_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_b16_tg_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { 
if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_b16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_b16_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_b16_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_b16_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i4_dv_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i4_dv_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i4_tg_f16(desc, left, right, rightDescType, 
destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i4_tg_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i4_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i4_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i4_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i4_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) 
__tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui4_dv_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui4_dv_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui4_tg_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui4_tg_f16(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui4_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui4_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui4_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui4_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i4_dv_i32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i4_dv_i32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i4_tg_i32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i4_tg_i32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui4_dv_i32(desc, left, right, rightDescType, destination, 
destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui4_dv_i32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui4_tg_i32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui4_tg_i32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i4_dv_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i4_dv_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i4_tg_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) 
__tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i4_tg_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui4_dv_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui4_dv_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui4_tg_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui4_tg_b16(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i4_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v 
&& __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i4_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i4_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i4_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui4_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui4_dv_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_device_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui4_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v && __tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui4_tg_f32(desc, left, right, rightDescType, destination, destinationDescType, threads); 
else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported type"); } else if constexpr (__tensor_ops_detail::__is_cooperative_tensor_type_v && !__tensor_ops_detail::__is_cooperative_tensor_type_v && __tensor_ops_detail::__is_cooperative_tensor_type_v) { const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type(); thread void *left = (thread void *)&leftIn[__tensor_ops_detail::__tensor_ops_reserved_index]; thread void *right = (thread void *)(&rightIn); thread void *destination = (thread void *)&destinationT[__tensor_ops_detail::__tensor_ops_reserved_index]; if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_f16(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_f16(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_f16(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_f16(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v 
&& __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_f16(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_f16(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_f16(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_f16(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_f16(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_f16(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_f32(desc, left, right, rightDescType, destination, threads); else if constexpr 
(__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f32_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f32_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } 
else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f16_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f16_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f32_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f32_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_i8_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_i8_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_ui8_f32(desc, left, 
right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_ui8_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f32_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f32_f32(desc, left, right, rightDescType, destination, threads); else 
static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f32_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f32_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i8_i32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i8_i32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui8_i32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui8_i32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr 
(__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_b16(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_b16(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f32_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f32_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_b16(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) 
__tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_b16(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_b16_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_b16_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_b16(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_b16(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && 
__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_b16(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_b16(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_f16(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_f16(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_f32(desc, left, right, rightDescType, destination, threads); else if 
constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_b16_b16(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_b16_b16(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_b16_f16(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_b16_f16(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_b16_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_b16_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported 
address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui8_b16(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui8_b16(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui8_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui8_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_b16_b16(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_b16_b16(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) 
__tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_b16_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_b16_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i4_f16(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i4_f16(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i4_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i4_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui4_f16(desc, left, right, rightDescType, destination, threads); else if constexpr 
(__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui4_f16(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui4_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui4_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i4_i32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i4_i32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui4_i32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui4_i32(desc, left, right, rightDescType, 
destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i4_b16(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i4_b16(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui4_b16(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui4_b16(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if __HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i4_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i4_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif #if 
__HAVE_INT4B_FORMAT_TYPE__ else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_ui4_f32(desc, left, right, rightDescType, destination, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_ui4_f32(desc, left, right, rightDescType, destination, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } #endif else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported type"); } else if constexpr (__tensor_ops_detail::__is_cooperative_tensor_type_v && __tensor_ops_detail::__is_cooperative_tensor_type_v && !__tensor_ops_detail::__is_cooperative_tensor_type_v) { const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType = __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type(); thread void *left = (thread void *)&leftIn[__tensor_ops_detail::__tensor_ops_reserved_index]; thread void *right = (thread void *)&rightIn[__tensor_ops_detail::__tensor_ops_reserved_index]; thread void *destination = (thread void *)(&destinationT); if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_dv_f16(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_tg_f16(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if 
constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_dv_f16(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_tg_f16(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_dv_f16(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_tg_f16(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_dv_f16(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_tg_f16(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_dv_f16(desc, left, right, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_tg_f16(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_f32_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_f32_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_tg_f32(desc, left, right, destination, destinationDescType, threads); else 
static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_f16_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_f16_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_f32_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_f32_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if 
constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_i8_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_i8_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_ui8_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_ui8_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_f32_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_f32_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_f32_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_f32_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_i8_dv_i32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_i8_tg_i32(desc, left, right, destination, destinationDescType, threads); else 
static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_ui8_dv_i32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_ui8_tg_i32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_dv_b16(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_tg_b16(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if 
constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_f32_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_f32_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_dv_b16(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_tg_b16(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_b16_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_b16_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_dv_b16(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_tg_b16(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_dv_b16(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_tg_b16(desc, left, right, destination, destinationDescType, threads); else 
static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_dv_f16(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_tg_f16(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_dv_b16(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_tg_b16(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if 
constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_dv_f16(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_tg_f16(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_ui8_dv_b16(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_ui8_tg_b16(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_ui8_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr 
(__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_ui8_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_b16_dv_b16(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_b16_tg_b16(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) { if constexpr (__tensor_ops_detail::__is_device_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_b16_dv_f32(desc, left, right, destination, destinationDescType, threads); else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_b16_tg_f32(desc, left, right, destination, destinationDescType, threads); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported address space"); } else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported type"); } else if constexpr (__tensor_ops_detail::__is_cooperative_tensor_type_v && __tensor_ops_detail::__is_cooperative_tensor_type_v && __tensor_ops_detail::__is_cooperative_tensor_type_v) { thread void *left = (thread void *)&leftIn[__tensor_ops_detail::__tensor_ops_reserved_index]; thread void *right = (thread void *)&rightIn[__tensor_ops_detail::__tensor_ops_reserved_index]; thread void *destination = (thread void 
*)&destinationT[__tensor_ops_detail::__tensor_ops_reserved_index]; if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_f16(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_f16(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_f16(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_f16(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_f16(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_f32_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) 
__tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_f16_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_f32_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_i8_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_ui8_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_f32_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_f32_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && 
__tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_i8_i32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_ui8_ui8_i32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_b16(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_f32_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_b16(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_f32_b16_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_b16(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && 
__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_b16(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_f16(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_b16(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_f16(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_f16_b16_f32(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_ui8_b16(desc, left, right, destination, threads); else if constexpr (__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_run_cooperative_b16_ui8_f32(desc, left, right, destination, threads); else if constexpr 
// NOTE(review): throughout this region the template parameter/argument
// lists appear to have been stripped in transit (`template` with no
// <...> list, `__is_same_v` with no arguments, `cooperative_tensor`
// with no element/extents/layout arguments).  The code below preserves
// the surviving tokens as-is; restore the angle-bracketed lists from
// the original SDK header before attempting to compile.
//
// Tail of the matmul2d run dispatch (all-cooperative-operand branch)
// that begins earlier in this file: the final ui8 x b16 element-type
// combinations, then the closing braces of the dispatch function.
(__tensor_ops_detail::__is_same_v && __tensor_ops_detail::__is_same_v &&
 __tensor_ops_detail::__is_same_v)
  __tensorops_impl_matmul2d_op_run_cooperative_ui8_b16_b16(
      desc, left, right, destination, threads);
else if constexpr (__tensor_ops_detail::__is_same_v &&
                   __tensor_ops_detail::__is_same_v &&
                   __tensor_ops_detail::__is_same_v)
  __tensorops_impl_matmul2d_op_run_cooperative_ui8_b16_f32(
      desc, left, right, destination, threads);
else
  // No overload for this (left, right, destination) element combination.
  static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported type");
}
}
}

// Reduce every row of a matmul2d cooperative *destination* tensor into a
// rank-1 cooperative tensor, combining elements with `op` (default: sum)
// seeded by `identity` (default: 0).  Validates source/destination
// layouts and descriptors at compile time, then dispatches on the
// element type to the externally defined reduce_rows entry points.
template inline void __reduce_rows(
    thread metal::cooperative_tensor &sourceT,
    thread metal::cooperative_tensor &destT,
    ElementType identity = (ElementType)0,
    __reduction_operation op = reduction_operation::sum)
{
  // Source must be the destination operand of a matmul2d operation.
  static_assert(SrcLayout::__is_matmul2d_cooperative_tensor_layout,
                "Source must be matmul2d cooperative destination tensor");
  static_assert(SrcLayout::__operand_index ==
                    __matmul2d_cooperative_operand_index::destination,
                "Source must be matmul2d cooperative destination tensor");
  // Destination must be a row-reduction layout (__reduction_dim == 0).
  static_assert(DstLayout::is_matmul2d_reduction_cooperative_destination_layout,
                "Destination must be matmul2d row reduction cooperative destination tensor");
  static_assert(DstLayout::__reduction_dim == 0,
                "Destination must be matmul2d row reduction cooperative destination tensor");
  // NOTE(review): the two asserts below lost their __is_same_v arguments;
  // per the messages they enforce a single-SIMD-group execution scope.
  static_assert(__tensor_ops_detail::__is_same_v,
                "reduce_rows requires a single SIMD group");
  static_assert(__tensor_ops_detail::__is_same_v,
                "reduce_rows requires a single SIMD group");
  static_assert(SrcExtents::rank() == 2, "Source rank must be 2");
  static_assert(DstExtents::rank() == 1, "Destination rank must be 1");
  // Descriptors baked into the two layouts; equality is checked below.
  constexpr __matmul2d_descriptor sourceDesc = SrcLayout::matmul2d_desc;
  constexpr __matmul2d_descriptor destDesc = DstLayout::matmul2d_desc;
  // Operand element types recorded in the destination layout, with any
  // address-space and reference qualifiers removed.
  using dstLeftValueType = __tensor_ops_detail::__remove_addrspace_t<
      __tensor_ops_detail::__remove_reference_t< typename DstLayout::left_elem_t>>;
  using dstRightValueType = __tensor_ops_detail::__remove_addrspace_t<
      __tensor_ops_detail::__remove_reference_t< typename DstLayout::right_elem_t>>;
  static_assert(matmul2d_descriptor_is_equal(sourceDesc, destDesc),
                "Source and destination matmul2d descriptor must match");
  // NOTE(review): the three asserts below also lost their __is_same_v
  // arguments; the messages state they match operand and element types.
  static_assert(__tensor_ops_detail::__is_same_v,
                "Source and destination operand types must match");
  static_assert(__tensor_ops_detail::__is_same_v,
                "Source and destination operand types must match");
  static_assert(__tensor_ops_detail::__is_same_v,
                "Source and destination element types must match");
  // Runtime datatype tags forwarded to the implementation so it can
  // interpret the cooperative-tensor backing storage.
  __tensor_ops_detail::__tensor_ops_datatype leftDataType =
      __tensor_ops_detail::__type_to_tensor_ops_datatype::value;
  __tensor_ops_detail::__tensor_ops_datatype rightDataType =
      __tensor_ops_detail::__type_to_tensor_ops_datatype::value;
  // Indexing with the reserved index yields the implementation's opaque
  // thread-private backing pointer for each cooperative tensor.
  thread void *src =
      (thread void *)&sourceT[__tensor_ops_detail::__tensor_ops_reserved_index];
  thread void *dst =
      (thread void *)&destT[__tensor_ops_detail::__tensor_ops_reserved_index];
  __matmul2d_descriptor desc = SrcLayout::matmul2d_desc;
  // Element-type dispatch (f16 / i32 / f32 / b16) to the externally
  // defined row-reduction implementations.
  if constexpr (__tensor_ops_detail::__is_same_v)
    __tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_f16(
        desc, src, dst, identity, op, leftDataType, rightDataType);
  else if constexpr (__tensor_ops_detail::__is_same_v)
    __tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_i32(
        desc, src, dst, identity, op, leftDataType, rightDataType);
  else if constexpr (__tensor_ops_detail::__is_same_v)
    __tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_f32(
        desc, src, dst, identity, op, leftDataType, rightDataType);
  else if constexpr (__tensor_ops_detail::__is_same_v)
    __tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_b16(
        desc, src, dst, identity, op, leftDataType, rightDataType);
  else
    static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported type");
}

// Column-wise counterpart of __reduce_rows (definition continues past
// this point in the file).
template inline void __reduce_columns(
    thread metal::cooperative_tensor &sourceT,
    thread metal::cooperative_tensor &destT,
    ElementType identity = (ElementType)0,
    __reduction_operation op =
// Interior of __reduce_columns: compile-time validation (column reduction,
// matching descriptors / operand types / element types, single-SIMD-group
// scope, ranks 2 -> 1), then the runtime datatype tags and the per-thread
// source/destination element pointers consumed by the element-type dispatch
// that follows on the next line.
reduction_operation::sum) { static_assert(SrcLayout::__is_matmul2d_cooperative_tensor_layout, "Source must be matmul2d cooperative destination tensor"); static_assert(SrcLayout::__operand_index == __matmul2d_cooperative_operand_index::destination, "Source must be matmul2d cooperative destination tensor"); static_assert(DstLayout::__reduction_dim == 1, "Destination must be matmul2d column reduction cooperative destination tensor"); static_assert(__tensor_ops_detail::__is_same_v, "reduce_columns requires a single SIMD group"); static_assert(__tensor_ops_detail::__is_same_v, "reduce_columns requires a single SIMD group"); static_assert(SrcExtents::rank() == 2, "Source rank must be 2"); static_assert(DstExtents::rank() == 1, "Destination rank must be 1"); constexpr __matmul2d_descriptor sourceDesc = SrcLayout::matmul2d_desc; constexpr __matmul2d_descriptor destDesc = DstLayout::matmul2d_desc; using dstLeftValueType = __tensor_ops_detail::__remove_addrspace_t< __tensor_ops_detail::__remove_reference_t< typename DstLayout::left_elem_t>>; using dstRightValueType = __tensor_ops_detail::__remove_addrspace_t< __tensor_ops_detail::__remove_reference_t< typename DstLayout::right_elem_t>>; static_assert(matmul2d_descriptor_is_equal(sourceDesc, destDesc), "Source and destination matmul2d descriptor must match"); static_assert(__tensor_ops_detail::__is_same_v, "Source and destination operand types must match"); static_assert(__tensor_ops_detail::__is_same_v, "Source and destination operand types must match"); static_assert(__tensor_ops_detail::__is_same_v, "Source and destination element types must match"); __tensor_ops_detail::__tensor_ops_datatype leftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype rightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; thread void *src = (thread void *)&sourceT[__tensor_ops_detail::__tensor_ops_reserved_index]; thread void *dst = (thread void
*)&destT[__tensor_ops_detail::__tensor_ops_reserved_index]; __matmul2d_descriptor desc = SrcLayout::matmul2d_desc; if constexpr (__tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_f16( desc, src, dst, identity, op, leftDataType, rightDataType); else if constexpr (__tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_i32( desc, src, dst, identity, op, leftDataType, rightDataType); else if constexpr (__tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_f32( desc, src, dst, identity, op, leftDataType, rightDataType); else if constexpr (__tensor_ops_detail::__is_same_v) __tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_b16( desc, src, dst, identity, op, leftDataType, rightDataType); else static_assert(__tensor_ops_detail::__assert_false_v, "Unsupported type"); } template inline bool __is_iterator_compatible( const thread metal::cooperative_tensor &sourceT, const thread metal::cooperative_tensor &destT) { if (!SrcLayout::__is_matmul2d_cooperative_tensor_layout || SrcLayout::__operand_index != __matmul2d_cooperative_operand_index::destination || !DstLayout::is_matmul2d_reduction_cooperative_destination_layout || !__tensor_ops_detail::__is_same_v || !__tensor_ops_detail::__is_same_v || !__tensor_ops_detail::__is_same_v || SrcExtents::rank() != 2 || DstExtents::rank() != 1) { return false; } constexpr __matmul2d_descriptor sourceDesc = SrcLayout::matmul2d_desc; constexpr __matmul2d_descriptor destDesc = DstLayout::matmul2d_desc; constexpr int reduction_dim = DstLayout::__reduction_dim; if ((reduction_dim == 0 && sourceDesc.m != destDesc.m) || (reduction_dim == 1 && sourceDesc.n == destDesc.n)) { return false; } thread void *src = (thread void *)&sourceT[__tensor_ops_detail::__tensor_ops_reserved_index]; thread void *dst = (thread void *)&destT[__tensor_ops_detail::__tensor_ops_reserved_index]; 
__tensor_ops_detail::__tensor_ops_datatype srcLeftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype srcRightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype srcElemDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype dstLeftDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype dstRightDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; __tensor_ops_detail::__tensor_ops_datatype dstElemDataType = __tensor_ops_detail::__type_to_tensor_ops_datatype::value; return __tensorops_impl_matmul2d_op_cooperative_destination_is_iterator_compatible( sourceDesc, destDesc, src, dst, srcLeftDataType, srcRightDataType, srcElemDataType, dstLeftDataType, dstRightDataType, dstElemDataType); } #undef EXTERNALLY_DEFINED_ATTR } // namespace __mutmul2d_detail #endif #endif // __TensorOpsMatMul2DImpl__