21template <
typename Gr
idwiseGemm,
bool HasMainKBlockLoop>
23#if CK_USE_LAUNCH_BOUNDS
26#if CK_USE_WAVES_PER_EU
27 __attribute__((amdgpu_waves_per_eu(CK_MIN_WAVES_PER_EU, CK_MAX_WAVES_PER_EU)))
31#if(defined(__gfx103__) || defined(__gfx11__))
32 __shared__
char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
35 GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(karg.M, karg.K, karg.AK0, karg.StrideA));
37 GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(karg.K, karg.N, karg.BK0, karg.StrideB));
39 GridwiseGemm::MakeCGridDescriptor_M_N(karg.M, karg.N, karg.StrideC));
41 GridwiseGemm::template Run<HasMainKBlockLoop>(karg.p_a_grid,
45 a_grid_desc_ak0_m_ak1,
46 b_grid_desc_bk0_n_bk1,
61 typename AElementwiseOperation,
62 typename BElementwiseOperation,
63 typename CElementwiseOperation,
74 typename ABlockTransferThreadClusterLengths_K0_M_K1,
75 typename ABlockTransferThreadClusterArrangeOrder,
76 typename ABlockTransferSrcAccessOrder,
77 index_t ABlockTransferSrcVectorDim,
78 index_t ABlockTransferSrcScalarPerVector,
79 index_t ABlockTransferDstScalarPerVector_K1,
80 bool AThreadTransferSrcResetCoordinateAfterRun,
82 typename BBlockTransferThreadClusterLengths_K0_N_K1,
83 typename BBlockTransferThreadClusterArrangeOrder,
84 typename BBlockTransferSrcAccessOrder,
85 index_t BBlockTransferSrcVectorDim,
86 index_t BBlockTransferSrcScalarPerVector,
87 index_t BBlockTransferDstScalarPerVector_K1,
88 bool BThreadTransferSrcResetCoordinateAfterRun,
90 typename CThreadTransferSrcDstAccessOrder,
91 index_t CThreadTransferSrcDstVectorDim,
92 index_t CThreadTransferDstScalarPerVector,
93 index_t NumGemmKPrefetchStage = 1,
117 return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, 1);
157 std::cout <<
"problem {" <<
"M:" <<
M <<
", " <<
"N:" <<
N <<
", " <<
"K:" <<
K <<
", "
160 <<
"AK0:" <<
AK0 <<
", " <<
"BK0:" <<
BK0 <<
"}" << std::endl;
179 const ABDataType* p_b_grid_,
180 CDataType* p_c_grid_,
187 :
Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_},
205 constexpr auto a_block_desc_ak0_m_ak1 = [&]() {
206 if constexpr(ABlockLdsExtraM)
219 return a_block_desc_ak0_m_ak1;
225 constexpr auto b_block_desc_bk0_n_bk1 = [&]() {
226 if constexpr(BBlockLdsExtraN)
239 return b_block_desc_bk0_n_bk1;
249 a_block_desc_ak0_m_ak1.GetElementSpaceSize(),
max_lds_align);
251 b_block_desc_bk0_n_bk1.GetElementSpaceSize(),
max_lds_align);
253 return (a_block_space_size_aligned + b_block_space_size_aligned) *
sizeof(ABDataType);
259 "Wrong! AK1 must be known at the time of compilation.");
261 "Wrong! BK1 must be known at the time of compilation.");
264 MPerBlock % (MPerDpp * MDppPerWave) == 0,
265 "Invalid tuning parameters! MPerBlock must be divisible by MPerDpp * MDppPerWave.");
267 NPerBlock % (NPerDpp * NDppPerWave) == 0,
268 "Invalid tuning parameters! NPerBlock must be divisible by NPerDpp * NDppPerWave.");
271 KPerBlock % AK1Value == 0 && KPerBlock % BK1Value == 0,
272 "Invalid tuning parameters! KPerBlock must be divisible by both AK1 and BK1.");
274 static_assert(AK1Value % ABlockTransferDstScalarPerVector_K1 == 0,
275 "Invalid tuning parameters! AK1Value must be divisible by "
276 "ABlockTransferDstScalarPerVector_K1");
278 static_assert(BK1Value % BBlockTransferDstScalarPerVector_K1 == 0,
279 "Invalid tuning parameters! BK1Value must be divisible by "
280 "BBlockTransferDstScalarPerVector_K1");
287 if(!(problem.M % MPerBlock == 0))
298 if(!(problem.N % NPerBlock == 0))
306 if(problem.K % ABlockTransferSrcScalarPerVector != 0)
313 if(problem.M % ABlockTransferSrcScalarPerVector != 0)
321 if(problem.N % BBlockTransferSrcScalarPerVector != 0)
328 if(problem.K % BBlockTransferSrcScalarPerVector != 0)
334 if(problem.K % KPerBlock != 0)
340 const auto num_k_loop = problem.K / KPerBlock;
341 if(!GridwiseGemmPipe::IsSupported(num_k_loop))
351 const auto num_loop = K / KPerBlock;
353 return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
356 template <
typename CGr
idDesc>
357 __host__ __device__
static constexpr auto
366 using BlockwiseGemm =
370 decltype(a_block_desc_ak0_m_ak1),
371 decltype(b_block_desc_bk0_n_bk1),
378 return BlockwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_N2(c_grid_desc_m_n);
383 MPerBlock, NPerBlock, KPerBlock};
385 __device__
static auto
388 const auto a_grid_desc_mraw_kraw = [&]() {
399 const auto a_grid_desc_m_k =
matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw);
408 __device__
static auto
411 const auto b_grid_desc_nraw_kraw = [&]() {
422 const auto b_grid_desc_n_k =
matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw);
433 const auto c_grid_desc_mraw_nraw = [&]() {
444 return matrix_padder.PadCDescriptor_M_N(c_grid_desc_mraw_nraw);
447 template <
bool HasMainKBlockLoop,
448 typename AGridDesc_AK0_M_AK1,
449 typename BGridDesc_BK0_N_BK1,
450 typename CGridDesc_M_N>
451 __device__
static void Run(
const ABDataType* __restrict__ p_a_grid,
452 const ABDataType* __restrict__ p_b_grid,
453 CDataType* __restrict__ p_c_grid,
454 void* __restrict__ p_shared,
455 const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1,
456 const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1,
457 const CGridDesc_M_N& c_grid_desc_m_n)
459 const auto c_grid_desc_m0_n0_m1_n1_m2_n2 =
463 p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
465 p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
467 p_c_grid, c_grid_desc_m0_n0_m1_n1_m2_n2.GetElementSpaceSize());
469 const AElementwiseOperation a_element_op{};
470 const BElementwiseOperation b_element_op{};
471 const CElementwiseOperation c_element_op{};
473 const auto block_2_ctile_map =
477 const auto block_work_idx =
480 if(!block_2_ctile_map.ValidCTileIndex(
483 c_grid_desc_m0_n0_m1_n1_m2_n2.GetLength(
I1))))
489 const index_t m_block_data_idx_on_grid =
490 __builtin_amdgcn_readfirstlane(block_work_idx[
I0] * MPerBlock);
491 const index_t n_block_data_idx_on_grid =
492 __builtin_amdgcn_readfirstlane(block_work_idx[
I1] * NPerBlock);
499 auto a_blockwise_copy =
501 AElementwiseOperation,
505 ABlockTransferThreadClusterLengths_K0_M_K1,
506 ABlockTransferThreadClusterArrangeOrder,
509 decltype(a_grid_desc_ak0_m_ak1),
510 decltype(a_block_desc_ak0_m_ak1),
511 ABlockTransferSrcAccessOrder,
513 ABlockTransferSrcVectorDim,
515 ABlockTransferSrcScalarPerVector,
516 ABlockTransferDstScalarPerVector_K1,
519 AThreadTransferSrcResetCoordinateAfterRun,
521 NumGemmKPrefetchStage>(
522 a_grid_desc_ak0_m_ak1,
525 a_block_desc_ak0_m_ak1,
529 auto b_blockwise_copy =
531 BElementwiseOperation,
535 BBlockTransferThreadClusterLengths_K0_N_K1,
536 BBlockTransferThreadClusterArrangeOrder,
539 decltype(b_grid_desc_bk0_n_bk1),
540 decltype(b_block_desc_bk0_n_bk1),
541 BBlockTransferSrcAccessOrder,
543 BBlockTransferSrcVectorDim,
545 BBlockTransferSrcScalarPerVector,
546 BBlockTransferDstScalarPerVector_K1,
549 BThreadTransferSrcResetCoordinateAfterRun,
551 NumGemmKPrefetchStage>(
552 b_grid_desc_bk0_n_bk1,
555 b_block_desc_bk0_n_bk1,
567 auto blockwise_gemm =
571 decltype(a_block_desc_ak0_m_ak1),
572 decltype(b_block_desc_bk0_n_bk1),
579 auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();
583 a_block_desc_ak0_m_ak1.GetElementSpaceSize(),
max_lds_align);
586 static_cast<ABDataType*
>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
589 static_cast<ABDataType*
>(p_shared) + a_block_space_size_aligned,
590 b_block_desc_bk0_n_bk1.GetElementSpaceSize());
596 const auto AK0 = a_grid_desc_ak0_m_ak1.GetLength(
I0);
598 const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(AK0 /
AK0PerBlock);
601 a_block_desc_ak0_m_ak1,
605 a_block_slice_copy_step,
606 b_grid_desc_bk0_n_bk1,
607 b_block_desc_bk0_n_bk1,
611 b_block_slice_copy_step,
614 num_k_block_main_loop);
618 constexpr auto c_thread_desc_m0_n0_m1_n1_m2_n2 =
619 blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2();
621 constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2 =
622 blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2();
624 constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_n2.GetLength(
I0);
625 constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_n2.GetLength(
I1);
626 constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_n2.GetLength(
I2);
627 constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_n2.GetLength(
I3);
628 constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_n2.GetLength(
I4);
629 constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_n2.GetLength(
I5);
631 constexpr auto MPerThread = c_thread_desc_m0_n0_m1_n1_m2_n2.GetLength(
I4);
632 constexpr auto NPerThread = c_thread_desc_m0_n0_m1_n1_m2_n2.GetLength(
I5);
636 const auto c_thread_mtx_on_block =
637 blockwise_gemm.CalculateCThreadOriginDataIndex(
I0,
I0);
639 const index_t m_thread_data_on_grid =
640 m_block_data_idx_on_grid + c_thread_mtx_on_block[
I0];
642 const index_t n_thread_data_on_grid =
643 n_block_data_idx_on_grid + c_thread_mtx_on_block[
I1];
650 const auto m_thread_data_on_grid_idx =
651 m_thread_data_on_grid_to_m0_m1_m2_adaptor.CalculateBottomIndex(
659 const auto n_thread_data_on_grid_idx =
660 n_thread_data_on_grid_to_n0_n1_n2_adaptor.CalculateBottomIndex(
666 decltype(c_thread_desc_m0_n0_m1_n1_m2_n2),
667 decltype(c_grid_desc_m0_n0_m1_n1_m2_n2),
668 CElementwiseOperation,
670 CThreadTransferSrcDstAccessOrder,
671 CThreadTransferSrcDstVectorDim,
672 CThreadTransferDstScalarPerVector,
673 CGlobalMemoryDataOperation,
676 c_grid_desc_m0_n0_m1_n1_m2_n2,
678 n_thread_data_on_grid_idx[
I0],
679 m_thread_data_on_grid_idx[
I1],
680 n_thread_data_on_grid_idx[
I1],
681 m_thread_data_on_grid_idx[
I2],
682 n_thread_data_on_grid_idx[
I2]),
685 c_thread_copy.Run(c_thread_desc_m0_n0_m1_n1_m2_n2,
688 c_grid_desc_m0_n0_m1_n1_m2_n2,
#define CK_MIN_BLOCK_PER_CU
Definition ck.hpp:31
#define CK_MAX_THREAD_PER_BLOCK
Definition ck.hpp:30
__host__ __device__ constexpr auto integer_least_multiple(X x, Y y)
Definition utility/math.hpp:78
__host__ __device__ constexpr auto integer_divide_floor(X x, Y y)
Definition utility/math.hpp:66
__host__ __device__ constexpr T max(T x)
Definition utility/math.hpp:84
__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
Definition utility/math.hpp:72
__host__ __device__ constexpr auto lcm(X x, Y y)
Definition utility/math.hpp:198
GemmSpecialization
Definition gemm_specialization.hpp:11
@ MKPadding
Definition gemm_specialization.hpp:18
@ NPadding
Definition gemm_specialization.hpp:15
@ MPadding
Definition gemm_specialization.hpp:14
@ MNKPadding
Definition gemm_specialization.hpp:20
@ MNPadding
Definition gemm_specialization.hpp:17
@ NKPadding
Definition gemm_specialization.hpp:19
__host__ __device__ constexpr auto make_multi_index(Xs &&... xs)
Definition array_multi_index.hpp:15
__host__ __device__ constexpr auto make_pass_through_transform(const LowLength &low_length)
Definition multi_index_transform_helper.hpp:12
constexpr auto GridwiseGemmPipeline_Selector()
Definition gridwise_gemm_pipeline_selector.hpp:31
int32_t index_t
Definition ck.hpp:299
__host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides)
Definition tensor_descriptor_helper.hpp:49
InMemoryDataOperationEnum
Definition ck.hpp:277
@ Set
Definition ck.hpp:278
__host__ __device__ constexpr auto make_single_stage_tensor_adaptor(const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss)
Definition tensor_description/tensor_adaptor.hpp:425
__global__ void kernel_gemm_dpp(const typename GridwiseGemm::Argument karg)
Definition gridwise_gemm_dpp.hpp:29
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition type.hpp:297
integral_constant< index_t, N > Number
Definition number.hpp:12
__host__ __device__ constexpr auto make_merge_transform(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:55
constexpr detail::ignore_t ignore
Definition utility/ignore.hpp:20
__device__ uint32_t amd_wave_read_first_lane(uint32_t value)
Definition amd_wave_read_first_lane.hpp:100
__device__ index_t get_block_1d_id()
Definition get_id.hpp:47
__host__ __device__ constexpr auto make_naive_tensor_descriptor_aligned(const Tuple< Lengths... > &lengths, Align align)
Definition tensor_descriptor_helper.hpp:132
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__host__ __device__ constexpr auto transform_tensor_descriptor(const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition tensor_description/tensor_descriptor.hpp:319
PipelineVersion
Definition gridwise_gemm_pipeline_selector.hpp:18
@ v1
Definition gridwise_gemm_pipeline_selector.hpp:19
__host__ __device__ constexpr auto make_unmerge_transform(const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{})
Definition multi_index_transform_helper.hpp:90
typename remove_cv< T >::type remove_cv_t
Definition type.hpp:295
__host__ __device__ constexpr auto make_dynamic_buffer(T *p, ElementSpaceSize element_space_size)
Definition dynamic_buffer.hpp:472
const GenericPointer< typename T::ValueType > T2 value
Definition pointer.h:1697
Definition block_to_ctile_map.hpp:261
Definition blockwise_gemm_dpp.hpp:33
static constexpr auto selected_dpp
Definition dpp_gemm.hpp:380
const ABDataType * p_a_grid
Definition gridwise_gemm_dpp.hpp:194
const ABDataType * p_b_grid
Definition gridwise_gemm_dpp.hpp:195
CDataType * p_c_grid
Definition gridwise_gemm_dpp.hpp:196
__host__ Argument(const ABDataType *p_a_grid_, const ABDataType *p_b_grid_, CDataType *p_c_grid_, index_t M_, index_t N_, index_t K_, index_t StrideA_, index_t StrideB_, index_t StrideC_)
Definition gridwise_gemm_dpp.hpp:178
index_t NPadded
Definition gridwise_gemm_dpp.hpp:170
index_t BK0
Definition gridwise_gemm_dpp.hpp:172
index_t StrideB
Definition gridwise_gemm_dpp.hpp:167
index_t N
Definition gridwise_gemm_dpp.hpp:164
index_t K
Definition gridwise_gemm_dpp.hpp:165
index_t StrideC
Definition gridwise_gemm_dpp.hpp:168
index_t M
Definition gridwise_gemm_dpp.hpp:163
index_t AK0
Definition gridwise_gemm_dpp.hpp:171
index_t MPadded
Definition gridwise_gemm_dpp.hpp:169
__host__ Problem(index_t M_, index_t N_, index_t K_, index_t StrideA_, index_t StrideB_, index_t StrideC_)
Definition gridwise_gemm_dpp.hpp:136
__host__ void Print() const
Definition gridwise_gemm_dpp.hpp:155
index_t StrideA
Definition gridwise_gemm_dpp.hpp:166
Definition gridwise_gemm_dpp.hpp:96
static __host__ constexpr bool CalculateHasMainKBlockLoop(index_t K)
Definition gridwise_gemm_dpp.hpp:349
static __host__ auto CalculateAK0(index_t K)
Definition gridwise_gemm_dpp.hpp:130
__host__ static __device__ constexpr auto MakeCGridDescriptor_M0_N0_M1_N1_M2_N2(const CGridDesc &c_grid_desc_m_n)
Definition gridwise_gemm_dpp.hpp:358
static __device__ void Run(const ABDataType *__restrict__ p_a_grid, const ABDataType *__restrict__ p_b_grid, CDataType *__restrict__ p_c_grid, void *__restrict__ p_shared, const AGridDesc_AK0_M_AK1 &a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 &b_grid_desc_bk0_n_bk1, const CGridDesc_M_N &c_grid_desc_m_n)
Definition gridwise_gemm_dpp.hpp:451
static __device__ auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
Definition gridwise_gemm_dpp.hpp:431
ck::GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp< BlockSize, ADataType, AccDataType, CDataType, InMemoryDataOperationEnum::Set, ALayout, BLayout, CLayout, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, MPerBlock, NPerBlock, KPerBlock, MPerDpp, NPerDpp, AK1, BK1, MDppPerWave, NDppPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BBlockLdsAddExtraN, Sequence< 0, 2, 4, 1, 3, 5 >, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, NumPrefetch, PipelineVer >::BK0PerBlock static constexpr auto BK0PerBlock
Definition gridwise_gemm_dpp.hpp:107
static __host__ auto CalculateBK0(index_t K)
Definition gridwise_gemm_dpp.hpp:131
__host__ static __device__ constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
Definition gridwise_gemm_dpp.hpp:202
static __device__ auto MakeBGridDescriptor_BK0_N_BK1(index_t K, index_t N, index_t BK0, index_t StrideB)
Definition gridwise_gemm_dpp.hpp:409
ck::GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp< BlockSize, ADataType, AccDataType, CDataType, InMemoryDataOperationEnum::Set, ALayout, BLayout, CLayout, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, MPerBlock, NPerBlock, KPerBlock, MPerDpp, NPerDpp, AK1, BK1, MDppPerWave, NDppPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BBlockLdsAddExtraN, Sequence< 0, 2, 4, 1, 3, 5 >, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, NumPrefetch, PipelineVer >::ThisThreadBlock ThisThreadBlock< BlockSize > ThisThreadBlock
Definition gridwise_gemm_dpp.hpp:111
__host__ static __device__ constexpr index_t GetSharedMemoryNumberOfByte()
Definition gridwise_gemm_dpp.hpp:242
static __host__ auto CalculateGridSize(index_t M, index_t N)
Definition gridwise_gemm_dpp.hpp:115
ck::GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp< BlockSize, ADataType, AccDataType, CDataType, InMemoryDataOperationEnum::Set, ALayout, BLayout, CLayout, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, MPerBlock, NPerBlock, KPerBlock, MPerDpp, NPerDpp, AK1, BK1, MDppPerWave, NDppPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BBlockLdsAddExtraN, Sequence< 0, 2, 4, 1, 3, 5 >, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, NumPrefetch, PipelineVer >::I4 static constexpr auto I4
Definition gridwise_gemm_dpp.hpp:101
ck::GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp< BlockSize, ADataType, AccDataType, CDataType, InMemoryDataOperationEnum::Set, ALayout, BLayout, CLayout, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, MPerBlock, NPerBlock, KPerBlock, MPerDpp, NPerDpp, AK1, BK1, MDppPerWave, NDppPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BBlockLdsAddExtraN, Sequence< 0, 2, 4, 1, 3, 5 >, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, NumPrefetch, PipelineVer >::Block2CTileMap BlockToCTileMap_M00_N0_M01Adapt< MPerBlock, NPerBlock > Block2CTileMap
Definition gridwise_gemm_dpp.hpp:113
ck::GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp< BlockSize, ADataType, AccDataType, CDataType, InMemoryDataOperationEnum::Set, ALayout, BLayout, CLayout, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, MPerBlock, NPerBlock, KPerBlock, MPerDpp, NPerDpp, AK1, BK1, MDppPerWave, NDppPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BBlockLdsAddExtraN, Sequence< 0, 2, 4, 1, 3, 5 >, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, NumPrefetch, PipelineVer >::matrix_padder static constexpr auto matrix_padder
Definition gridwise_gemm_dpp.hpp:381
ck::GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp< BlockSize, ADataType, AccDataType, CDataType, InMemoryDataOperationEnum::Set, ALayout, BLayout, CLayout, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, MPerBlock, NPerBlock, KPerBlock, MPerDpp, NPerDpp, AK1, BK1, MDppPerWave, NDppPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BBlockLdsAddExtraN, Sequence< 0, 2, 4, 1, 3, 5 >, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, NumPrefetch, PipelineVer >::I5 static constexpr auto I5
Definition gridwise_gemm_dpp.hpp:102
static __host__ auto CalculateMPadded(index_t M)
Definition gridwise_gemm_dpp.hpp:120
ck::GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp< BlockSize, ADataType, AccDataType, CDataType, InMemoryDataOperationEnum::Set, ALayout, BLayout, CLayout, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, MPerBlock, NPerBlock, KPerBlock, MPerDpp, NPerDpp, AK1, BK1, MDppPerWave, NDppPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BBlockLdsAddExtraN, Sequence< 0, 2, 4, 1, 3, 5 >, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, NumPrefetch, PipelineVer >::AK0PerBlock static constexpr auto AK0PerBlock
Definition gridwise_gemm_dpp.hpp:106
static __host__ constexpr bool CheckValidity(const Problem &problem)
Definition gridwise_gemm_dpp.hpp:256
__host__ static __device__ constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
Definition gridwise_gemm_dpp.hpp:222
ck::GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp< BlockSize, ADataType, AccDataType, CDataType, InMemoryDataOperationEnum::Set, ALayout, BLayout, CLayout, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, MPerBlock, NPerBlock, KPerBlock, MPerDpp, NPerDpp, AK1, BK1, MDppPerWave, NDppPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BBlockLdsAddExtraN, Sequence< 0, 2, 4, 1, 3, 5 >, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, NumPrefetch, PipelineVer >::I3 static constexpr auto I3
Definition gridwise_gemm_dpp.hpp:100
static __host__ auto CalculateNPadded(index_t N)
Definition gridwise_gemm_dpp.hpp:125
ck::GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp< BlockSize, ADataType, AccDataType, CDataType, InMemoryDataOperationEnum::Set, ALayout, BLayout, CLayout, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, MPerBlock, NPerBlock, KPerBlock, MPerDpp, NPerDpp, AK1, BK1, MDppPerWave, NDppPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BBlockLdsAddExtraN, Sequence< 0, 2, 4, 1, 3, 5 >, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, NumPrefetch, PipelineVer >::BK1 static constexpr auto BK1
Definition gridwise_gemm_dpp.hpp:105
ck::GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp< BlockSize, ADataType, AccDataType, CDataType, InMemoryDataOperationEnum::Set, ALayout, BLayout, CLayout, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, MPerBlock, NPerBlock, KPerBlock, MPerDpp, NPerDpp, AK1, BK1, MDppPerWave, NDppPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BBlockLdsAddExtraN, Sequence< 0, 2, 4, 1, 3, 5 >, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, NumPrefetch, PipelineVer >::I2 static constexpr auto I2
Definition gridwise_gemm_dpp.hpp:99
static __device__ auto MakeAGridDescriptor_AK0_M_AK1(index_t M, index_t K, index_t AK0, index_t StrideA)
Definition gridwise_gemm_dpp.hpp:386
ck::GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp< BlockSize, ADataType, AccDataType, CDataType, InMemoryDataOperationEnum::Set, ALayout, BLayout, CLayout, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, MPerBlock, NPerBlock, KPerBlock, MPerDpp, NPerDpp, AK1, BK1, MDppPerWave, NDppPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BBlockLdsAddExtraN, Sequence< 0, 2, 4, 1, 3, 5 >, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, NumPrefetch, PipelineVer >::I1 static constexpr auto I1
Definition gridwise_gemm_dpp.hpp:98
ck::GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp< BlockSize, ADataType, AccDataType, CDataType, InMemoryDataOperationEnum::Set, ALayout, BLayout, CLayout, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, MPerBlock, NPerBlock, KPerBlock, MPerDpp, NPerDpp, AK1, BK1, MDppPerWave, NDppPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BBlockLdsAddExtraN, Sequence< 0, 2, 4, 1, 3, 5 >, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, NumPrefetch, PipelineVer >::GridwiseGemmPipe remove_cvref_t< decltype(GridwiseGemmPipeline_Selector< PipelineVer, NumGemmKPrefetchStage >())> GridwiseGemmPipe
Definition gridwise_gemm_dpp.hpp:199
ck::GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp< BlockSize, ADataType, AccDataType, CDataType, InMemoryDataOperationEnum::Set, ALayout, BLayout, CLayout, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, MPerBlock, NPerBlock, KPerBlock, MPerDpp, NPerDpp, AK1, BK1, MDppPerWave, NDppPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BBlockLdsAddExtraN, Sequence< 0, 2, 4, 1, 3, 5 >, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, NumPrefetch, PipelineVer >::I0 static constexpr auto I0
Definition gridwise_gemm_dpp.hpp:97
ck::GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp< BlockSize, ADataType, AccDataType, CDataType, InMemoryDataOperationEnum::Set, ALayout, BLayout, CLayout, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, MPerBlock, NPerBlock, KPerBlock, MPerDpp, NPerDpp, AK1, BK1, MDppPerWave, NDppPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BBlockLdsAddExtraN, Sequence< 0, 2, 4, 1, 3, 5 >, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, NumPrefetch, PipelineVer >::AK1 static constexpr auto AK1
Definition gridwise_gemm_dpp.hpp:104
ck::GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp< BlockSize, ADataType, AccDataType, CDataType, InMemoryDataOperationEnum::Set, ALayout, BLayout, CLayout, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, MPerBlock, NPerBlock, KPerBlock, MPerDpp, NPerDpp, AK1, BK1, MDppPerWave, NDppPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BBlockLdsAddExtraN, Sequence< 0, 2, 4, 1, 3, 5 >, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, NumPrefetch, PipelineVer >::max_lds_align static constexpr auto max_lds_align
Definition gridwise_gemm_dpp.hpp:109
Definition utility/sequence.hpp:43
Blockwise data transfer.
Definition thread_group_tensor_slice_transfer_v4r1.hpp:46
Definition threadwise_tensor_slice_transfer.hpp:39
static constexpr value_type value
Definition utility/integral_constant.hpp:13
Definition is_known_at_compile_time.hpp:14
Definition device_base.hpp:197
Definition matrix_padder.hpp:180
Definition tensor_operation/gpu/element/unary_element_wise_operation.hpp:340