20 typename ComputeTypeA,
21 typename ComputeTypeB,
23 typename AWmmaTileDesc,
24 typename BWmmaTileDesc,
25 index_t ABlockTransferSrcScalarPerVector,
26 index_t BBlockTransferSrcScalarPerVector,
35 bool TransposeC =
false>
43 typename ComputeTypeA,
44 typename ComputeTypeB,
46 typename AWmmaTileDesc,
47 typename BWmmaTileDesc,
48 index_t ABlockTransferSrcScalarPerVector,
49 index_t BBlockTransferSrcScalarPerVector,
68 ABlockTransferSrcScalarPerVector,
69 BBlockTransferSrcScalarPerVector,
87 ABlockTransferSrcScalarPerVector,
88 BBlockTransferSrcScalarPerVector,
107 ABlockTransferSrcScalarPerVector,
108 BBlockTransferSrcScalarPerVector,
132 GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs;
135 GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs;
137 GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs;
142 using typename Base::Empty;
280 template <
typename ABlockBuffer,
281 typename AThreadBuffer,
282 typename BBlockBuffer,
283 typename BThreadBuffer,
284 typename BScaleStruct>
285 __device__
inline void LocalLoad(ABlockBuffer& a_block_buf,
286 AThreadBuffer& a_thread_buf,
287 BBlockBuffer& b_block_buf,
288 BThreadBuffer& b_thread_buf,
289 BScaleStruct& b_scale_struct)
const
321 b_scale_struct.b_scale_thread_bufs(
322 I0)[
Number<n0 * BScaleStruct::num_scale_k_block +
323 k0 / BScaleStruct::num_scale_krepeat>{}],
332 template <
bool HasMainLoop,
336 typename ABlockTransfer,
337 typename AGridBuffer,
338 typename ABlockBuffer,
339 typename ABlockTransferStep,
342 typename BBlockTransfer,
343 typename BGridBuffer,
344 typename BBlockBuffer,
345 typename BBlockTransferStep,
346 typename CThreadBuffer,
347 typename BScaleStruct>
348 __device__
void Run(
const AGridDesc& a_grid_desc,
349 const ABlockDesc& a_block_desc,
350 ABlockTransfer& a_blockwise_copy,
351 const AGridBuffer& a_grid_buf,
352 ABlockBuffer& a_block_buf,
353 const ABlockTransferStep& a_block_copy_step,
354 const BGridDesc& b_grid_desc,
355 const BBlockDesc& b_block_desc,
356 BBlockTransfer& b_blockwise_copy,
357 const BGridBuffer& b_grid_buf,
358 BBlockBuffer& b_block_buf,
359 const BBlockTransferStep& b_block_copy_step,
360 CThreadBuffer& c_thread_buf,
362 BScaleStruct& b_scale_struct,
364 index_t num_loop_per_scale)
const
366 __builtin_amdgcn_sched_barrier(0);
373 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
374 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
376 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
377 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
379 b_scale_struct.template GlobalLoad<0>(num_loop_per_scale == 1);
382 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
383 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
388 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
389 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
391 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
392 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
396 c_thread_buf.Clear();
401 LocalLoad(a_block_buf, a_thread_buf, b_block_buf, b_thread_buf, b_scale_struct);
403 __builtin_amdgcn_sched_barrier(0);
406 if constexpr(HasMainLoop)
413 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
414 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
416 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
417 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
419 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
420 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
422 b_scale_struct.template GlobalLoad<0>((i + 2) % num_loop_per_scale == 0);
431 a_thread_vec.template AsType<ComputeTypeA>()(ik) =
441 b_thread_vec.template AsType<ComputeTypeB>()(ik) =
451 using wmma_input_type_a =
453 using wmma_input_type_b =
459 wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>(),
460 b_thread_vec.template AsType<wmma_input_type_b>(),
468 LocalLoad(a_block_buf, a_thread_buf, b_block_buf, b_thread_buf, b_scale_struct);
471 __builtin_amdgcn_sched_barrier(0);
474 }
while(i < (num_loop - 2));
482 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
483 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
487 b_scale_struct.template GlobalLoad<0>(num_loop % num_loop_per_scale == 0);
496 a_thread_vec.template AsType<ComputeTypeA>()(ik) =
501 b_thread_vec.template AsType<ComputeTypeB>()(ik) =
506 using wmma_input_type_a =
508 using wmma_input_type_b =
514 wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>(),
515 b_thread_vec.template AsType<wmma_input_type_b>(),
523 LocalLoad(a_block_buf, a_thread_buf, b_block_buf, b_thread_buf, b_scale_struct);
526 __builtin_amdgcn_sched_barrier(0);
538 a_thread_vec.template AsType<ComputeTypeA>()(ik) =
543 b_thread_vec.template AsType<ComputeTypeB>()(ik) =
548 using wmma_input_type_a =
550 using wmma_input_type_b =
556 wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>(),
557 b_thread_vec.template AsType<wmma_input_type_b>(),
569 using Base::a_thread_copy_;
570 using Base::a_thread_desc_;
571 using Base::b_thread_copy_;
572 using Base::b_thread_desc_;
573 using Base::c_thread_desc_;
__host__ __device__ constexpr auto make_static_buffer(Number< N >)
Definition static_buffer.hpp:186
int32_t index_t
Definition ck.hpp:299
integral_constant< index_t, N > Number
Definition number.hpp:12
TailNumber
Definition blkgemmpipe_scheduler.hpp:31
@ Even
Definition blkgemmpipe_scheduler.hpp:34
@ Odd
Definition blkgemmpipe_scheduler.hpp:33
@ Full
Definition blkgemmpipe_scheduler.hpp:49
constexpr bool is_same_v
Definition type.hpp:283
BlockGemmPipelineScheduler
Definition blkgemmpipe_scheduler.hpp:25
@ Intrawave
Definition blkgemmpipe_scheduler.hpp:26
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__device__ void block_sync_lds()
Definition synchronization.hpp:16
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::HotLoopInstList ck::BlockwiseGemmWmmaops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerWmma, NPerWmma, wmma_gemm.wmma_instr.k_per_wmma > HotLoopInstList
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:70
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::GetCThreadBuffer __host__ __device__ constexpr auto & GetCThreadBuffer()
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:166
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::b_block_desc_k0_n0_n1_n2_k1 static constexpr BWmmaTileDesc b_block_desc_k0_n0_n1_n2_k1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:337
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::A_K1 static constexpr index_t A_K1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:57
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::CalculateCThreadOriginDataIndex static __device__ auto CalculateCThreadOriginDataIndex(Number< m0 >, Number< n0 >)
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:217
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::BlockwiseGemmWmmaops_pipeline_base __host__ __device__ BlockwiseGemmWmmaops_pipeline_base(Tuple6 a_origin=CalculateAThreadOriginDataIndex(), Tuple6 b_origin=CalculateBThreadOriginDataIndex())
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:264
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::a_block_desc_k0_m0_m1_m2_k1 static constexpr AWmmaTileDesc a_block_desc_k0_m0_m1_m2_k1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:336
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::wmma_gemm static constexpr auto wmma_gemm
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:63
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::B_KRow static constexpr index_t B_KRow
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:54
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::I0 static constexpr auto I0
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:36
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::B_K1 static constexpr index_t B_K1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:58
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::WmmaK static constexpr auto WmmaK
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:68
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::KRepeat static constexpr index_t KRepeat
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:66
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::A_KRow static constexpr index_t A_KRow
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:53
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::a_thread_desc_ static constexpr auto a_thread_desc_
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:340
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::a_thread_copy_ AThreadCopy a_thread_copy_
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:394
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::LocalLoad __device__ void LocalLoad(ABlockBuffer &a_block_buf, AThreadBuffer &a_thread_buf, BBlockBuffer &b_block_buf, BThreadBuffer &b_thread_buf, BScaleStruct &b_scale_struct) const
Definition blockwise_gemm_pipeline_wmmaops_v3.hpp:285
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::b_block_desc_k0_n0_n1_n2_k1 static constexpr BWmmaTileDesc b_block_desc_k0_n0_n1_n2_k1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:337
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::HotLoopScheduler static __device__ constexpr auto HotLoopScheduler()
Definition blockwise_gemm_pipeline_wmmaops_v3.hpp:172
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::BlockLoopTailNum __host__ static __device__ constexpr TailNumber BlockLoopTailNum(index_t num_loop)
Definition blockwise_gemm_pipeline_wmmaops_v3.hpp:153
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::BlockHasHotloop __host__ static __device__ constexpr bool BlockHasHotloop(index_t num_loop)
Definition blockwise_gemm_pipeline_wmmaops_v3.hpp:148
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::b_thread_desc_ static constexpr auto b_thread_desc_
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:354
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::a_block_desc_k0_m0_m1_m2_k1 static constexpr AWmmaTileDesc a_block_desc_k0_m0_m1_m2_k1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:336
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::wmma_gemm static constexpr auto wmma_gemm
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:63
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::B_KRow static constexpr index_t B_KRow
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:54
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::GlobalBufferNum static constexpr index_t GlobalBufferNum
Definition blockwise_gemm_pipeline_wmmaops_v3.hpp:146
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::Base BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC > Base
Definition blockwise_gemm_pipeline_wmmaops_v3.hpp:99
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::I0 static constexpr auto I0
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:36
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::PrefillStages static constexpr index_t PrefillStages
Definition blockwise_gemm_pipeline_wmmaops_v3.hpp:145
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::WmmaK static constexpr auto WmmaK
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:68
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::b_thread_copy_ BThreadCopy b_thread_copy_
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:395
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::Run __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, CThreadBuffer &c_thread_buf, BScaleStruct &b_scale_struct, index_t num_loop, index_t num_loop_per_scale) const
Definition blockwise_gemm_pipeline_wmmaops_v3.hpp:348
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::PrefetchStages static constexpr index_t PrefetchStages
Definition blockwise_gemm_pipeline_wmmaops_v3.hpp:144
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::A_KRow static constexpr index_t A_KRow
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:53
ck::BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::c_thread_desc_ static constexpr auto c_thread_desc_
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:369
Definition blockwise_gemm_pipeline_wmmaops_v3.hpp:37
Definition functional2.hpp:33
Definition dtype_vector.hpp:10