// Mirror of https://github.com/xemu-project/xemu.git (88 lines, 3.4 KiB, plaintext)
/*
 * Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#version 450 core
// Opt in to the Vulkan memory model and enable the extensions this shader
// names: cooperative matrices, memory-scope constants (gl_ScopeSubgroup),
// explicit float16 / int8 / int32 arithmetic types, scalar block layout,
// buffer references and control-flow attributes ([[unroll]]).
#pragma use_vulkan_memory_model
#extension GL_EXT_scalar_block_layout : enable
#extension GL_KHR_memory_scope_semantics : enable
#extension GL_KHR_cooperative_matrix : enable
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
#extension GL_EXT_buffer_reference : enable
#extension GL_EXT_control_flow_attributes : enable

// Specialization constants. Each one defaults to 1 in the shader source.
//
// lM, lN, lK: dimensions of the cooperative matrices used below
//             (A is lM x lK, B is lK x lN, the accumulator is lM x lN).
layout(constant_id = 0) const uint lM = 1;
layout(constant_id = 1) const uint lN = 1;
layout(constant_id = 2) const uint lK = 1;
// TILE_M and TILE_N are not referenced by the code visible in this file.
layout(constant_id = 3) const uint TILE_M = 1;
layout(constant_id = 4) const uint TILE_N = 1;
// TILE_K: step of the outer K loop in main(); TILE_K / lK is the trip count
// of the inner loop.
layout(constant_id = 5) const uint TILE_K = 1;
// K: upper bound of the outer K loop in main().
layout(constant_id = 6) const uint K = 1;

// Element types. A_TYPE is the component type of both the A and the B
// operand matrices; C_TYPE is the component type of the accumulators and of
// the output buffer. A_BITS / C_BITS are not referenced by the code visible
// in this file; presumably they record the bit width of the matching type.
#define A_BITS 16
#define A_TYPE float16_t
#define C_BITS 16
#define C_TYPE float16_t

// Destination of the coopMatStore calls in main(): a runtime-sized array of
// C_TYPE elements.
buffer Output { C_TYPE x[]; } outputO;

// Shared arrays that main() reads the A and B operand matrices from.
// NOTE(review): nothing visible in this file writes to either array.
shared uvec4 Ash[128];
shared uvec4 Bsh[128];

// Shape of the accumulator grid.
const uint C_ROWS = 2;
const uint C_COLS = 2;
// C_ROWS x C_COLS grid of subgroup-scope lM x lN accumulator matrices.
// Zero-initialised, updated and stored by main().
coopmat<C_TYPE, gl_ScopeSubgroup, lM, lN, gl_MatrixUseAccumulator> result[C_ROWS][C_COLS];

// Entry point.
//
// 1. Sets every accumulator in result[][] to 0.0.
// 2. For each K chunk (step TILE_K, bound K) and each of the TILE_K / lK
//    sub-steps: loads C_ROWS A matrices from Ash, then for each of the
//    C_COLS columns loads one B matrix from Bsh and accumulates
//    result[i][j] = A[i] * B + result[i][j].
// 3. Stores every accumulator to outputO.x.
//
// NOTE(review): every load and store uses element offset 0 and stride 0, and
// neither chunkK nor k is used inside the loop bodies, so each iteration
// loads from the same location and each store targets the same location.
// This looks like a reduced compile-only test case rather than a working
// matrix multiply - confirm before reusing it as one.
void main()
{
    // Clear the accumulator grid.
    [[unroll]] for (uint i = 0; i < C_ROWS; ++i) {
        [[unroll]] for (uint j = 0; j < C_COLS; ++j) {
            result[i][j] = coopmat<C_TYPE, gl_ScopeSubgroup, lM, lN, gl_MatrixUseAccumulator>(0.0);
        }
    }

    for (uint chunkK = 0; chunkK < K; chunkK += TILE_K) {
        [[unroll]] for (uint k = 0; k < TILE_K / lK; ++k)
        {
            // One lM x lK A operand per accumulator row.
            coopmat<A_TYPE, gl_ScopeSubgroup, lM, lK, gl_MatrixUseA> matA[C_ROWS];
            [[unroll]] for (uint i = 0; i < C_ROWS; ++i) {
                coopMatLoad(matA[i], Ash, 0, 0, gl_CooperativeMatrixLayoutRowMajor);
            }

            // A single lK x lN B operand is reloaded for each column and
            // shared by all rows of that column.
            coopmat<A_TYPE, gl_ScopeSubgroup, lK, lN, gl_MatrixUseB> matB;
            [[unroll]] for (uint j = 0; j < C_COLS; ++j) {
                coopMatLoad(matB, Bsh, 0, 0, gl_CooperativeMatrixLayoutRowMajor);

                [[unroll]] for (uint i = 0; i < C_ROWS; ++i) {
                    result[i][j] = coopMatMulAdd(matA[i], matB, result[i][j]);
                }
            }
        }
    }

    // Write the accumulators out in row-major layout.
    [[unroll]] for (uint i = 0; i < C_ROWS; ++i) {
        [[unroll]] for (uint j = 0; j < C_COLS; ++j) {
            coopMatStore(result[i][j], outputO.x, 0, 0, gl_CooperativeMatrixLayoutRowMajor);
        }
    }
}