/*
 * Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
*/
#version 450 core
#pragma use_vulkan_memory_model
#extension GL_EXT_scalar_block_layout : enable
#extension GL_KHR_memory_scope_semantics : enable
#extension GL_KHR_cooperative_matrix : enable
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
#extension GL_EXT_buffer_reference : enable
#extension GL_EXT_control_flow_attributes : enable

// Specialization constants (all default to 1 and are expected to be
// overridden at pipeline creation).
//   lM, lN, lK - dimensions of the cooperative matrix types declared below:
//                A is lM x lK, B is lK x lN, the accumulator is lM x lN.
//   TILE_K     - step of the outer K loop; TILE_K / lK multiply-add steps
//                are issued per chunk.
//   K          - upper bound of the outer K loop.
//   TILE_M, TILE_N are declared but not referenced by the code in this file.
layout(constant_id = 0) const uint lM = 1;
layout(constant_id = 1) const uint lN = 1;
layout(constant_id = 2) const uint lK = 1;
layout(constant_id = 3) const uint TILE_M = 1;
layout(constant_id = 4) const uint TILE_N = 1;
layout(constant_id = 5) const uint TILE_K = 1;
layout(constant_id = 6) const uint K = 1;

// Element types of the multiplicand (A/B) and accumulator (C) matrices.
#define A_BITS 16
#define A_TYPE float16_t
#define C_BITS 16
#define C_TYPE float16_t

// Destination buffer for the accumulator tiles.
buffer Output { C_TYPE x[]; } outputO;

// Shared-memory sources for the A and B loads. No writes to these arrays
// appear in this file as seen.
shared uvec4 Ash[128];
shared uvec4 Bsh[128];

// Number of accumulator tiles kept per invocation group: C_ROWS x C_COLS.
const uint C_ROWS = 2;
const uint C_COLS = 2;

// NOTE(review): the cooperative matrix type parameters were missing from this
// file (bare `coopmat`), which GL_KHR_cooperative_matrix does not accept.
// They are restored here from the otherwise-unused constants lM / lN and
// A_TYPE, and from the shape rule of coopMatMulAdd (A: MxK, B: KxN, C: MxN).
// Confirm against the upstream shader.
coopmat<C_TYPE, gl_ScopeSubgroup, lM, lN, gl_MatrixUseAccumulator> result[C_ROWS][C_COLS];

// Entry point: zero the C_ROWS x C_COLS accumulator tiles, accumulate
// A * B products over the K loop, then store every accumulator tile.
void main()
{
    // Clear all accumulators.
    [[unroll]] for (uint i = 0; i < C_ROWS; ++i) {
        [[unroll]] for (uint j = 0; j < C_COLS; ++j) {
            result[i][j] = coopmat<C_TYPE, gl_ScopeSubgroup, lM, lN, gl_MatrixUseAccumulator>(0.0);
        }
    }

    for (uint chunkK = 0; chunkK < K; chunkK += TILE_K) {
        [[unroll]] for (uint k = 0; k < TILE_K / lK; ++k) {
            // Load one A tile per accumulator row. Every load uses element
            // offset 0 and stride 0, so the source location does not depend
            // on i, k or chunkK.
            coopmat<A_TYPE, gl_ScopeSubgroup, lM, lK, gl_MatrixUseA> matA[C_ROWS];
            [[unroll]] for (uint i = 0; i < C_ROWS; ++i) {
                coopMatLoad(matA[i], Ash, 0, 0, gl_CooperativeMatrixLayoutRowMajor);
            }

            // For each accumulator column, load the B tile (again offset 0,
            // stride 0) and fold it into every row's accumulator.
            coopmat<A_TYPE, gl_ScopeSubgroup, lK, lN, gl_MatrixUseB> matB;
            [[unroll]] for (uint j = 0; j < C_COLS; ++j) {
                coopMatLoad(matB, Bsh, 0, 0, gl_CooperativeMatrixLayoutRowMajor);
                [[unroll]] for (uint i = 0; i < C_ROWS; ++i) {
                    result[i][j] = coopMatMulAdd(matA[i], matB, result[i][j]);
                }
            }
        }
    }

    // Write out every accumulator tile. All stores target element 0 with
    // stride 0, so they address the same location in outputO.x.
    [[unroll]] for (uint i = 0; i < C_ROWS; ++i) {
        [[unroll]] for (uint j = 0; j < C_COLS; ++j) {
            coopMatStore(result[i][j], outputO.x, 0, 0, gl_CooperativeMatrixLayoutRowMajor);
        }
    }
}