#version 450 core
#extension GL_KHR_memory_scope_semantics : enable
#extension GL_KHR_cooperative_matrix : enable
#extension GL_EXT_shader_explicit_arithmetic_types : enable
#extension GL_NV_cooperative_matrix2 : enable

// Compute shader exercising cooperative-matrix conversions, transpose,
// reductions and per-element operations.
//
// NOTE(review): every `coopmat` type and constructor below appears without its
// `<...>` parameter list. The parameters look like they were stripped from this
// copy of the file and must be restored from the original before the shader
// can compile.

layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

// Buffer holding an unsized array of half-precision floats.
buffer BufType {
    float16_t x[];
} Buf;

// Maps an (x, y) pair to a linear index using a fixed row pitch of 64.
// Only referenced by the commented-out tensor load/store calls in main().
uint32_t addr(const in uint32_t x, const in uint32_t y) {
    return y*64+x;
}

// Returns the constant 124.
// Only referenced by the commented-out tensor load/store calls in main().
uint32_t foo() {
    return 124;
}

// Per-element callback: clamps x to be non-negative. row and col are unused.
float16_t relu(const in uint32_t row, const in uint32_t col, const in float16_t x) {
    return max(x, float16_t(0));
}

// Per-element callback: returns x + y. row and col are unused.
float16_t add(const in uint32_t row, const in uint32_t col, const in float16_t x, const in float16_t y) {
    return x+y;
}

// Reduction combiner: sum of the two operands.
float16_t combineSum(const in float16_t a, const in float16_t b) {
    return a + b;
}

// Reduction combiner: maximum of the two operands.
float16_t combineMax(const in float16_t a, const in float16_t b) {
    return max(a, b);
}

// Specialization constant (id 0, default 32). Not referenced in the visible
// code -- presumably used by the stripped coopmat parameter lists; confirm.
layout(constant_id = 0) const uint32_t Dim = 32;

void main()
{
    coopmat A;
    coopmat B;
    coopmat Acc;

    // Construct A and B from Acc.
    A = coopmat(Acc);
    B = coopmat(Acc);

    // Transpose Acc into tr.
    coopmat tr;
    coopMatTransposeNV(tr, Acc);

    // Reductions using combineSum, one per reduce mode.
    coopMatReduceNV(Acc, Acc, gl_CooperativeMatrixReduceRowNV, combineSum);
    coopMatReduceNV(Acc, Acc, gl_CooperativeMatrixReduceColumnNV, combineSum);
    coopMatReduceNV(Acc, Acc, gl_CooperativeMatrixReduceRowAndColumnNV, combineSum);

    // 2x2 reduction of Acc into a separate matrix, using combineMax.
    coopmat Acc2x2;
    coopMatReduceNV(Acc2x2, Acc, gl_CooperativeMatrixReduce2x2NV, combineMax);

    //coopMatLoadTensorNV(A, Buf.x, foo(), addr);
    //coopMatStoreTensorNV(A, Buf.x, foo(), addr);

    // Per-element operations; the second call supplies float16_t(1.0) as an
    // extra argument (presumably forwarded as add()'s y parameter -- confirm
    // against the extension specification).
    coopMatPerElementNV(Acc, Acc, relu);
    coopMatPerElementNV(Acc, Acc, add, float16_t(1.0));

    coopmat Accf16;
    coopmat Accf32;
    coopmat Accu32;
    coopmat Accs32;

    // Constructor expressions whose results are discarded.
    // NOTE(review): with the parameter lists missing, the three calls per
    // source matrix are indistinguishable; they presumably targeted different
    // component types -- confirm against the original file.
    coopmat(Accu32);
    coopmat(Accu32);
    coopmat(Accu32);

    coopmat(Accs32);
    coopmat(Accs32);
    coopmat(Accs32);

    coopmat(Accf16);
    coopmat(Accf16);
    coopmat(Accf16);

    coopmat(Accf32);
    coopmat(Accf32);
    coopmat(Accf32);

    // Construct a matrix from a scalar constant expression; -1.0/0.0 is
    // presumably intended as negative infinity. li is declared but unused.
    coopmat li, mijm1;
    mijm1 = coopmat(-1.0/0.0);
}