#include "cuda.h"
#include <dlfcn.h>
#include <stdbool.h>
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <stdatomic.h>
// Raises a Python exception and returns false if code is not CUDA_SUCCESS.
static bool gpuAssert(CUresult code, const char *file, int line) {
  if (code == CUDA_SUCCESS)
    return true;
  // cuGetErrorString can itself fail (e.g. for an unrecognized error code), in
  // which case fall back to a generic message.
  const char *str = NULL;
  if (cuGetErrorString(code, &str) != CUDA_SUCCESS || str == NULL)
    str = "unknown error";
  char err[1024] = {0};
  snprintf(err, sizeof(err), "Triton Error [CUDA]: %s", str);
  PyGILState_STATE gil_state = PyGILState_Ensure();
  PyErr_SetString(PyExc_RuntimeError, err);
  PyGILState_Release(gil_state);
  return false;
}
// To be used only *outside* a Py_{BEGIN,END}_ALLOW_THREADS block.
#define CUDA_CHECK_AND_RETURN_NULL(ans) \
do { \
if (!gpuAssert((ans), __FILE__, __LINE__)) \
return NULL; \
} while (0)
// To be used inside a Py_{BEGIN,END}_ALLOW_THREADS block.
#define CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(ans) \
do { \
if (!gpuAssert((ans), __FILE__, __LINE__)) { \
PyEval_RestoreThread(_save); \
return NULL; \
} \
} while (0)
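// A minimal sketch of the intended pattern (mirrors loadBinary below):
// Py_BEGIN_ALLOW_THREADS expands to a block that declares
// `PyThreadState *_save;`, which the macro above restores before returning,
// so it only compiles between Py_BEGIN_ALLOW_THREADS and Py_END_ALLOW_THREADS:
//   Py_BEGIN_ALLOW_THREADS;
//   CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxGetCurrent(&pctx));
//   Py_END_ALLOW_THREADS;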
// Lazily resolves driver symbols that may be absent from older CUDA drivers.
#define INITIALIZE_FUNCTION_POINTER_IF_NULL(funcPointer, initializerFunction) \
do { \
if ((funcPointer) == NULL) { \
(funcPointer) = (initializerFunction)(); \
if ((funcPointer) == NULL) { \
return NULL; \
} \
} \
} while (0)
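// Typical use (see occupancyMaxActiveClusters below): a static function
// pointer is resolved once through a getter generated by
// defineGetFunctionHandle, e.g.
//   static cuOccupancyMaxActiveClusters_t cuOccupancyMaxActiveClusters = NULL;
//   INITIALIZE_FUNCTION_POINTER_IF_NULL(cuOccupancyMaxActiveClusters,
//                                       getCuOccupancyMaxActiveClustersHandle);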
static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
int device_id;
if (!PyArg_ParseTuple(args, "i", &device_id))
return NULL;
  // Get device handle
  CUdevice device;
  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGet(&device, device_id));
  // Query the device attributes exposed to Python.
int max_shared_mem;
int max_num_regs;
int multiprocessor_count;
int warp_size;
int sm_clock_rate;
int mem_clock_rate;
int mem_bus_width;
CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
&max_shared_mem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
device));
CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
&max_num_regs, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, device));
CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
&multiprocessor_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
CUDA_CHECK_AND_RETURN_NULL(
cuDeviceGetAttribute(&warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device));
CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
&sm_clock_rate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
&mem_clock_rate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device));
CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
&mem_bus_width, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device));
return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i, s:i, s:i}", "max_shared_mem",
max_shared_mem, "max_num_regs", max_num_regs,
"multiprocessor_count", multiprocessor_count, "warpSize",
warp_size, "sm_clock_rate", sm_clock_rate,
"mem_clock_rate", mem_clock_rate, "mem_bus_width",
mem_bus_width);
}
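// Illustrative Python-side use of get_device_properties (variable names are
// assumptions; the module name matches PyInit_cuda_utils below):
//   import cuda_utils
//   props = cuda_utils.get_device_properties(0)
//   print(props["max_shared_mem"], props["warpSize"])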
static PyObject *loadBinary(PyObject *self, PyObject *args) {
const char *name;
const char *data;
Py_ssize_t data_size;
int shared;
int device;
if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared,
&device)) {
return NULL;
}
CUfunction fun;
CUmodule mod;
int32_t n_regs = 0;
int32_t n_spills = 0;
// create driver handles
CUcontext pctx = 0;
Py_BEGIN_ALLOW_THREADS;
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxGetCurrent(&pctx));
if (!pctx) {
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
cuDevicePrimaryCtxRetain(&pctx, device));
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxSetCurrent(pctx));
}
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuModuleLoadData(&mod, data));
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
cuModuleGetFunction(&fun, mod, name));
// get allocated registers and spilled registers from the function
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
cuFuncGetAttribute(&n_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, fun));
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun));
  // CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES is reported in bytes; divide by 4 to
  // express spills as 32-bit register slots.
  n_spills /= 4;
// set dynamic shared memory if necessary
int shared_optin;
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuDeviceGetAttribute(
&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
device));
  // 49152 bytes (48 KiB) is the default per-block shared memory limit; going
  // beyond it requires the opt-in maximum queried above.
  if (shared > 49152 && shared_optin > 49152) {
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED));
int shared_total, shared_static;
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuDeviceGetAttribute(
&shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
device));
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncGetAttribute(
&shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun));
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
shared_optin - shared_static));
}
Py_END_ALLOW_THREADS;
if (PyErr_Occurred()) {
return NULL;
}
return Py_BuildValue("(KKii)", (uint64_t)mod, (uint64_t)fun, n_regs,
n_spills);
}
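// Illustrative Python-side use of load_binary; `cubin` (the compiled kernel
// image) and `kernel_name` are assumptions about the caller's variables:
//   mod, fun, n_regs, n_spills = cuda_utils.load_binary(
//       kernel_name, cubin, shared_mem_bytes, device_id)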
typedef CUresult (*cuOccupancyMaxActiveClusters_t)(
int *numClusters, CUfunction func, const CUlaunchConfig *config);
typedef CUresult (*cuTensorMapEncodeTiled_t)(
CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType,
cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim,
const cuuint64_t *globalStrides, const cuuint32_t *boxDim,
const cuuint32_t *elementStrides, CUtensorMapInterleave interleave,
CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion,
CUtensorMapFloatOOBfill oobFill);
#define defineGetFunctionHandle(name, symbolName) \
static symbolName##_t name() { \
/* Open the shared library */ \
void *libHandle = dlopen("libcuda.so.1", RTLD_LAZY); \
if (!libHandle) { \
PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1"); \
return NULL; \
} \
/* Clear any existing error */ \
dlerror(); \
symbolName##_t funcHandle = (symbolName##_t)dlsym(libHandle, #symbolName); \
/* Check for errors */ \
const char *err = dlerror(); \
if (err) { \
PyErr_SetString(PyExc_RuntimeError, \
"Failed to retrieve " #symbolName " from libcuda.so.1"); \
dlclose(libHandle); \
return NULL; \
} \
return funcHandle; \
}
defineGetFunctionHandle(getCuOccupancyMaxActiveClustersHandle,
cuOccupancyMaxActiveClusters);
defineGetFunctionHandle(getCuTensorMapEncodeTiledHandle,
cuTensorMapEncodeTiled);
static PyObject *occupancyMaxActiveClusters(PyObject *self, PyObject *args) {
int clusterDimX = -1, clusterDimY = -1, clusterDimZ = -1,
maxActiveClusters = -1;
int shared = 0;
  uint64_t funcHandle = 0;
  // Parse the function handle as a 64-bit integer and cast it, rather than
  // writing through a CUfunction*, so the width always matches the "K" format.
  if (!PyArg_ParseTuple(args, "Kiiii", &funcHandle, &shared, &clusterDimX,
                        &clusterDimY, &clusterDimZ)) {
    return NULL;
  }
  CUfunction func = (CUfunction)(uintptr_t)funcHandle;
// Let each SM have one block
int maxActiveBlocks = 1;
Py_BEGIN_ALLOW_THREADS;
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncSetAttribute(
func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared));
Py_END_ALLOW_THREADS;
CUlaunchAttribute launchAttr[1];
launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
launchAttr[0].value.clusterDim.x = clusterDimX;
launchAttr[0].value.clusterDim.y = clusterDimY;
launchAttr[0].value.clusterDim.z = clusterDimZ;
CUlaunchConfig config;
config.gridDimX = clusterDimX;
config.gridDimY = maxActiveBlocks * clusterDimY;
config.gridDimZ = clusterDimZ;
config.blockDimX = 128;
config.blockDimY = 1;
config.blockDimZ = 1;
config.sharedMemBytes = shared;
config.hStream = 0;
config.numAttrs = 1;
config.attrs = launchAttr;
static cuOccupancyMaxActiveClusters_t cuOccupancyMaxActiveClusters = NULL;
INITIALIZE_FUNCTION_POINTER_IF_NULL(cuOccupancyMaxActiveClusters,
getCuOccupancyMaxActiveClustersHandle);
Py_BEGIN_ALLOW_THREADS;
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncSetAttribute(
func, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1));
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
cuOccupancyMaxActiveClusters(&maxActiveClusters, func, &config));
Py_END_ALLOW_THREADS;
return PyLong_FromLong(maxActiveClusters);
}
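// Illustrative Python-side use of cuOccupancyMaxActiveClusters; `fun` is
// assumed to be the function handle returned by load_binary:
//   n = cuda_utils.cuOccupancyMaxActiveClusters(fun, shared_bytes,
//                                               cluster_x, cluster_y, cluster_z)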
static PyObject *setPrintfFifoSize(PyObject *self, PyObject *args) {
long size;
if (!PyArg_ParseTuple(args, "l", &size)) {
return NULL;
}
if (size < 0) {
PyErr_SetString(PyExc_ValueError, "fifo size must be non-negative");
return NULL;
}
Py_BEGIN_ALLOW_THREADS;
// Ensure we have an active context.
CUcontext ctx = NULL;
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxGetCurrent(&ctx));
if (!ctx) {
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
cuDevicePrimaryCtxRetain(&ctx, /*device=*/0));
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxSetCurrent(ctx));
}
// We can't set the fifo size after running a kernel that calls printf. This
// is true even if the set() call is a nop and the new size is the same as the
// old size.
//
// This is unfriendly, so check if the old size matches the new size, and skip
// the set() call if so.
size_t oldSize = 0;
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
cuCtxGetLimit(&oldSize, CU_LIMIT_PRINTF_FIFO_SIZE));
  // `size` is known to be non-negative here, so the cast is safe.
  if (oldSize != (size_t)size) {
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
        cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, size));
  }
  Py_END_ALLOW_THREADS;
  Py_RETURN_NONE;
}
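// Illustrative Python-side use of set_printf_fifo_size; per the note above, it
// must run before any kernel that calls printf():
//   cuda_utils.set_printf_fifo_size(1 << 20)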
// Simple helper for experimenting with TMA descriptor creation on the host.
// Useful for testing TMA operations independently.
static PyObject *fill1DTMADescriptor(PyObject *self, PyObject *args) {
unsigned long long global_address;
uint64_t dim;
uint32_t tensorDim;
int elementSize;
Py_buffer desc_buffer;
if (!PyArg_ParseTuple(args, "KKiiy*", &global_address, &dim, &tensorDim,
&elementSize, &desc_buffer)) {
return NULL;
}
char *desc = (char *)desc_buffer.buf;
uint64_t dims[1] = {dim};
uint64_t globalStrides[1] = {dim * elementSize};
uint32_t boxDim[1] = {tensorDim};
uint32_t elementStrides[1] = {1};
CUtensorMapDataType type;
switch (elementSize) {
case 1:
type = CU_TENSOR_MAP_DATA_TYPE_UINT8;
break;
case 2:
type = CU_TENSOR_MAP_DATA_TYPE_UINT16;
break;
case 4:
type = CU_TENSOR_MAP_DATA_TYPE_UINT32;
break;
  default:
    PyErr_SetString(PyExc_ValueError, "elementSize must be 1, 2, or 4");
    return NULL;
  }
assert((elementSize * tensorDim) >= 32 && "block size too small.");
int rank = 1;
static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiled = NULL;
INITIALIZE_FUNCTION_POINTER_IF_NULL(cuTensorMapEncodeTiled,
getCuTensorMapEncodeTiledHandle);
CUresult result = cuTensorMapEncodeTiled(
(CUtensorMap *)desc, type, rank, (void *)global_address, dims,
globalStrides, boxDim, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE,
CU_TENSOR_MAP_SWIZZLE_NONE, CU_TENSOR_MAP_L2_PROMOTION_NONE,
CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
  // Report failures as Python exceptions instead of relying on assert(),
  // which is compiled out in release builds.
  CUDA_CHECK_AND_RETURN_NULL(result);
  Py_RETURN_NONE;
}
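// Illustrative Python-side use of fill_1d_tma_descriptor; `ptr` is assumed to
// be the tensor's device address, and the descriptor buffer must be writable
// and sizeof(CUtensorMap) == 128 bytes:
//   desc = bytearray(128)
//   cuda_utils.fill_1d_tma_descriptor(ptr, dim, block_dim, elem_size, desc)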
// Simple helper for experimenting with TMA descriptor creation on the host.
// Useful for testing TMA operations independently.
static PyObject *fill2DTMADescriptor(PyObject *self, PyObject *args) {
unsigned long long global_address;
uint64_t dims[2];
uint32_t tensorDims[2];
int elementSize;
Py_buffer desc_buffer;
if (!PyArg_ParseTuple(args, "KKKiiiy*", &global_address, &dims[1], &dims[0],
&tensorDims[1], &tensorDims[0], &elementSize,
&desc_buffer)) {
return NULL;
}
char *desc = (char *)desc_buffer.buf;
uint64_t globalStrides[2] = {dims[0] * elementSize,
dims[0] * dims[1] * elementSize};
uint32_t elementStrides[2] = {1, 1};
CUtensorMapDataType type;
switch (elementSize) {
case 1:
type = CU_TENSOR_MAP_DATA_TYPE_UINT8;
break;
case 2:
type = CU_TENSOR_MAP_DATA_TYPE_UINT16;
break;
case 4:
type = CU_TENSOR_MAP_DATA_TYPE_UINT32;
break;
  default:
    PyErr_SetString(PyExc_ValueError, "elementSize must be 1, 2, or 4");
    return NULL;
  }
int rank = 2;
  // Swizzling should be picked by codegen, but since it has to be set on the
  // descriptor here, this function and codegen rely on a shared convention.
CUtensorMapSwizzle swizzle = CU_TENSOR_MAP_SWIZZLE_128B;
uint32_t contigDimSizeInByte = elementSize * tensorDims[0];
if (contigDimSizeInByte >= 128) {
swizzle = CU_TENSOR_MAP_SWIZZLE_128B;
} else if (contigDimSizeInByte >= 64) {
swizzle = CU_TENSOR_MAP_SWIZZLE_64B;
} else if (contigDimSizeInByte >= 32) {
swizzle = CU_TENSOR_MAP_SWIZZLE_32B;
} else {
assert(false && "block size too small.");
}
// The bounding box inner dimension must be less than or equal to the swizzle
// size.
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html#group__CUDA__TENSOR__MEMORY_1ga7c7d2aaac9e49294304e755e6f341d7
// We clamp the block size and the codegen will emit multiple copy operations.
if (contigDimSizeInByte > 128) {
tensorDims[0] = 128 / elementSize;
}
static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiled = NULL;
INITIALIZE_FUNCTION_POINTER_IF_NULL(cuTensorMapEncodeTiled,
getCuTensorMapEncodeTiledHandle);
CUresult result = cuTensorMapEncodeTiled(
(CUtensorMap *)desc, type, rank, (void *)global_address, dims,
globalStrides, tensorDims, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE,
swizzle, CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
  // As in fill1DTMADescriptor, surface failures as Python exceptions rather
  // than an assert() that vanishes in release builds.
  CUDA_CHECK_AND_RETURN_NULL(result);
  Py_RETURN_NONE;
}
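// Illustrative Python-side use of fill_2d_tma_descriptor; note the parse order
// above: (address, outer dim, inner dim, outer block, inner block, element
// size, descriptor buffer). `ptr`, `rows`, `cols` are assumptions for a
// row-major tensor whose contiguous dimension is `cols`:
//   desc = bytearray(128)
//   cuda_utils.fill_2d_tma_descriptor(ptr, rows, cols, block_rows, block_cols,
//                                     elem_size, desc)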
static PyMethodDef ModuleMethods[] = {
{"load_binary", loadBinary, METH_VARARGS,
"Load provided cubin into CUDA driver"},
{"get_device_properties", getDeviceProperties, METH_VARARGS,
"Get the properties for a given device"},
{"cuOccupancyMaxActiveClusters", occupancyMaxActiveClusters, METH_VARARGS,
"Python interface for cuOccupancyMaxActiveClusters function"},
{"set_printf_fifo_size", setPrintfFifoSize, METH_VARARGS,
"Python interface for cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, x), which "
"controls how many bytes can be streamed from kernels before data starts "
"being dropped. This inherits all the limitations of this call; in "
"particular it's an error to change this value after launching any kernel "
"that calls printf()."},
{"fill_1d_tma_descriptor", fill1DTMADescriptor, METH_VARARGS, "doc"},
{"fill_2d_tma_descriptor", fill2DTMADescriptor, METH_VARARGS, "doc"},
{NULL, NULL, 0, NULL} // sentinel
};
static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "cuda_utils",
NULL, // documentation
-1, // size
ModuleMethods};
PyMODINIT_FUNC PyInit_cuda_utils(void) {
PyObject *m = PyModule_Create(&ModuleDef);
if (m == NULL) {
return NULL;
}
PyModule_AddFunctions(m, ModuleMethods);
return m;
}
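// Build sketch (illustrative only; the source file name, CUDA install path,
// and flags are assumptions about the local setup):
//   cc -shared -fPIC cuda_utils.c \
//      -o cuda_utils$(python3-config --extension-suffix) \
//      $(python3-config --includes) -I/usr/local/cuda/include -lcuda -ldl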