/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/**
* CUDA Occupancy Calculator
*
* NAME
*
* cudaOccMaxActiveBlocksPerMultiprocessor,
* cudaOccMaxPotentialOccupancyBlockSize,
* cudaOccMaxPotentialOccupancyBlockSizeVariableSMem
* cudaOccAvailableDynamicSMemPerBlock
*
* DESCRIPTION
*
 * The CUDA occupancy calculator provides a standalone, programmatic
* interface to compute the occupancy of a function on a device. It can also
* provide occupancy-oriented launch configuration suggestions.
*
* The function and device are defined by the user through
* cudaOccFuncAttributes, cudaOccDeviceProp, and cudaOccDeviceState
* structures. All APIs require all 3 of them.
*
* See the structure definition for more details about the device / function
* descriptors.
*
* See each API's prototype for API usage.
*
* COMPATIBILITY
*
* The occupancy calculator will be updated on each major CUDA toolkit
 * release. It does not provide forward compatibility, i.e. new hardware
* released after this implementation's release will not be supported.
*
* NOTE
*
 * If the CUDA runtime is accessible, and the sole intent is to calculate
 * occupancy-related values on one of the accessible CUDA devices, using the CUDA
 * runtime's occupancy calculation APIs is recommended.
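 *
 * For illustration, a minimal sketch of that route (assuming a kernel foo
 * declared elsewhere and the cuda_runtime.h header):
 *
 *     int numBlocks, minGridSize, blockSize;
 *     cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, foo, 256, 0);
 *     cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, foo, 0, 0);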
*
*/
#ifndef __cuda_occupancy_h__
#define __cuda_occupancy_h__
#include <stddef.h>
#include <limits.h>
#include <string.h>
// __OCC_INLINE will be undefined at the end of this header
//
#ifdef __CUDACC__
#define __OCC_INLINE inline __host__ __device__
#elif defined _MSC_VER
#define __OCC_INLINE __inline
#else // GNUCC assumed
#define __OCC_INLINE inline
#endif
enum cudaOccError_enum {
CUDA_OCC_SUCCESS = 0, // no error encountered
CUDA_OCC_ERROR_INVALID_INPUT = 1, // input parameter is invalid
CUDA_OCC_ERROR_UNKNOWN_DEVICE = 2, // requested device is not supported in
// current implementation or device is
// invalid
};
typedef enum cudaOccError_enum cudaOccError;
typedef struct cudaOccResult cudaOccResult;
typedef struct cudaOccDeviceProp cudaOccDeviceProp;
typedef struct cudaOccFuncAttributes cudaOccFuncAttributes;
typedef struct cudaOccDeviceState cudaOccDeviceState;
/**
* The CUDA occupancy calculator computes the occupancy of the function
* described by attributes with the given block size (blockSize), static device
 * properties (properties), dynamic device state (state), and per-block dynamic
 * shared memory allocation (dynamicSMemSize) in bytes, and outputs it through
* result along with other useful information. The occupancy is computed in
* terms of the maximum number of active blocks per multiprocessor. The user can
* then convert it to other metrics, such as number of active warps.
*
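 * EXAMPLE
 *
 * A minimal usage sketch (assuming occProp, occAttr, and occState have been
 * initialized elsewhere, as described in the structure definitions below):
 *
 *     cudaOccResult result;
 *     cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor(
 *         &result, &occProp, &occAttr, &occState, 256, 0);
 *     if (status == CUDA_OCC_SUCCESS) {
 *         // Convert blocks per SM to active warps per SM for a 256-thread block
 *         int activeWarps = result.activeBlocksPerMultiprocessor *
 *                           (256 / occProp.warpSize);
 *     }
 *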
* RETURN VALUE
*
* The occupancy and related information is returned through result.
*
* If result->activeBlocksPerMultiprocessor is 0, then the given parameter
* combination cannot run on the device.
*
* ERRORS
*
* CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
* CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
* current implementation or device is invalid
*/
static __OCC_INLINE
cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
cudaOccResult *result, // out
const cudaOccDeviceProp *properties, // in
const cudaOccFuncAttributes *attributes, // in
const cudaOccDeviceState *state, // in
int blockSize, // in
size_t dynamicSmemSize); // in
/**
* The CUDA launch configurator C API suggests a grid / block size pair (in
* minGridSize and blockSize) that achieves the best potential occupancy
* (i.e. maximum number of active warps with the smallest number of blocks) for
* the given function described by attributes, on a device described by
* properties with settings in state.
*
 * If per-block dynamic shared memory allocation is not needed, the user should
 * set blockSizeToDynamicSMemSize to NULL and dynamicSMemSize to 0.
*
* If per-block dynamic shared memory allocation is needed, then if the dynamic
* shared memory size is constant regardless of block size, the size should be
* passed through dynamicSMemSize, and blockSizeToDynamicSMemSize should be
* NULL.
*
* Otherwise, if the per-block dynamic shared memory size varies with different
 * block sizes, the user needs to provide a pointer to a unary function through
* blockSizeToDynamicSMemSize that computes the dynamic shared memory needed by
* a block of the function for any given block size. dynamicSMemSize is
* ignored. An example signature is:
*
* // Take block size, returns dynamic shared memory needed
* size_t blockToSmem(int blockSize);
*
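 * EXAMPLE
 *
 * One possible call for the variable case (an illustrative sketch; blockToSmem
 * is the user-provided mapping shown above, and occProp, occAttr, and occState
 * are assumed to be initialized elsewhere):
 *
 *     int minGridSize, blockSize;
 *     cudaOccError status = cudaOccMaxPotentialOccupancyBlockSize(
 *         &minGridSize, &blockSize, &occProp, &occAttr, &occState,
 *         blockToSmem, 0);
 *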
* RETURN VALUE
*
* The suggested block size and the minimum number of blocks needed to achieve
* the maximum occupancy are returned through blockSize and minGridSize.
*
* If *blockSize is 0, then the given combination cannot run on the device.
*
* ERRORS
*
* CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
* CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
* current implementation or device is invalid
*
*/
static __OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
int *minGridSize, // out
int *blockSize, // out
const cudaOccDeviceProp *properties, // in
const cudaOccFuncAttributes *attributes, // in
const cudaOccDeviceState *state, // in
size_t (*blockSizeToDynamicSMemSize)(int), // in
size_t dynamicSMemSize); // in
/**
* The CUDA launch configurator C++ API suggests a grid / block size pair (in
* minGridSize and blockSize) that achieves the best potential occupancy
* (i.e. the maximum number of active warps with the smallest number of blocks)
* for the given function described by attributes, on a device described by
* properties with settings in state.
*
* If per-block dynamic shared memory allocation is 0 or constant regardless of
* block size, the user can use cudaOccMaxPotentialOccupancyBlockSize to
* configure the launch. A constant dynamic shared memory allocation size in
* bytes can be passed through dynamicSMemSize.
*
* Otherwise, if the per-block dynamic shared memory size varies with different
* block sizes, the user needs to use
 * cudaOccMaxPotentialOccupancyBlockSizeVariableSMem instead, and provide a
 * functor / pointer to a unary function (blockSizeToDynamicSMemSize) that
* computes the dynamic shared memory needed by func for any given block
* size. An example signature is:
*
* // Take block size, returns per-block dynamic shared memory needed
* size_t blockToSmem(int blockSize);
*
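 * EXAMPLE
 *
 * A minimal C++ sketch (illustrative only; occProp, occAttr, and occState are
 * assumed to be initialized elsewhere, and a C++11 lambda is shown for brevity):
 *
 *     int minGridSize, blockSize;
 *     cudaOccError status = cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
 *         &minGridSize, &blockSize, &occProp, &occAttr, &occState,
 *         [](int block) { return block * sizeof(int); });
 *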
* RETURN VALUE
*
* The suggested block size and the minimum number of blocks needed to achieve
* the maximum occupancy are returned through blockSize and minGridSize.
*
* If *blockSize is 0, then the given combination cannot run on the device.
*
* ERRORS
*
* CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
* CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
* current implementation or device is invalid
*
*/
#if defined(__cplusplus)
namespace {
__OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
int *minGridSize, // out
int *blockSize, // out
const cudaOccDeviceProp *properties, // in
const cudaOccFuncAttributes *attributes, // in
const cudaOccDeviceState *state, // in
size_t dynamicSMemSize = 0); // in
template <typename UnaryFunction>
__OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
int *minGridSize, // out
int *blockSize, // out
const cudaOccDeviceProp *properties, // in
const cudaOccFuncAttributes *attributes, // in
const cudaOccDeviceState *state, // in
UnaryFunction blockSizeToDynamicSMemSize); // in
} // namespace anonymous
#endif // defined(__cplusplus)
/**
*
 * The CUDA dynamic shared memory calculator computes the maximum size of
 * per-block dynamic shared memory that allows numBlocks blocks to be placed
 * on an SM.
*
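 * EXAMPLE
 *
 * A minimal usage sketch (illustrative only; occProp, occAttr, and occState are
 * assumed to be initialized elsewhere):
 *
 *     size_t dynamicSmemSize = 0;
 *     cudaOccError status = cudaOccAvailableDynamicSMemPerBlock(
 *         &dynamicSmemSize, &occProp, &occAttr, &occState,
 *         2,     // desired blocks per SM
 *         256);  // block size
 *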
* RETURN VALUE
*
* Returns in *dynamicSmemSize the maximum size of dynamic shared memory to allow
* numBlocks blocks per SM.
*
* ERRORS
*
* CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
* CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
* current implementation or device is invalid
*
*/
static __OCC_INLINE
cudaOccError cudaOccAvailableDynamicSMemPerBlock(
size_t *dynamicSmemSize,
const cudaOccDeviceProp *properties,
const cudaOccFuncAttributes *attributes,
const cudaOccDeviceState *state,
int numBlocks,
int blockSize);
/**
* Data structures
*
* These structures are subject to change for future architecture and CUDA
* releases. C users should initialize the structure as {0}.
*
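 * For example, a C user might write (a sketch; the fields must still be filled
 * in from a real device / kernel before use):
 *
 *     struct cudaOccDeviceProp     occProp  = {0};
 *     struct cudaOccFuncAttributes occAttr  = {0};
 *     struct cudaOccDeviceState    occState = {0};
 *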
*/
/**
* Device descriptor
*
* This structure describes a device.
*/
struct cudaOccDeviceProp {
int computeMajor; // Compute capability major version
int computeMinor; // Compute capability minor
// version. An unsupported minor version
// may cause an error
int maxThreadsPerBlock; // Maximum number of threads per block
int maxThreadsPerMultiprocessor; // Maximum number of threads per SM
// i.e. (Max. number of warps) x (warp
// size)
int regsPerBlock; // Maximum number of registers per block
int regsPerMultiprocessor; // Maximum number of registers per SM
int warpSize; // Warp size
size_t sharedMemPerBlock; // Maximum shared memory size per block
size_t sharedMemPerMultiprocessor; // Maximum shared memory size per SM
int numSms; // Number of SMs available
size_t sharedMemPerBlockOptin; // Maximum opt-in shared memory size per block
size_t reservedSharedMemPerBlock; // Shared memory per block reserved by driver
#ifdef __cplusplus
// This structure can be converted from a cudaDeviceProp structure for users
// that use this header in their CUDA applications.
//
// If the application has access to the CUDA Runtime API, the application
// can obtain the device properties of a CUDA device through
// cudaGetDeviceProperties, and initialize a cudaOccDeviceProp with the
// cudaDeviceProp structure.
//
// Example:
/*
{
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, ...);
cudaOccDeviceProp occProp = prop;
...
cudaOccMaxPotentialOccupancyBlockSize(..., &occProp, ...);
}
*/
//
template<typename DeviceProp>
__OCC_INLINE
cudaOccDeviceProp(const DeviceProp &props)
: computeMajor (props.major),
computeMinor (props.minor),
maxThreadsPerBlock (props.maxThreadsPerBlock),
maxThreadsPerMultiprocessor (props.maxThreadsPerMultiProcessor),
regsPerBlock (props.regsPerBlock),
regsPerMultiprocessor (props.regsPerMultiprocessor),
warpSize (props.warpSize),
sharedMemPerBlock (props.sharedMemPerBlock),
sharedMemPerMultiprocessor (props.sharedMemPerMultiprocessor),
numSms (props.multiProcessorCount),
sharedMemPerBlockOptin (props.sharedMemPerBlockOptin),
reservedSharedMemPerBlock (props.reservedSharedMemPerBlock)
{}
__OCC_INLINE
cudaOccDeviceProp()
: computeMajor (0),
computeMinor (0),
maxThreadsPerBlock (0),
maxThreadsPerMultiprocessor (0),
regsPerBlock (0),
regsPerMultiprocessor (0),
warpSize (0),
sharedMemPerBlock (0),
sharedMemPerMultiprocessor (0),
numSms (0),
sharedMemPerBlockOptin (0),
reservedSharedMemPerBlock (0)
{}
#endif // __cplusplus
};
/**
* Partitioned global caching option
*/
typedef enum cudaOccPartitionedGCConfig_enum {
PARTITIONED_GC_OFF, // Disable partitioned global caching
PARTITIONED_GC_ON, // Prefer partitioned global caching
PARTITIONED_GC_ON_STRICT // Force partitioned global caching
} cudaOccPartitionedGCConfig;
/**
 * Per-function opt-in maximum dynamic shared memory limit
*/
typedef enum cudaOccFuncShmemConfig_enum {
FUNC_SHMEM_LIMIT_DEFAULT, // Default shmem limit
FUNC_SHMEM_LIMIT_OPTIN, // Use the opt-in shmem limit
} cudaOccFuncShmemConfig;
/**
* Function descriptor
*
* This structure describes a CUDA function.
*/
struct cudaOccFuncAttributes {
int maxThreadsPerBlock; // Maximum block size the function can work with. If
// unlimited, use INT_MAX or any value greater than
// or equal to maxThreadsPerBlock of the device
int numRegs; // Number of registers used. When the function is
// launched on the device, the register count may change
// due to internal tool requirements.
size_t sharedSizeBytes; // Amount of static shared memory used
cudaOccPartitionedGCConfig partitionedGCConfig;
// Partitioned global caching is required to enable
// caching on certain chips, such as sm_52
// devices. Partitioned global caching can be
// automatically disabled if the occupancy
// requirement of the launch cannot support caching.
//
// To override this behavior with caching on and
// calculate occupancy strictly according to the
// preference, set partitionedGCConfig to
// PARTITIONED_GC_ON_STRICT. This is especially
// useful for experimenting and finding launch
// configurations (MaxPotentialOccupancyBlockSize)
// that allow global caching to take effect.
//
// This flag only affects the occupancy calculation.
cudaOccFuncShmemConfig shmemLimitConfig;
// Certain chips like sm_70 allow a user to opt into
// a higher per-block limit of dynamic shared memory.
// This opt-in is performed on a per-function basis
// using the cuFuncSetAttribute function.
size_t maxDynamicSharedSizeBytes;
// User-set limit on maximum dynamic shared memory
// usable by the kernel.
// This limit is set using the cuFuncSetAttribute
// function.
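// For example, via the driver API (a hedged sketch; func is a
// CUfunction handle obtained elsewhere):
//   cuFuncSetAttribute(func,
//       CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 96 * 1024);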
int numBlockBarriers; // Number of block barriers used (defaults to 1)
#ifdef __cplusplus
// This structure can be converted from a cudaFuncAttributes structure for
// users that use this header in their CUDA applications.
//
// If the application has access to the CUDA Runtime API, the application
// can obtain the function attributes of a CUDA kernel function through
// cudaFuncGetAttributes, and initialize a cudaOccFuncAttributes with the
// cudaFuncAttributes structure.
//
// Example:
/*
__global__ void foo() {...}
...
{
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr, foo);
cudaOccFuncAttributes occAttr = attr;
...
cudaOccMaxPotentialOccupancyBlockSize(..., &occAttr, ...);
}
*/
//
template<typename FuncAttributes>
__OCC_INLINE
cudaOccFuncAttributes(const FuncAttributes &attr)
: maxThreadsPerBlock (attr.maxThreadsPerBlock),
numRegs (attr.numRegs),
sharedSizeBytes (attr.sharedSizeBytes),
partitionedGCConfig (PARTITIONED_GC_OFF),
shmemLimitConfig (FUNC_SHMEM_LIMIT_OPTIN),
maxDynamicSharedSizeBytes (attr.maxDynamicSharedSizeBytes),
numBlockBarriers (1)
{}
__OCC_INLINE
cudaOccFuncAttributes()
: maxThreadsPerBlock (0),
numRegs (0),
sharedSizeBytes (0),
partitionedGCConfig (PARTITIONED_GC_OFF),
shmemLimitConfig (FUNC_SHMEM_LIMIT_DEFAULT),
maxDynamicSharedSizeBytes (0),
numBlockBarriers (0)
{}
#endif
};
typedef enum cudaOccCacheConfig_enum {
CACHE_PREFER_NONE = 0x00, // no preference for shared memory or L1 (default)
CACHE_PREFER_SHARED = 0x01, // prefer larger shared memory and smaller L1 cache
CACHE_PREFER_L1 = 0x02, // prefer larger L1 cache and smaller shared memory
CACHE_PREFER_EQUAL = 0x03 // prefer equal sized L1 cache and shared memory
} cudaOccCacheConfig;
typedef enum cudaOccCarveoutConfig_enum {
SHAREDMEM_CARVEOUT_DEFAULT = -1, // no preference for shared memory or L1 (default)
SHAREDMEM_CARVEOUT_MAX_SHARED = 100, // prefer maximum available shared memory, minimum L1 cache
SHAREDMEM_CARVEOUT_MAX_L1 = 0, // prefer maximum available L1 cache, minimum shared memory
SHAREDMEM_CARVEOUT_HALF = 50 // prefer half of maximum available shared memory, with the rest as L1 cache
} cudaOccCarveoutConfig;
/**
* Device state descriptor
*
* This structure describes device settings that affect occupancy calculation.
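 *
 * For example (a hedged sketch), to model a device configured to prefer the
 * maximum shared memory carveout on Volta and newer:
 *
 *     cudaOccDeviceState occState;   // C++ default constructor fills in defaults
 *     occState.carveoutConfig = SHAREDMEM_CARVEOUT_MAX_SHARED;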
*/
struct cudaOccDeviceState
{
// Cache / shared memory split preference. Deprecated on Volta
cudaOccCacheConfig cacheConfig;
// Shared memory / L1 split preference. Supported only on Volta and newer
int carveoutConfig;
#ifdef __cplusplus
__OCC_INLINE
cudaOccDeviceState()
: cacheConfig (CACHE_PREFER_NONE),
carveoutConfig (SHAREDMEM_CARVEOUT_DEFAULT)
{}
#endif
};
typedef enum cudaOccLimitingFactor_enum {
// Occupancy limited due to:
OCC_LIMIT_WARPS = 0x01, // - warps available
OCC_LIMIT_REGISTERS = 0x02, // - registers available
OCC_LIMIT_SHARED_MEMORY = 0x04, // - shared memory available
OCC_LIMIT_BLOCKS = 0x08, // - blocks available
OCC_LIMIT_BARRIERS = 0x10 // - barriers available
} cudaOccLimitingFactor;
/**
* Occupancy output
*
* This structure contains occupancy calculator's output.
*/
struct cudaOccResult {
int activeBlocksPerMultiprocessor; // Occupancy
unsigned int limitingFactors; // Factors that limited occupancy. A bit
// field that records the limiting
// factors; see cudaOccLimitingFactor
int blockLimitRegs; // Occupancy due to register
// usage, INT_MAX if the kernel does not
// use any register.
int blockLimitSharedMem; // Occupancy due to shared memory
// usage, INT_MAX if the kernel does not
// use shared memory.
int blockLimitWarps; // Occupancy due to block size limit
int blockLimitBlocks; // Occupancy due to maximum number of blocks
// manageable per SM
int blockLimitBarriers; // Occupancy due to block barrier usage
int allocatedRegistersPerBlock; // Actual number of registers allocated per
// block
size_t allocatedSharedMemPerBlock; // Actual size of shared memory allocated
// per block
cudaOccPartitionedGCConfig partitionedGCConfig;
// Reports whether partitioned global caching
// is actually enabled.
};
/**
* Partitioned global caching support
*
* See cudaOccPartitionedGlobalCachingModeSupport
*/
typedef enum cudaOccPartitionedGCSupport_enum {
PARTITIONED_GC_NOT_SUPPORTED, // Partitioned global caching is not supported
PARTITIONED_GC_SUPPORTED, // Partitioned global caching is supported
} cudaOccPartitionedGCSupport;
/**
* Implementation
*/
/**
* Max compute capability supported
*/
#define __CUDA_OCC_MAJOR__ 9
#define __CUDA_OCC_MINOR__ 0
//////////////////////////////////////////
// Mathematical Helper Functions //
//////////////////////////////////////////
static __OCC_INLINE int __occMin(int lhs, int rhs)
{
return rhs < lhs ? rhs : lhs;
}
static __OCC_INLINE int __occDivideRoundUp(int x, int y)
{
return (x + (y - 1)) / y;
}
static __OCC_INLINE int __occRoundUp(int x, int y)
{
return y * __occDivideRoundUp(x, y);
}
//////////////////////////////////////////
// Architectural Properties //
//////////////////////////////////////////
/**
* Granularity of shared memory allocation
*/
static __OCC_INLINE cudaOccError cudaOccSMemAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
{
int value;
switch(properties->computeMajor) {
case 3:
case 5:
case 6:
case 7:
value = 256;
break;
case 8:
case 9:
value = 128;
break;
default:
return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
}
*limit = value;
return CUDA_OCC_SUCCESS;
}
/**
* Maximum number of registers per thread
*/
static __OCC_INLINE cudaOccError cudaOccRegAllocationMaxPerThread(int *limit, const cudaOccDeviceProp *properties)
{
int value;
switch(properties->computeMajor) {
case 3:
case 5:
case 6:
value = 255;
break;
case 7:
case 8:
case 9:
value = 256;
break;
default:
return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
}
*limit = value;
return CUDA_OCC_SUCCESS;
}
/**
* Granularity of register allocation
*/
static __OCC_INLINE cudaOccError cudaOccRegAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
{
int value;
switch(properties->computeMajor) {
case 3:
case 5:
case 6:
case 7:
case 8:
case 9:
value = 256;
break;
default:
return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
}
*limit = value;
return CUDA_OCC_SUCCESS;
}
/**
* Number of sub-partitions
*/
static __OCC_INLINE cudaOccError cudaOccSubPartitionsPerMultiprocessor(int *limit, const cudaOccDeviceProp *properties)
{
int value;
switch(properties->computeMajor) {
case 3:
case 5:
case 7:
case 8:
case 9:
value = 4;
break;
case 6:
value = properties->computeMinor ? 4 : 2;
break;
default:
return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
}
*limit = value;
return CUDA_OCC_SUCCESS;
}
/**
* Maximum number of blocks that can run simultaneously on a multiprocessor
*/
static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerMultiprocessor(int* limit, const cudaOccDeviceProp *properties)
{
int value;
switch(properties->computeMajor) {
case 3:
value = 16;
break;
case 5:
case 6:
value = 32;
break;
case 7: {
int isTuring = properties->computeMinor == 5;
value = (isTuring) ? 16 : 32;
break;
}
case 8:
if (properties->computeMinor == 0) {
value = 32;
}
else if (properties->computeMinor == 9) {
value = 24;
}
else {
value = 16;
}
break;
case 9:
value = 32;
break;
default:
return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
}
*limit = value;
return CUDA_OCC_SUCCESS;
}
/**
* Align up shared memory based on compute major configurations
*/
static __OCC_INLINE cudaOccError cudaOccAlignUpShmemSizeVoltaPlus(size_t *shMemSize, const cudaOccDeviceProp *properties)
{
// Volta and newer architectures have combined L1 cache / shared memory, and
// support carveout configuration to trade one for the other. These values are
// needed to map the carveout config ratio to the next available architecture size
size_t size = *shMemSize;
switch (properties->computeMajor) {
case 7: {
// Turing supports 32KB and 64KB shared mem.
int isTuring = properties->computeMinor == 5;
if (isTuring) {
if (size <= 32 * 1024) {
*shMemSize = 32 * 1024;
}
else if (size <= 64 * 1024) {
*shMemSize = 64 * 1024;
}
else {
return CUDA_OCC_ERROR_INVALID_INPUT;
}
}
// Volta supports 0KB, 8KB, 16KB, 32KB, 64KB, and 96KB shared mem.
else {
if (size == 0) {
*shMemSize = 0;
}
else if (size <= 8 * 1024) {
*shMemSize = 8 * 1024;
}
else if (size <= 16 * 1024) {
*shMemSize = 16 * 1024;
}
else if (size <= 32 * 1024) {
*shMemSize = 32 * 1024;
}
else if (size <= 64 * 1024) {
*shMemSize = 64 * 1024;
}
else if (size <= 96 * 1024) {
*shMemSize = 96 * 1024;
}
else {
return CUDA_OCC_ERROR_INVALID_INPUT;
}
}
break;
}
case 8:
if (properties->computeMinor == 0 || properties->computeMinor == 7) {
if (size == 0) {
*shMemSize = 0;
}
else if (size <= 8 * 1024) {
*shMemSize = 8 * 1024;
}
else if (size <= 16 * 1024) {
*shMemSize = 16 * 1024;
}
else if (size <= 32 * 1024) {
*shMemSize = 32 * 1024;
}
else if (size <= 64 * 1024) {
*shMemSize = 64 * 1024;
}
else if (size <= 100 * 1024) {
*shMemSize = 100 * 1024;
}
else if (size <= 132 * 1024) {
*shMemSize = 132 * 1024;
}
else if (size <= 164 * 1024) {
*shMemSize = 164 * 1024;
}
else {
return CUDA_OCC_ERROR_INVALID_INPUT;
}
}
else {
if (size == 0) {
*shMemSize = 0;
}
else if (size <= 8 * 1024) {
*shMemSize = 8 * 1024;
}
else if (size <= 16 * 1024) {
*shMemSize = 16 * 1024;
}
else if (size <= 32 * 1024) {
*shMemSize = 32 * 1024;
}
else if (size <= 64 * 1024) {
*shMemSize = 64 * 1024;
}
else if (size <= 100 * 1024) {
*shMemSize = 100 * 1024;
}
else {
return CUDA_OCC_ERROR_INVALID_INPUT;
}
}
break;
case 9: {
if (size == 0) {
*shMemSize = 0;
}
else if (size <= 8 * 1024) {
*shMemSize = 8 * 1024;
}
else if (size <= 16 * 1024) {
*shMemSize = 16 * 1024;
}
else if (size <= 32 * 1024) {
*shMemSize = 32 * 1024;
}
else if (size <= 64 * 1024) {
*shMemSize = 64 * 1024;
}
else if (size <= 100 * 1024) {
*shMemSize = 100 * 1024;
}
else if (size <= 132 * 1024) {
*shMemSize = 132 * 1024;
}
else if (size <= 164 * 1024) {
*shMemSize = 164 * 1024;
}
else if (size <= 196 * 1024) {
*shMemSize = 196 * 1024;
}
else if (size <= 228 * 1024) {
*shMemSize = 228 * 1024;
}
else {
return CUDA_OCC_ERROR_INVALID_INPUT;
}
break;
}
default:
return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
}
return CUDA_OCC_SUCCESS;
}
/**
* Shared memory based on the new carveoutConfig API introduced with Volta
*/
static __OCC_INLINE cudaOccError cudaOccSMemPreferenceVoltaPlus(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
{
cudaOccError status = CUDA_OCC_SUCCESS;
size_t preferenceShmemSize;
// CUDA 9.0 introduces a new API to set the shared memory / L1 configuration on supported
// devices. This preference will take precedence over the older cacheConfig setting.
// Map cacheConfig to its effective preference value.
int effectivePreference = state->carveoutConfig;
if ((effectivePreference < SHAREDMEM_CARVEOUT_DEFAULT) || (effectivePreference > SHAREDMEM_CARVEOUT_MAX_SHARED)) {
return CUDA_OCC_ERROR_INVALID_INPUT;
}
if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) {
switch (state->cacheConfig)
{
case CACHE_PREFER_L1:
effectivePreference = SHAREDMEM_CARVEOUT_MAX_L1;
break;
case CACHE_PREFER_SHARED:
effectivePreference = SHAREDMEM_CARVEOUT_MAX_SHARED;
break;
case CACHE_PREFER_EQUAL:
effectivePreference = SHAREDMEM_CARVEOUT_HALF;
break;
default:
effectivePreference = SHAREDMEM_CARVEOUT_DEFAULT;
break;
}
}
if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) {
preferenceShmemSize = properties->sharedMemPerMultiprocessor;
}
else {
preferenceShmemSize = (size_t) (effectivePreference * properties->sharedMemPerMultiprocessor) / 100;
}
status = cudaOccAlignUpShmemSizeVoltaPlus(&preferenceShmemSize, properties);
*limit = preferenceShmemSize;
return status;
}
/**
* Shared memory based on the cacheConfig
*/
static __OCC_INLINE cudaOccError cudaOccSMemPreference(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
{
size_t bytes = 0;
size_t sharedMemPerMultiprocessorHigh = properties->sharedMemPerMultiprocessor;
cudaOccCacheConfig cacheConfig = state->cacheConfig;
// Kepler has shared L1 cache / shared memory, and supports cache
// configuration to trade one for the other. These values are needed to
// calculate the correct shared memory size for user requested cache
// configuration.
//
size_t minCacheSize = 16384;
size_t maxCacheSize = 49152;
size_t cacheAndSharedTotal = sharedMemPerMultiprocessorHigh + minCacheSize;
size_t sharedMemPerMultiprocessorLow = cacheAndSharedTotal - maxCacheSize;
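// For example (illustrative numbers only): with the common Kepler value of
// 48KB (49152B) for sharedMemPerMultiprocessor, cacheAndSharedTotal is 64KB,
// so the "low" shared memory setting is 64KB - 48KB = 16KB, and
// CACHE_PREFER_EQUAL below yields (48KB + 16KB) / 2 = 32KB.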
switch (properties->computeMajor) {
case 3:
// Kepler supports 16KB, 32KB, or 48KB partitions for L1. The rest
// is shared memory.
//
switch (cacheConfig) {
default :
case CACHE_PREFER_NONE:
case CACHE_PREFER_SHARED:
bytes = sharedMemPerMultiprocessorHigh;
break;
case CACHE_PREFER_L1:
bytes = sharedMemPerMultiprocessorLow;
break;
case CACHE_PREFER_EQUAL:
// Equal is the mid-point between high and low. It should be
// equivalent to low + 16KB.
//
bytes = (sharedMemPerMultiprocessorHigh + sharedMemPerMultiprocessorLow) / 2;
break;
}
break;
case 5:
case 6:
// Maxwell and Pascal have dedicated shared memory.
//
bytes = sharedMemPerMultiprocessorHigh;
break;
default:
return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
}
*limit = bytes;
return CUDA_OCC_SUCCESS;
}
/**
* Shared memory based on config requested by User
*/
static __OCC_INLINE cudaOccError cudaOccSMemPerMultiprocessor(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
{
// Volta introduces a new API that allows for shared memory carveout preference. Because it is a shared memory preference,
// it is handled separately from the cache config preference.
if (properties->computeMajor >= 7) {
return cudaOccSMemPreferenceVoltaPlus(limit, properties, state);
}
return cudaOccSMemPreference(limit, properties, state);
}
/**
* Return the per block shared memory limit based on function config
*/
static __OCC_INLINE cudaOccError cudaOccSMemPerBlock(size_t *limit, const cudaOccDeviceProp *properties, cudaOccFuncShmemConfig shmemLimitConfig, size_t smemPerCta)
{
switch (properties->computeMajor) {
case 2:
case 3:
case 4:
case 5:
case 6:
*limit = properties->sharedMemPerBlock;
break;
case 7:
case 8:
case 9:
switch (shmemLimitConfig) {
default:
case FUNC_SHMEM_LIMIT_DEFAULT:
*limit = properties->sharedMemPerBlock;
break;
case FUNC_SHMEM_LIMIT_OPTIN:
if (smemPerCta > properties->sharedMemPerBlock) {
*limit = properties->sharedMemPerBlockOptin;
}
else {
*limit = properties->sharedMemPerBlock;
}
break;
}
break;
default:
return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
}
// Starting with Ampere, the CUDA driver reserves additional shared memory per block
if (properties->computeMajor >= 8) {
*limit += properties->reservedSharedMemPerBlock;
}
return CUDA_OCC_SUCCESS;
}
/**
* Partitioned global caching mode support
*/
static __OCC_INLINE cudaOccError cudaOccPartitionedGlobalCachingModeSupport(cudaOccPartitionedGCSupport *limit, const cudaOccDeviceProp *properties)
{
*limit = PARTITIONED_GC_NOT_SUPPORTED;
if ((properties->computeMajor == 5 && (properties->computeMinor == 2 || properties->computeMinor == 3)) ||
properties->computeMajor == 6) {
*limit = PARTITIONED_GC_SUPPORTED;
}
if (properties->computeMajor == 6 && properties->computeMinor == 0) {
*limit = PARTITIONED_GC_NOT_SUPPORTED;
}
return CUDA_OCC_SUCCESS;
}
///////////////////////////////////////////////
// User Input Sanity //
///////////////////////////////////////////////
static __OCC_INLINE cudaOccError cudaOccDevicePropCheck(const cudaOccDeviceProp *properties)
{
// Verify device properties
//
// Each of these limits must be a positive number.
//
// Compute capability is checked during the occupancy calculation
//
if (properties->maxThreadsPerBlock <= 0 ||
properties->maxThreadsPerMultiprocessor <= 0 ||
properties->regsPerBlock <= 0 ||
properties->regsPerMultiprocessor <= 0 ||
properties->warpSize <= 0 ||
properties->sharedMemPerBlock <= 0 ||
properties->sharedMemPerMultiprocessor <= 0 ||
properties->numSms <= 0) {
return CUDA_OCC_ERROR_INVALID_INPUT;
}
return CUDA_OCC_SUCCESS;
}
static __OCC_INLINE cudaOccError cudaOccFuncAttributesCheck(const cudaOccFuncAttributes *attributes)
{
// Verify function attributes
//
if (attributes->maxThreadsPerBlock <= 0 ||
attributes->numRegs < 0) { // Compiler may choose not to use
// any register (empty kernels,
// etc.)
return CUDA_OCC_ERROR_INVALID_INPUT;
}
return CUDA_OCC_SUCCESS;
}
static __OCC_INLINE cudaOccError cudaOccDeviceStateCheck(const cudaOccDeviceState *state)
{
(void)state; // silence unused-variable warning
// Placeholder
//
return CUDA_OCC_SUCCESS;
}
static __OCC_INLINE cudaOccError cudaOccInputCheck(
const cudaOccDeviceProp *properties,
const cudaOccFuncAttributes *attributes,
const cudaOccDeviceState *state)
{
cudaOccError status = CUDA_OCC_SUCCESS;
status = cudaOccDevicePropCheck(properties);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
status = cudaOccFuncAttributesCheck(attributes);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
status = cudaOccDeviceStateCheck(state);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
return status;
}
///////////////////////////////////////////////
// Occupancy calculation Functions //
///////////////////////////////////////////////
static __OCC_INLINE cudaOccPartitionedGCConfig cudaOccPartitionedGCExpected(
const cudaOccDeviceProp *properties,
const cudaOccFuncAttributes *attributes)
{
cudaOccPartitionedGCSupport gcSupport;
cudaOccPartitionedGCConfig gcConfig;
cudaOccPartitionedGlobalCachingModeSupport(&gcSupport, properties);
gcConfig = attributes->partitionedGCConfig;
if (gcSupport == PARTITIONED_GC_NOT_SUPPORTED) {
gcConfig = PARTITIONED_GC_OFF;
}
return gcConfig;
}
// Warp limit
//
static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMWarpsLimit(
int *limit,
cudaOccPartitionedGCConfig gcConfig,
const cudaOccDeviceProp *properties,
const cudaOccFuncAttributes *attributes,
int blockSize)
{
cudaOccError status = CUDA_OCC_SUCCESS;
int maxWarpsPerSm;
int warpsAllocatedPerCTA;
int maxBlocks;
(void)attributes; // silence unused-variable warning
if (blockSize > properties->maxThreadsPerBlock) {
maxBlocks = 0;
}
else {
maxWarpsPerSm = properties->maxThreadsPerMultiprocessor / properties->warpSize;
warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
maxBlocks = 0;
if (gcConfig != PARTITIONED_GC_OFF) {
int maxBlocksPerSmPartition;
int maxWarpsPerSmPartition;
// If partitioned global caching is on, then a CTA can only use an SM
// partition (a half SM), and thus a half of the warp slots
// available per SM
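//
// For example (illustrative numbers only): with 2048 threads/SM and a
// warp size of 32, maxWarpsPerSm is 64; a 96-thread CTA allocates 3
// warps, so each half-SM partition fits 32 / 3 = 10 CTAs, for a total
// of 20 (versus 64 / 3 = 21 with partitioned caching off).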
//
maxWarpsPerSmPartition = maxWarpsPerSm / 2;
maxBlocksPerSmPartition = maxWarpsPerSmPartition / warpsAllocatedPerCTA;
maxBlocks = maxBlocksPerSmPartition * 2;
}
// On hardware that supports partitioned global caching, each half SM is
// guaranteed to support at least 32 warps (maximum number of warps of a
// CTA), so caching will not cause 0 occupancy due to insufficient warp
// allocation slots.
//
else {
maxBlocks = maxWarpsPerSm / warpsAllocatedPerCTA;
}
}
*limit = maxBlocks;
return status;
}
// Shared memory limit
//
static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMSmemLimit(
int *limit,
cudaOccResult *result,
const cudaOccDeviceProp *properties,
const cudaOccFuncAttributes *attributes,
const cudaOccDeviceState *state,
int blockSize,
size_t dynamicSmemSize)
{
cudaOccError status = CUDA_OCC_SUCCESS;
int allocationGranularity;
size_t userSmemPreference = 0;
size_t totalSmemUsagePerCTA;
size_t maxSmemUsagePerCTA;
size_t smemAllocatedPerCTA;
size_t staticSmemSize;
size_t sharedMemPerMultiprocessor;
size_t smemLimitPerCTA;
int maxBlocks;
int dynamicSmemSizeExceeded = 0;
int totalSmemSizeExceeded = 0;
(void)blockSize; // silence unused-variable warning
status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
// Obtain the user preferred shared memory size. This setting is ignored if
// the user requests more shared memory than preferred.
//
status = cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
staticSmemSize = attributes->sharedSizeBytes + properties->reservedSharedMemPerBlock;
totalSmemUsagePerCTA = staticSmemSize + dynamicSmemSize;
smemAllocatedPerCTA = __occRoundUp((int)totalSmemUsagePerCTA, (int)allocationGranularity);
maxSmemUsagePerCTA = staticSmemSize + attributes->maxDynamicSharedSizeBytes;
dynamicSmemSizeExceeded = 0;
totalSmemSizeExceeded = 0;
// Obtain the user set maximum dynamic size if it exists
// If so, the current launch dynamic shared memory must not
// exceed the set limit
if (attributes->shmemLimitConfig != FUNC_SHMEM_LIMIT_DEFAULT &&
dynamicSmemSize > attributes->maxDynamicSharedSizeBytes) {
dynamicSmemSizeExceeded = 1;
}
status = cudaOccSMemPerBlock(&smemLimitPerCTA, properties, attributes->shmemLimitConfig, maxSmemUsagePerCTA);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
if (smemAllocatedPerCTA > smemLimitPerCTA) {
totalSmemSizeExceeded = 1;
}
if (dynamicSmemSizeExceeded || totalSmemSizeExceeded) {
maxBlocks = 0;
}
else {
// User requested shared memory limit is used as long as it is greater
// than the total shared memory used per CTA, i.e. as long as at least
// one CTA can be launched.
if (userSmemPreference >= smemAllocatedPerCTA) {
sharedMemPerMultiprocessor = userSmemPreference;
}
else {
// On Volta+, user requested shared memory will limit occupancy
// if it's less than shared memory per CTA. Otherwise, the
// maximum shared memory limit is used.
if (properties->computeMajor >= 7) {
sharedMemPerMultiprocessor = smemAllocatedPerCTA;
status = cudaOccAlignUpShmemSizeVoltaPlus(&sharedMemPerMultiprocessor, properties);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
}
else {
sharedMemPerMultiprocessor = properties->sharedMemPerMultiprocessor;
}
}
if (smemAllocatedPerCTA > 0) {
maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA);
}
else {
maxBlocks = INT_MAX;
}
}
result->allocatedSharedMemPerBlock = smemAllocatedPerCTA;
*limit = maxBlocks;
return status;
}
static __OCC_INLINE
cudaOccError cudaOccMaxBlocksPerSMRegsLimit(
int *limit,
cudaOccPartitionedGCConfig *gcConfig,
cudaOccResult *result,
const cudaOccDeviceProp *properties,
const cudaOccFuncAttributes *attributes,
int blockSize)
{
cudaOccError status = CUDA_OCC_SUCCESS;
int allocationGranularity;
int warpsAllocatedPerCTA;
int regsAllocatedPerCTA;
int regsAssumedPerCTA;
int regsPerWarp;
int regsAllocatedPerWarp;
int numSubPartitions;
int numRegsPerSubPartition;
int numWarpsPerSubPartition;
int numWarpsPerSM;
int maxBlocks;
int maxRegsPerThread;
status = cudaOccRegAllocationGranularity(
&allocationGranularity,
properties);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
status = cudaOccRegAllocationMaxPerThread(
&maxRegsPerThread,
properties);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
status = cudaOccSubPartitionsPerMultiprocessor(&numSubPartitions, properties);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
// GPUs of compute capability 2.x and higher allocate registers to warps
//
// Number of regs per warp is regs per thread x warp size, rounded up to
// register allocation granularity
//
regsPerWarp = attributes->numRegs * properties->warpSize;
regsAllocatedPerWarp = __occRoundUp(regsPerWarp, allocationGranularity);
regsAllocatedPerCTA = regsAllocatedPerWarp * warpsAllocatedPerCTA;
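// For example (illustrative numbers only): a kernel using 37 registers per
// thread with a warp size of 32 needs 37 * 32 = 1184 registers per warp,
// which rounds up to 1280 at an allocation granularity of 256; a 128-thread
// CTA (4 warps) then allocates 4 * 1280 = 5120 registers.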
// Hardware verifies if a launch fits the per-CTA register limit. For
// historical reasons, the verification logic assumes register
// allocations are made to all partitions simultaneously. Therefore, to
// simulate the hardware check, the warp allocation needs to be rounded
// up to the number of partitions.
//
regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions);
if (properties->regsPerBlock < regsAssumedPerCTA || // Hardware check
properties->regsPerBlock < regsAllocatedPerCTA || // Software check
attributes->numRegs > maxRegsPerThread) { // Per thread limit check
maxBlocks = 0;
}
else {
if (regsAllocatedPerWarp > 0) {
// Registers are allocated in each sub-partition. The max number
// of warps that can fit on an SM is equal to the max number of
// warps per sub-partition x number of sub-partitions.
//
numRegsPerSubPartition = properties->regsPerMultiprocessor / numSubPartitions;
numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp;
maxBlocks = 0;
if (*gcConfig != PARTITIONED_GC_OFF) {
int numSubPartitionsPerSmPartition;
int numWarpsPerSmPartition;
int maxBlocksPerSmPartition;
// If partitioned global caching is on, then a CTA can only
// use a half SM, and thus a half of the registers available
// per SM
//
numSubPartitionsPerSmPartition = numSubPartitions / 2;
numWarpsPerSmPartition = numWarpsPerSubPartition * numSubPartitionsPerSmPartition;
maxBlocksPerSmPartition = numWarpsPerSmPartition / warpsAllocatedPerCTA;
maxBlocks = maxBlocksPerSmPartition * 2;
}
// Try again if partitioned global caching is not enabled, or if
// the CTA cannot fit on the SM with caching on (maxBlocks == 0). In the latter
// case, the device will automatically turn off caching, except
// if the user forces enablement via PARTITIONED_GC_ON_STRICT to calculate
// occupancy and launch configuration.
//
if (maxBlocks == 0 && *gcConfig != PARTITIONED_GC_ON_STRICT) {
// In case *gcConfig was PARTITIONED_GC_ON flip it OFF since
// this is what it will be if we spread CTA across partitions.
//
*gcConfig = PARTITIONED_GC_OFF;
numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions;
maxBlocks = numWarpsPerSM / warpsAllocatedPerCTA;
}
}
else {
maxBlocks = INT_MAX;
}
}
result->allocatedRegistersPerBlock = regsAllocatedPerCTA;
*limit = maxBlocks;
return status;
}
// Barrier limit
//
static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMBlockBarrierLimit(
int *limit,
int ctaLimitBlocks,
const cudaOccFuncAttributes *attributes)
{
cudaOccError status = CUDA_OCC_SUCCESS;
int numBarriersAvailable = ctaLimitBlocks * 2;
int numBarriersUsed = attributes->numBlockBarriers;
int maxBlocks = INT_MAX;
if (numBarriersUsed) {
maxBlocks = numBarriersAvailable / numBarriersUsed;
}
*limit = maxBlocks;
return status;
}
///////////////////////////////////
// API Implementations //
///////////////////////////////////
static __OCC_INLINE
cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
cudaOccResult *result,
const cudaOccDeviceProp *properties,
const cudaOccFuncAttributes *attributes,
const cudaOccDeviceState *state,
int blockSize,
size_t dynamicSmemSize)
{
cudaOccError status = CUDA_OCC_SUCCESS;
int ctaLimitWarps = 0;
int ctaLimitBlocks = 0;
int ctaLimitSMem = 0;
int ctaLimitRegs = 0;
int ctaLimitBars = 0;
int ctaLimit = 0;
unsigned int limitingFactors = 0;
cudaOccPartitionedGCConfig gcConfig = PARTITIONED_GC_OFF;
if (!result || !properties || !attributes || !state || blockSize <= 0) {
return CUDA_OCC_ERROR_INVALID_INPUT;
}
///////////////////////////
// Check user input
///////////////////////////
status = cudaOccInputCheck(properties, attributes, state);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
///////////////////////////
// Initialization
///////////////////////////
gcConfig = cudaOccPartitionedGCExpected(properties, attributes);
///////////////////////////
// Compute occupancy
///////////////////////////
// Limits due to registers/SM
// Also compute if partitioned global caching has to be turned off
//
status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegs, &gcConfig, result, properties, attributes, blockSize);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
// SMs on GP100 (6.0) have 2 subpartitions, while those on GP10x have 4.
// As a result, an SM on GP100 may be able to run more CTAs than the one on GP10x.
// For forward compatibility within the Pascal family, if a function cannot run on GP10x (maxBlocks == 0),
// we do not let it run on any Pascal processor, even though it may be able to run on GP100.
// Therefore, we check the occupancy on GP10x when it can run on GP100
//
if (properties->computeMajor == 6 && properties->computeMinor == 0 && ctaLimitRegs) {
cudaOccDeviceProp propertiesGP10x;
cudaOccPartitionedGCConfig gcConfigGP10x = gcConfig;
int ctaLimitRegsGP10x = 0;
// Set up properties for GP10x
memcpy(&propertiesGP10x, properties, sizeof(propertiesGP10x));
propertiesGP10x.computeMinor = 1;
status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegsGP10x, &gcConfigGP10x, result, &propertiesGP10x, attributes, blockSize);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
if (ctaLimitRegsGP10x == 0) {
ctaLimitRegs = 0;
}
}
// Limits due to warps/SM
//
status = cudaOccMaxBlocksPerSMWarpsLimit(&ctaLimitWarps, gcConfig, properties, attributes, blockSize);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
// Limits due to blocks/SM
//
status = cudaOccMaxBlocksPerMultiprocessor(&ctaLimitBlocks, properties);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
// Limits due to shared memory/SM
//
status = cudaOccMaxBlocksPerSMSmemLimit(&ctaLimitSMem, result, properties, attributes, state, blockSize, dynamicSmemSize);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
///////////////////////////
// Overall occupancy
///////////////////////////
// Overall limit is min() of limits due to above reasons
//
ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));
// Determine occupancy limiting factors
//
if (ctaLimit == ctaLimitWarps) {
limitingFactors |= OCC_LIMIT_WARPS;
}
if (ctaLimit == ctaLimitRegs) {
limitingFactors |= OCC_LIMIT_REGISTERS;
}
if (ctaLimit == ctaLimitSMem) {
limitingFactors |= OCC_LIMIT_SHARED_MEMORY;
}
if (ctaLimit == ctaLimitBlocks) {
limitingFactors |= OCC_LIMIT_BLOCKS;
}
// For Hopper onwards compute the limits to occupancy based on block barrier count
//
if (properties->computeMajor >= 9 && attributes->numBlockBarriers > 0) {
// Limits due to barrier/SM
//
status = cudaOccMaxBlocksPerSMBlockBarrierLimit(&ctaLimitBars, ctaLimitBlocks, attributes);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
// Recompute overall limit based on barrier/SM
//
ctaLimit = __occMin(ctaLimitBars, ctaLimit);
// Determine if this is occupancy limiting factor
//
if (ctaLimit == ctaLimitBars) {
limitingFactors |= OCC_LIMIT_BARRIERS;
}
}
else {
ctaLimitBars = INT_MAX;
}
// Fill in the return values
//
result->limitingFactors = limitingFactors;
result->blockLimitRegs = ctaLimitRegs;
result->blockLimitSharedMem = ctaLimitSMem;
result->blockLimitWarps = ctaLimitWarps;
result->blockLimitBlocks = ctaLimitBlocks;
result->blockLimitBarriers = ctaLimitBars;
result->partitionedGCConfig = gcConfig;
// Final occupancy
result->activeBlocksPerMultiprocessor = ctaLimit;
return CUDA_OCC_SUCCESS;
}
static __OCC_INLINE
cudaOccError cudaOccAvailableDynamicSMemPerBlock(
size_t *bytesAvailable,
const cudaOccDeviceProp *properties,
const cudaOccFuncAttributes *attributes,
const cudaOccDeviceState *state,
int numBlocks,
int blockSize)
{
int allocationGranularity;
size_t smemLimitPerBlock;
size_t smemAvailableForDynamic;
size_t userSmemPreference = 0;
size_t sharedMemPerMultiprocessor;
cudaOccResult result;
cudaOccError status = CUDA_OCC_SUCCESS;
if (numBlocks <= 0)
return CUDA_OCC_ERROR_INVALID_INPUT;
// First compute occupancy of potential kernel launch.
//
status = cudaOccMaxActiveBlocksPerMultiprocessor(&result, properties, attributes, state, blockSize, 0);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
// Check if occupancy is achievable given user requested number of blocks.
//
if (result.activeBlocksPerMultiprocessor < numBlocks) {
return CUDA_OCC_ERROR_INVALID_INPUT;
}
status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
// Return the per block shared memory limit based on function config.
//
status = cudaOccSMemPerBlock(&smemLimitPerBlock, properties, attributes->shmemLimitConfig, properties->sharedMemPerMultiprocessor);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
// If only a single block is needed per SM, then the user preference can be ignored and the full SW
// limit is allowed to be used as shared memory. Otherwise, if more than one block is needed, the user
// preference sets the total limit of available shared memory.
//
cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
if (numBlocks == 1) {
sharedMemPerMultiprocessor = smemLimitPerBlock;
}
else {
if (!userSmemPreference) {
userSmemPreference = 1;
status = cudaOccAlignUpShmemSizeVoltaPlus(&userSmemPreference, properties);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
}
sharedMemPerMultiprocessor = userSmemPreference;
}
// Compute shared memory available per block
//
smemAvailableForDynamic = sharedMemPerMultiprocessor / numBlocks;
smemAvailableForDynamic = (smemAvailableForDynamic / allocationGranularity) * allocationGranularity;
// Cap shared memory
//
if (smemAvailableForDynamic > smemLimitPerBlock) {
smemAvailableForDynamic = smemLimitPerBlock;
}
// Now compute dynamic shared memory size
smemAvailableForDynamic = smemAvailableForDynamic - attributes->sharedSizeBytes;
// Cap computed dynamic SMem by the user-requested limit specified via cuFuncSetAttribute()
//
if (smemAvailableForDynamic > attributes->maxDynamicSharedSizeBytes)
smemAvailableForDynamic = attributes->maxDynamicSharedSizeBytes;
*bytesAvailable = smemAvailableForDynamic;
return CUDA_OCC_SUCCESS;
}
static __OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
int *minGridSize,
int *blockSize,
const cudaOccDeviceProp *properties,
const cudaOccFuncAttributes *attributes,
const cudaOccDeviceState *state,
size_t (*blockSizeToDynamicSMemSize)(int),
size_t dynamicSMemSize)
{
cudaOccError status = CUDA_OCC_SUCCESS;
cudaOccResult result;
// Limits
int occupancyLimit;
int granularity;
int blockSizeLimit;
// Recorded maximum
int maxBlockSize = 0;
int numBlocks = 0;
int maxOccupancy = 0;
// Temporary
int blockSizeToTryAligned;
int blockSizeToTry;
int blockSizeLimitAligned;
int occupancyInBlocks;
int occupancyInThreads;
///////////////////////////
// Check user input
///////////////////////////
if (!minGridSize || !blockSize || !properties || !attributes || !state) {
return CUDA_OCC_ERROR_INVALID_INPUT;
}
status = cudaOccInputCheck(properties, attributes, state);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
/////////////////////////////////////////////////////////////////////////////////
// Try each block size, and pick the block size with maximum occupancy
/////////////////////////////////////////////////////////////////////////////////
occupancyLimit = properties->maxThreadsPerMultiprocessor;
granularity = properties->warpSize;
blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);
for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);
// Ignore dynamicSMemSize if the user provides a mapping
//
if (blockSizeToDynamicSMemSize) {
dynamicSMemSize = (*blockSizeToDynamicSMemSize)(blockSizeToTry);
}
status = cudaOccMaxActiveBlocksPerMultiprocessor(
&result,
properties,
attributes,
state,
blockSizeToTry,
dynamicSMemSize);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
occupancyInBlocks = result.activeBlocksPerMultiprocessor;
occupancyInThreads = blockSizeToTry * occupancyInBlocks;
if (occupancyInThreads > maxOccupancy) {
maxBlockSize = blockSizeToTry;
numBlocks = occupancyInBlocks;
maxOccupancy = occupancyInThreads;
}
// Early out if we have reached the maximum
//
if (occupancyLimit == maxOccupancy) {
break;
}
}
///////////////////////////
// Return best available
///////////////////////////
// Suggested min grid size to achieve a full machine launch
//
*minGridSize = numBlocks * properties->numSms;
*blockSize = maxBlockSize;
return status;
}
#if defined(__cplusplus)
namespace {
__OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
int *minGridSize,
int *blockSize,
const cudaOccDeviceProp *properties,
const cudaOccFuncAttributes *attributes,
const cudaOccDeviceState *state,
size_t dynamicSMemSize)
{
return cudaOccMaxPotentialOccupancyBlockSize(
minGridSize,
blockSize,
properties,
attributes,
state,
NULL,
dynamicSMemSize);
}
template <typename UnaryFunction>
__OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
int *minGridSize,
int *blockSize,
const cudaOccDeviceProp *properties,
const cudaOccFuncAttributes *attributes,
const cudaOccDeviceState *state,
UnaryFunction blockSizeToDynamicSMemSize)
{
cudaOccError status = CUDA_OCC_SUCCESS;
cudaOccResult result;
// Limits
int occupancyLimit;
int granularity;
int blockSizeLimit;
// Recorded maximum
int maxBlockSize = 0;
int numBlocks = 0;
int maxOccupancy = 0;
// Temporary
int blockSizeToTryAligned;
int blockSizeToTry;
int blockSizeLimitAligned;
int occupancyInBlocks;
int occupancyInThreads;
size_t dynamicSMemSize;
///////////////////////////
// Check user input
///////////////////////////
if (!minGridSize || !blockSize || !properties || !attributes || !state) {
return CUDA_OCC_ERROR_INVALID_INPUT;
}
status = cudaOccInputCheck(properties, attributes, state);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
/////////////////////////////////////////////////////////////////////////////////
// Try each block size, and pick the block size with maximum occupancy
/////////////////////////////////////////////////////////////////////////////////
occupancyLimit = properties->maxThreadsPerMultiprocessor;
granularity = properties->warpSize;
blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);
for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);
dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);
status = cudaOccMaxActiveBlocksPerMultiprocessor(
&result,
properties,
attributes,
state,
blockSizeToTry,
dynamicSMemSize);
if (status != CUDA_OCC_SUCCESS) {
return status;
}
occupancyInBlocks = result.activeBlocksPerMultiprocessor;
occupancyInThreads = blockSizeToTry * occupancyInBlocks;
if (occupancyInThreads > maxOccupancy) {
maxBlockSize = blockSizeToTry;
numBlocks = occupancyInBlocks;
maxOccupancy = occupancyInThreads;
}
// Early out if we have reached the maximum
//
if (occupancyLimit == maxOccupancy) {
break;
}
}
///////////////////////////
// Return best available
///////////////////////////
// Suggested min grid size to achieve a full machine launch
//
*minGridSize = numBlocks * properties->numSms;
*blockSize = maxBlockSize;
return status;
}
} // namespace anonymous
#endif /*__cplusplus */
#undef __OCC_INLINE
#endif /*__cuda_occupancy_h__*/