/*
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef _COOPERATIVE_GROUPS_H_
#define _COOPERATIVE_GROUPS_H_
#if defined(__cplusplus) && defined(__CUDACC__)
#include "cooperative_groups/details/info.h"
#include "cooperative_groups/details/driver_abi.h"
#include "cooperative_groups/details/helpers.h"
#include "cooperative_groups/details/memory.h"
#if defined(_CG_HAS_STL_ATOMICS)
#include <cuda/atomic>
#define _CG_THREAD_SCOPE(scope) _CG_STATIC_CONST_DECL cuda::thread_scope thread_scope = scope;
#else
#define _CG_THREAD_SCOPE(scope)
#endif
_CG_BEGIN_NAMESPACE
namespace details {
_CG_CONST_DECL unsigned int coalesced_group_id = 1;
_CG_CONST_DECL unsigned int multi_grid_group_id = 2;
_CG_CONST_DECL unsigned int grid_group_id = 3;
_CG_CONST_DECL unsigned int thread_block_id = 4;
_CG_CONST_DECL unsigned int multi_tile_group_id = 5;
_CG_CONST_DECL unsigned int cluster_group_id = 6;
}
/**
* class thread_group;
*
* Generic thread group type, into which all groups are convertible.
* It acts as a container for all storage necessary for the derived groups,
* and will dispatch the API calls to the correct derived group. This means
* that all derived groups must implement the same interface as thread_group.
*/
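/*
* Example (a minimal illustrative sketch, not part of this header; the
* function name is hypothetical):
*
*   __device__ void wait_and_rank(const cooperative_groups::thread_group& g) {
*       g.sync();                               // dispatched to the concrete group type
*       unsigned long long r = g.thread_rank(); // rank within whichever group was passed
*   }
*
* Any of the concrete groups below (thread_block, coalesced_group, ...) can
* be passed where a thread_group is expected.
*/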
class thread_group
{
protected:
struct group_data {
unsigned int _unused : 1;
unsigned int type : 7, : 0;
};
struct gg_data {
details::grid_workspace *gridWs;
};
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
struct mg_data {
unsigned long long _unused : 1;
unsigned long long type : 7;
unsigned long long handle : 56;
const details::multi_grid::multi_grid_functions *functions;
};
#endif
struct tg_data {
unsigned int is_tiled : 1;
unsigned int type : 7;
unsigned int size : 24;
// packed to 4 bytes
unsigned int metaGroupSize : 16;
unsigned int metaGroupRank : 16;
// packed to 8 bytes
unsigned int mask;
// packed to 12 bytes
unsigned int _res;
};
friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
friend class thread_block;
union __align__(8) {
group_data group;
tg_data coalesced;
gg_data grid;
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
mg_data multi_grid;
#endif
} _data;
_CG_QUALIFIER thread_group operator=(const thread_group& src);
_CG_QUALIFIER thread_group(unsigned int type) {
_data.group.type = type;
_data.group._unused = false;
}
#ifdef _CG_CPP11_FEATURES
static_assert(sizeof(tg_data) <= 16, "Failed size check");
static_assert(sizeof(gg_data) <= 16, "Failed size check");
# ifdef _CG_ABI_EXPERIMENTAL
static_assert(sizeof(mg_data) <= 16, "Failed size check");
# endif
#endif
public:
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
_CG_QUALIFIER unsigned long long size() const;
_CG_QUALIFIER unsigned long long num_threads() const;
_CG_QUALIFIER unsigned long long thread_rank() const;
_CG_QUALIFIER void sync() const;
_CG_QUALIFIER unsigned int get_type() const {
return _data.group.type;
}
};
template <unsigned int TyId>
struct thread_group_base : public thread_group {
_CG_QUALIFIER thread_group_base() : thread_group(TyId) {}
_CG_STATIC_CONST_DECL unsigned int id = TyId;
};
#if defined(_CG_HAS_MULTI_GRID_GROUP)
/**
* class multi_grid_group;
*
* Threads within this group are guaranteed to be co-resident on the
* same system, across multiple devices participating in the same kernel launch.
* To use this group, the kernel must have been launched with
* cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent),
* and the device must support it (queryable device attribute).
*
* Constructed via this_multi_grid();
*/
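/*
* Example (illustrative sketch of the deprecated multi-device API; the
* kernel name is hypothetical):
*
*   __global__ void multi_device_kernel() {
*       cooperative_groups::multi_grid_group mg = cooperative_groups::this_multi_grid();
*       if (mg.is_valid()) {
*           unsigned long long r = mg.thread_rank(); // rank across all participating devices
*           mg.sync();                               // barrier across every grid in the launch
*       }
*   }
*/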
# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
class multi_grid_group;
// Multi grid group requires these functions to be templated to prevent ptxas from trying to use CG syscalls
template <typename = void>
__device__ _CG_DEPRECATED multi_grid_group this_multi_grid();
class multi_grid_group : public thread_group_base<details::multi_grid_group_id>
{
private:
template <typename = void>
_CG_QUALIFIER multi_grid_group() {
_data.multi_grid.functions = details::multi_grid::load_grid_intrinsics();
_data.multi_grid.handle = _data.multi_grid.functions->get_intrinsic_handle();
}
friend multi_grid_group this_multi_grid<void>();
public:
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
_CG_QUALIFIER bool is_valid() const {
return (_data.multi_grid.handle != 0);
}
_CG_QUALIFIER void sync() const {
if (!is_valid()) {
_CG_ABORT();
}
_data.multi_grid.functions->sync(_data.multi_grid.handle);
}
_CG_QUALIFIER unsigned long long num_threads() const {
_CG_ASSERT(is_valid());
return _data.multi_grid.functions->size(_data.multi_grid.handle);
}
_CG_QUALIFIER unsigned long long size() const {
return num_threads();
}
_CG_QUALIFIER unsigned long long thread_rank() const {
_CG_ASSERT(is_valid());
return _data.multi_grid.functions->thread_rank(_data.multi_grid.handle);
}
_CG_QUALIFIER unsigned int grid_rank() const {
_CG_ASSERT(is_valid());
return (_data.multi_grid.functions->grid_rank(_data.multi_grid.handle));
}
_CG_QUALIFIER unsigned int num_grids() const {
_CG_ASSERT(is_valid());
return (_data.multi_grid.functions->num_grids(_data.multi_grid.handle));
}
};
# else
class multi_grid_group
{
private:
unsigned long long _handle;
unsigned int _size;
unsigned int _rank;
friend _CG_QUALIFIER multi_grid_group this_multi_grid();
_CG_QUALIFIER multi_grid_group() {
_handle = details::multi_grid::get_intrinsic_handle();
_size = details::multi_grid::size(_handle);
_rank = details::multi_grid::thread_rank(_handle);
}
public:
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
_CG_QUALIFIER _CG_DEPRECATED bool is_valid() const {
return (_handle != 0);
}
_CG_QUALIFIER _CG_DEPRECATED void sync() const {
if (!is_valid()) {
_CG_ABORT();
}
details::multi_grid::sync(_handle);
}
_CG_QUALIFIER _CG_DEPRECATED unsigned long long num_threads() const {
_CG_ASSERT(is_valid());
return _size;
}
_CG_QUALIFIER _CG_DEPRECATED unsigned long long size() const {
return num_threads();
}
_CG_QUALIFIER _CG_DEPRECATED unsigned long long thread_rank() const {
_CG_ASSERT(is_valid());
return _rank;
}
_CG_QUALIFIER _CG_DEPRECATED unsigned int grid_rank() const {
_CG_ASSERT(is_valid());
return (details::multi_grid::grid_rank(_handle));
}
_CG_QUALIFIER _CG_DEPRECATED unsigned int num_grids() const {
_CG_ASSERT(is_valid());
return (details::multi_grid::num_grids(_handle));
}
};
# endif
/**
* multi_grid_group this_multi_grid()
*
* Constructs a multi_grid_group
*/
# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
template <typename>
__device__
# else
_CG_QUALIFIER
# endif
_CG_DEPRECATED
multi_grid_group this_multi_grid()
{
return multi_grid_group();
}
#endif
/**
* class grid_group;
*
* Threads within this group are guaranteed to be co-resident on the
* same device within the same launched kernel. To use this group, the kernel
* must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent),
* and the device must support it (queryable device attribute).
*
* Constructed via this_grid();
*/
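/*
* Example (illustrative sketch; assumes the kernel was launched with
* cudaLaunchCooperativeKernel, and the names below are hypothetical):
*
*   __global__ void two_phase(float* data, unsigned long long n) {
*       cooperative_groups::grid_group g = cooperative_groups::this_grid();
*       // phase 1: grid-strided loop over the input
*       for (unsigned long long i = g.thread_rank(); i < n; i += g.num_threads()) {
*           data[i] *= 2.0f;
*       }
*       g.sync(); // grid-wide barrier: every block sees the phase 1 results
*       // phase 2 can now safely read data written by other blocks ...
*   }
*/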
class grid_group : public thread_group_base<details::grid_group_id>
{
_CG_STATIC_CONST_DECL unsigned int _group_id = details::grid_group_id;
friend _CG_QUALIFIER grid_group this_grid();
private:
_CG_QUALIFIER grid_group(details::grid_workspace *gridWs) {
_data.grid.gridWs = gridWs;
}
public:
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
_CG_QUALIFIER bool is_valid() const {
return (_data.grid.gridWs != NULL);
}
_CG_QUALIFIER void sync() const {
if (!is_valid()) {
_CG_ABORT();
}
details::grid::sync(&_data.grid.gridWs->barrier);
}
_CG_STATIC_QUALIFIER unsigned long long size() {
return details::grid::size();
}
_CG_STATIC_QUALIFIER unsigned long long thread_rank() {
return details::grid::thread_rank();
}
_CG_STATIC_QUALIFIER dim3 group_dim() {
return details::grid::grid_dim();
}
_CG_STATIC_QUALIFIER unsigned long long num_threads() {
return details::grid::num_threads();
}
_CG_STATIC_QUALIFIER dim3 dim_blocks() {
return details::grid::dim_blocks();
}
_CG_STATIC_QUALIFIER unsigned long long num_blocks() {
return details::grid::num_blocks();
}
_CG_STATIC_QUALIFIER dim3 block_index() {
return details::grid::block_index();
}
_CG_STATIC_QUALIFIER unsigned long long block_rank() {
return details::grid::block_rank();
}
# if defined(_CG_HAS_CLUSTER_GROUP)
_CG_STATIC_QUALIFIER dim3 dim_clusters() {
return details::grid::dim_clusters();
}
_CG_STATIC_QUALIFIER unsigned long long num_clusters() {
return details::grid::num_clusters();
}
_CG_STATIC_QUALIFIER dim3 cluster_index() {
return details::grid::cluster_index();
}
_CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
return details::grid::cluster_rank();
}
# endif
};
_CG_QUALIFIER grid_group this_grid() {
// Load a workspace from the driver
grid_group gg(details::get_grid_workspace());
#ifdef _CG_DEBUG
// *all* threads must be available to synchronize
gg.sync();
#endif // _CG_DEBUG
return gg;
}
#if defined(_CG_HAS_CLUSTER_GROUP)
/**
* class cluster_group
*
* Every GPU kernel is executed by a grid of thread blocks. A grid can be evenly
* divided along all dimensions to form groups of blocks, each group of which is
* a block cluster. Clustered grids are subject to various restrictions and
* limitations. Primarily, a cluster consists of at most 8 blocks by default
* (although the user is allowed to opt in to non-standard sizes), and clustered
* grids are subject to additional occupancy limitations due to per-cluster
* hardware resource consumption. In exchange, a block cluster is guaranteed to
* be a cooperative group, with access to all cooperative group capabilities, as
* well as cluster-specific capabilities and accelerations. A cluster_group
* represents a block cluster.
*
* Constructed via this_cluster_group();
*/
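/*
* Example (illustrative sketch; requires a clustered launch on hardware with
* cluster support, and the names below are hypothetical):
*
*   __global__ void cluster_kernel() {
*       __shared__ int token;
*       namespace cg = cooperative_groups;
*       cg::cluster_group cluster = cg::this_cluster();
*       if (threadIdx.x == 0) token = (int)cluster.block_rank();
*       cluster.sync();                                   // cluster-wide barrier
*       int* remote = cluster.map_shared_rank(&token, 0); // shared memory of block 0
*       int value_from_block0 = remote[0];
*   }
*/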
class cluster_group : public thread_group_base<details::cluster_group_id>
{
// Friends
friend _CG_QUALIFIER cluster_group this_cluster();
// Private constructor; construct via this_cluster()
_CG_QUALIFIER cluster_group()
{
}
public:
//_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_cluster)
using arrival_token = struct {};
// Functionality exposed by the group
_CG_STATIC_QUALIFIER void sync()
{
return details::cluster::sync();
}
_CG_STATIC_QUALIFIER arrival_token barrier_arrive()
{
details::cluster::barrier_arrive();
return arrival_token();
}
_CG_STATIC_QUALIFIER void barrier_wait()
{
return details::cluster::barrier_wait();
}
_CG_STATIC_QUALIFIER void barrier_wait(arrival_token&&)
{
return details::cluster::barrier_wait();
}
_CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
{
return details::cluster::query_shared_rank(addr);
}
template <typename T>
_CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
{
return details::cluster::map_shared_rank(addr, rank);
}
_CG_STATIC_QUALIFIER dim3 block_index()
{
return details::cluster::block_index();
}
_CG_STATIC_QUALIFIER unsigned int block_rank()
{
return details::cluster::block_rank();
}
_CG_STATIC_QUALIFIER unsigned int thread_rank()
{
return details::cluster::thread_rank();
}
_CG_STATIC_QUALIFIER dim3 dim_blocks()
{
return details::cluster::dim_blocks();
}
_CG_STATIC_QUALIFIER unsigned int num_blocks()
{
return details::cluster::num_blocks();
}
_CG_STATIC_QUALIFIER dim3 dim_threads()
{
return details::cluster::dim_threads();
}
_CG_STATIC_QUALIFIER unsigned int num_threads()
{
return details::cluster::num_threads();
}
// Legacy aliases
_CG_STATIC_QUALIFIER unsigned int size()
{
return num_threads();
}
};
/*
* cluster_group this_cluster()
*
* Constructs a cluster_group
*/
_CG_QUALIFIER cluster_group this_cluster()
{
cluster_group cg;
#ifdef _CG_DEBUG
cg.sync();
#endif
return cg;
}
#endif
#if defined(_CG_CPP11_FEATURES)
class thread_block;
template <unsigned int MaxBlockSize>
_CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
#endif
/**
* class thread_block
*
* Every GPU kernel is executed by a grid of thread blocks, and threads within
* each block are guaranteed to reside on the same streaming multiprocessor.
* A thread_block represents a thread block whose dimensions are not known until runtime.
*
* Constructed via this_thread_block();
*/
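/*
* Example (illustrative sketch; the kernel name is hypothetical):
*
*   __global__ void block_kernel() {
*       cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
*       unsigned int tid = block.thread_rank(); // linearized rank in [0, num_threads())
*       block.sync();                           // equivalent to __syncthreads()
*   }
*/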
class thread_block : public thread_group_base<details::thread_block_id>
{
// Friends
friend _CG_QUALIFIER thread_block this_thread_block();
friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);
#if defined(_CG_CPP11_FEATURES)
template <unsigned int MaxBlockSize>
friend _CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
template <unsigned int Size>
friend class __static_size_multi_warp_tile_base;
details::multi_warp_scratch* const tile_memory;
template <unsigned int MaxBlockSize>
_CG_QUALIFIER thread_block(block_tile_memory<MaxBlockSize>& scratch) :
tile_memory(details::get_scratch_ptr(&scratch)) {
#ifdef _CG_DEBUG
if (num_threads() > MaxBlockSize) {
details::abort();
}
#endif
#if !defined(_CG_HAS_RESERVED_SHARED)
tile_memory->init_barriers(thread_rank());
sync();
#endif
}
#endif
// Private constructor; construct via this_thread_block()
_CG_QUALIFIER thread_block()
#if defined(_CG_CPP11_FEATURES)
: tile_memory(details::get_scratch_ptr(NULL))
#endif
{ }
// Internal Use
_CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const {
const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
// Invalid, immediately fail
if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
details::abort();
return (thread_block());
}
unsigned int mask;
unsigned int base_offset = thread_rank() & (~(tilesz - 1));
unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
mask = (unsigned int)(-1) >> (32 - masklength);
mask <<= (details::laneid() & ~(tilesz - 1));
thread_group tile = thread_group(details::coalesced_group_id);
tile._data.coalesced.mask = mask;
tile._data.coalesced.size = __popc(mask);
tile._data.coalesced.metaGroupSize = (details::cta::size() + tilesz - 1) / tilesz;
tile._data.coalesced.metaGroupRank = details::cta::thread_rank() / tilesz;
tile._data.coalesced.is_tiled = true;
return (tile);
}
public:
_CG_STATIC_CONST_DECL unsigned int _group_id = details::thread_block_id;
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
_CG_STATIC_QUALIFIER void sync() {
details::cta::sync();
}
_CG_STATIC_QUALIFIER unsigned int size() {
return details::cta::size();
}
_CG_STATIC_QUALIFIER unsigned int thread_rank() {
return details::cta::thread_rank();
}
// Additional functionality exposed by the group
_CG_STATIC_QUALIFIER dim3 group_index() {
return details::cta::group_index();
}
_CG_STATIC_QUALIFIER dim3 thread_index() {
return details::cta::thread_index();
}
_CG_STATIC_QUALIFIER dim3 group_dim() {
return details::cta::block_dim();
}
_CG_STATIC_QUALIFIER dim3 dim_threads() {
return details::cta::dim_threads();
}
_CG_STATIC_QUALIFIER unsigned int num_threads() {
return details::cta::num_threads();
}
};
/**
* thread_block this_thread_block()
*
* Constructs a thread_block group
*/
_CG_QUALIFIER thread_block this_thread_block()
{
return (thread_block());
}
#if defined(_CG_CPP11_FEATURES)
template <unsigned int MaxBlockSize>
_CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch) {
return (thread_block(scratch));
}
#endif
/**
* class coalesced_group
*
* A group representing the current set of converged threads in a warp.
* The size of the group is not guaranteed; it may be as small as
* a single thread (the caller).
*
* This group exposes warp-synchronous builtins.
* Constructed via coalesced_threads();
*/
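/*
* Example (illustrative sketch of aggregated atomics; the function name is
* hypothetical):
*
*   __device__ int atomic_agg_inc(int* counter) {
*       cooperative_groups::coalesced_group active = cooperative_groups::coalesced_threads();
*       int prev = 0;
*       if (active.thread_rank() == 0) {
*           // one atomic for the whole converged group instead of one per thread
*           prev = atomicAdd(counter, (int)active.num_threads());
*       }
*       // broadcast the base offset and add each thread's rank within the group
*       prev = active.shfl(prev, 0) + (int)active.thread_rank();
*       return prev;
*   }
*/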
class coalesced_group : public thread_group_base<details::coalesced_group_id>
{
private:
friend _CG_QUALIFIER coalesced_group coalesced_threads();
friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz);
friend class details::_coalesced_group_data_access;
_CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const {
unsigned int member_pack = 0;
unsigned int member_rank = 0;
for (int bit_idx = 0; bit_idx < 32; bit_idx++) {
unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
if (lane_bit) {
if (laneMask & lane_bit)
member_pack |= 1 << member_rank;
member_rank++;
}
}
return (member_pack);
}
// Internal Use
_CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const {
const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
// Invalid, immediately fail
if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
details::abort();
return (coalesced_group(0));
}
if (size() <= tilesz) {
return (*this);
}
if ((_data.coalesced.is_tiled == true) && pow2_tilesz) {
unsigned int base_offset = (thread_rank() & (~(tilesz - 1)));
unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
unsigned int mask = (unsigned int)(-1) >> (32 - masklength);
mask <<= (details::laneid() & ~(tilesz - 1));
coalesced_group coalesced_tile = coalesced_group(mask);
coalesced_tile._data.coalesced.metaGroupSize = size() / tilesz;
coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
coalesced_tile._data.coalesced.is_tiled = true;
return (coalesced_tile);
}
else if ((_data.coalesced.is_tiled == false) && pow2_tilesz) {
unsigned int mask = 0;
unsigned int member_rank = 0;
int seen_lanes = (thread_rank() / tilesz) * tilesz;
for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) {
unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
if (lane_bit) {
if (seen_lanes <= 0 && member_rank < tilesz) {
mask |= lane_bit;
member_rank++;
}
seen_lanes--;
}
}
coalesced_group coalesced_tile = coalesced_group(mask);
// Override parent with the size of this group
coalesced_tile._data.coalesced.metaGroupSize = (size() + tilesz - 1) / tilesz;
coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
return coalesced_tile;
}
else {
// No other tiling configurations are supported in _CG_VERSION 1000
details::abort();
}
return (coalesced_group(0));
}
protected:
_CG_QUALIFIER coalesced_group(unsigned int mask) {
_data.coalesced.mask = mask;
_data.coalesced.size = __popc(mask);
_data.coalesced.metaGroupRank = 0;
_data.coalesced.metaGroupSize = 1;
_data.coalesced.is_tiled = false;
}
_CG_QUALIFIER unsigned int get_mask() const {
return (_data.coalesced.mask);
}
public:
_CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
_CG_QUALIFIER unsigned int num_threads() const {
return _data.coalesced.size;
}
_CG_QUALIFIER unsigned int size() const {
return num_threads();
}
_CG_QUALIFIER unsigned int thread_rank() const {
return (__popc(_data.coalesced.mask & details::lanemask32_lt()));
}
// Rank of this group in the upper level of the hierarchy
_CG_QUALIFIER unsigned int meta_group_rank() const {
return _data.coalesced.metaGroupRank;
}
// Total number of partitions created when the parent group was partitioned
_CG_QUALIFIER unsigned int meta_group_size() const {
return _data.coalesced.metaGroupSize;
}
_CG_QUALIFIER void sync() const {
__syncwarp(_data.coalesced.mask);
}
#ifdef _CG_CPP11_FEATURES
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
_CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
unsigned int lane = (srcRank == 0) ? __ffs(_data.coalesced.mask) - 1 :
(size() == 32) ? srcRank : __fns(_data.coalesced.mask, 0, (srcRank + 1));
return details::tile::shuffle_dispatch<TyElem>::shfl(
_CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
}
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
_CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
if (size() == 32) {
return details::tile::shuffle_dispatch<TyElem>::shfl_down(
_CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
}
unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
if (lane >= 32)
lane = details::laneid();
return details::tile::shuffle_dispatch<TyElem>::shfl(
_CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
}
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
_CG_QUALIFIER TyRet shfl_up(TyElem&& elem, int delta) const {
if (size() == 32) {
return details::tile::shuffle_dispatch<TyElem>::shfl_up(
_CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
}
unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
if (lane >= 32)
lane = details::laneid();
return details::tile::shuffle_dispatch<TyElem>::shfl(
_CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
}
#else
template <typename TyIntegral>
_CG_QUALIFIER TyIntegral shfl(TyIntegral var, unsigned int src_rank) const {
details::assert_if_not_arithmetic<TyIntegral>();
unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 :
(size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1));
return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
}
template <typename TyIntegral>
_CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, int delta) const {
details::assert_if_not_arithmetic<TyIntegral>();
if (size() == 32) {
return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32));
}
unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
if (lane >= 32) lane = details::laneid();
return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
}
template <typename TyIntegral>
_CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, int delta) const {
details::assert_if_not_arithmetic<TyIntegral>();
if (size() == 32) {
return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32));
}
unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
if (lane >= 32) lane = details::laneid();
return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
}
#endif
_CG_QUALIFIER int any(int predicate) const {
return (__ballot_sync(_data.coalesced.mask, predicate) != 0);
}
_CG_QUALIFIER int all(int predicate) const {
return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask);
}
_CG_QUALIFIER unsigned int ballot(int predicate) const {
if (size() == 32) {
return (__ballot_sync(0xFFFFFFFF, predicate));
}
unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate);
return (_packLanes(lane_ballot));
}
#ifdef _CG_HAS_MATCH_COLLECTIVE
template <typename TyIntegral>
_CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
details::assert_if_not_arithmetic<TyIntegral>();
if (size() == 32) {
return (__match_any_sync(0xFFFFFFFF, val));
}
unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val);
return (_packLanes(lane_match));
}
template <typename TyIntegral>
_CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
details::assert_if_not_arithmetic<TyIntegral>();
if (size() == 32) {
return (__match_all_sync(0xFFFFFFFF, val, &pred));
}
unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred);
return (_packLanes(lane_match));
}
#endif /* _CG_HAS_MATCH_COLLECTIVE */
};
_CG_QUALIFIER coalesced_group coalesced_threads()
{
return (coalesced_group(__activemask()));
}
namespace details {
template <unsigned int Size> struct verify_thread_block_tile_size;
template <> struct verify_thread_block_tile_size<32> { typedef void OK; };
template <> struct verify_thread_block_tile_size<16> { typedef void OK; };
template <> struct verify_thread_block_tile_size<8> { typedef void OK; };
template <> struct verify_thread_block_tile_size<4> { typedef void OK; };
template <> struct verify_thread_block_tile_size<2> { typedef void OK; };
template <> struct verify_thread_block_tile_size<1> { typedef void OK; };
#ifdef _CG_CPP11_FEATURES
template <unsigned int Size>
using _is_power_of_2 = _CG_STL_NAMESPACE::integral_constant<bool, (Size & (Size - 1)) == 0>;
template <unsigned int Size>
using _is_single_warp = _CG_STL_NAMESPACE::integral_constant<bool, Size <= 32>;
template <unsigned int Size>
using _is_multi_warp =
_CG_STL_NAMESPACE::integral_constant<bool, (Size > 32) && (Size <= 1024)>;
template <unsigned int Size>
using _is_valid_single_warp_tile =
_CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_single_warp<Size>::value>;
template <unsigned int Size>
using _is_valid_multi_warp_tile =
_CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_multi_warp<Size>::value>;
#else
template <unsigned int Size>
struct _is_multi_warp {
static const bool value = false;
};
#endif
}
template <unsigned int Size>
class __static_size_tile_base
{
protected:
_CG_STATIC_CONST_DECL unsigned int numThreads = Size;
public:
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
// Rank of thread within tile
_CG_STATIC_QUALIFIER unsigned int thread_rank() {
return (details::cta::thread_rank() & (numThreads - 1));
}
// Number of threads within tile
_CG_STATIC_CONSTEXPR_QUALIFIER unsigned int num_threads() {
return numThreads;
}
_CG_STATIC_CONSTEXPR_QUALIFIER unsigned int size() {
return num_threads();
}
};
template <unsigned int Size>
class __static_size_thread_block_tile_base : public __static_size_tile_base<Size>
{
friend class details::_coalesced_group_data_access;
typedef details::tile::tile_helpers<Size> th;
#ifdef _CG_CPP11_FEATURES
static_assert(details::_is_valid_single_warp_tile<Size>::value, "Size must be one of 1/2/4/8/16/32");
#else
typedef typename details::verify_thread_block_tile_size<Size>::OK valid;
#endif
using __static_size_tile_base<Size>::numThreads;
_CG_STATIC_CONST_DECL unsigned int fullMask = 0xFFFFFFFF;
protected:
_CG_STATIC_QUALIFIER unsigned int build_mask() {
unsigned int mask = fullMask;
if (numThreads != 32) {
// laneId in [0, 31] identifies the calling thread within its warp
unsigned int laneId = details::laneid();
// shift the tile mask into the partition this lane belongs to
mask = th::tileMask << (laneId & ~(th::laneMask));
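// Worked example (assuming th::tileMask == (1u << Size) - 1 and
// th::laneMask == Size - 1): for Size == 8 and laneId == 13,
// laneId & ~laneMask == 8, so mask == 0xFF << 8 == 0x0000FF00.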
}
return (mask);
}
public:
_CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
_CG_STATIC_QUALIFIER void sync() {
__syncwarp(build_mask());
}
#ifdef _CG_CPP11_FEATURES
// PTX-supported collectives
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
_CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
return details::tile::shuffle_dispatch<TyElem>::shfl(
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), srcRank, numThreads);
}
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
_CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
return details::tile::shuffle_dispatch<TyElem>::shfl_down(
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
}
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
_CG_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int delta) const {
return details::tile::shuffle_dispatch<TyElem>::shfl_up(
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
}
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
_CG_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int laneMask) const {
return details::tile::shuffle_dispatch<TyElem>::shfl_xor(
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), laneMask, numThreads);
}
#else
template <typename TyIntegral>
_CG_QUALIFIER TyIntegral shfl(TyIntegral var, int srcRank) const {
details::assert_if_not_arithmetic<TyIntegral>();
return (__shfl_sync(build_mask(), var, srcRank, numThreads));
}
template <typename TyIntegral>
_CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, unsigned int delta) const {
details::assert_if_not_arithmetic<TyIntegral>();
return (__shfl_down_sync(build_mask(), var, delta, numThreads));
}
template <typename TyIntegral>
_CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, unsigned int delta) const {
details::assert_if_not_arithmetic<TyIntegral>();
return (__shfl_up_sync(build_mask(), var, delta, numThreads));
}
template <typename TyIntegral>
_CG_QUALIFIER TyIntegral shfl_xor(TyIntegral var, unsigned int laneMask) const {
details::assert_if_not_arithmetic<TyIntegral>();
return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
}
#endif //_CG_CPP11_FEATURES
_CG_QUALIFIER int any(int predicate) const {
unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
return (lane_ballot != 0);
}
_CG_QUALIFIER int all(int predicate) const {
unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
return (lane_ballot == build_mask());
}
_CG_QUALIFIER unsigned int ballot(int predicate) const {
unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
return (lane_ballot >> (details::laneid() & (~(th::laneMask))));
}
#ifdef _CG_HAS_MATCH_COLLECTIVE
template <typename TyIntegral>
_CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
details::assert_if_not_arithmetic<TyIntegral>();
unsigned int lane_match = __match_any_sync(build_mask(), val);
return (lane_match >> (details::laneid() & (~(th::laneMask))));
}
template <typename TyIntegral>
_CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
details::assert_if_not_arithmetic<TyIntegral>();
unsigned int lane_match = __match_all_sync(build_mask(), val, &pred);
return (lane_match >> (details::laneid() & (~(th::laneMask))));
}
#endif
};
template <unsigned int Size, typename ParentT>
class __static_parent_thread_block_tile_base
{
public:
// Rank of this group in the upper level of the hierarchy
_CG_STATIC_QUALIFIER unsigned int meta_group_rank() {
return ParentT::thread_rank() / Size;
}
// Total number of partitions created when the parent group was partitioned
_CG_STATIC_QUALIFIER unsigned int meta_group_size() {
return (ParentT::size() + Size - 1) / Size;
}
};
/**
* class thread_block_tile<unsigned int Size, ParentT = void>
*
* Statically-sized group type, representing one tile of a thread block.
* The only specializations currently supported are those with native
* hardware support (1/2/4/8/16/32)
*
* This group exposes warp-synchronous builtins.
* Can only be constructed via tiled_partition<Size>(ParentT&)
*/
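/*
* Example (illustrative sketch; the kernel name is hypothetical):
*
*   __global__ void tile_kernel() {
*       namespace cg = cooperative_groups;
*       cg::thread_block block = cg::this_thread_block();
*       cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
*       int lane = warp.thread_rank();        // rank within the 32-thread tile
*       int from_leader = warp.shfl(lane, 0); // broadcast from tile rank 0
*   }
*/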
template <unsigned int Size, typename ParentT = void>
class __single_warp_thread_block_tile :
public __static_size_thread_block_tile_base<Size>,
public __static_parent_thread_block_tile_base<Size, ParentT>
{
typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
friend class details::_coalesced_group_data_access;
protected:
_CG_QUALIFIER __single_warp_thread_block_tile() { };
_CG_QUALIFIER __single_warp_thread_block_tile(unsigned int, unsigned int) { };
_CG_STATIC_QUALIFIER unsigned int get_mask() {
return __static_size_thread_block_tile_base<Size>::build_mask();
}
};
template <unsigned int Size>
class __single_warp_thread_block_tile<Size, void> :
public __static_size_thread_block_tile_base<Size>,
public thread_group_base<details::coalesced_group_id>
{
_CG_STATIC_CONST_DECL unsigned int numThreads = Size;
template <unsigned int, typename ParentT> friend class __single_warp_thread_block_tile;
friend class details::_coalesced_group_data_access;
typedef __static_size_thread_block_tile_base<numThreads> staticSizeBaseT;
protected:
_CG_QUALIFIER __single_warp_thread_block_tile(unsigned int meta_group_rank, unsigned int meta_group_size) {
_data.coalesced.mask = staticSizeBaseT::build_mask();
_data.coalesced.size = numThreads;
_data.coalesced.metaGroupRank = meta_group_rank;
_data.coalesced.metaGroupSize = meta_group_size;
_data.coalesced.is_tiled = true;
}
_CG_QUALIFIER unsigned int get_mask() const {
return (_data.coalesced.mask);
}
public:
using staticSizeBaseT::sync;
using staticSizeBaseT::size;
using staticSizeBaseT::num_threads;
using staticSizeBaseT::thread_rank;
_CG_QUALIFIER unsigned int meta_group_rank() const {
return _data.coalesced.metaGroupRank;
}
_CG_QUALIFIER unsigned int meta_group_size() const {
return _data.coalesced.metaGroupSize;
}
};
/**
* Outer level API calls
* void sync(GroupT) - see <group_type>.sync()
* thread_rank(GroupT) - see <group_type>.thread_rank()
* group_size(GroupT) - see <group_type>.size()
*/
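/*
* Example (illustrative sketch; the function name is hypothetical):
*
*   __device__ void free_function_forms(const cooperative_groups::thread_block& block) {
*       cooperative_groups::sync(block);                               // same as block.sync()
*       unsigned long long r = cooperative_groups::thread_rank(block); // same as block.thread_rank()
*   }
*/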
template <class GroupT>
_CG_QUALIFIER void sync(GroupT const &g)
{
g.sync();
}
// TODO: Use a static dispatch to determine appropriate return type
// C++03 is stuck with unsigned long long for now
#ifdef _CG_CPP11_FEATURES
template <class GroupT>
_CG_QUALIFIER auto thread_rank(GroupT const& g) -> decltype(g.thread_rank()) {
return g.thread_rank();
}
template <class GroupT>
_CG_QUALIFIER auto group_size(GroupT const &g) -> decltype(g.num_threads()) {
return g.num_threads();
}
#else
template <class GroupT>
_CG_QUALIFIER unsigned long long thread_rank(GroupT const& g) {
return static_cast<unsigned long long>(g.thread_rank());
}
template <class GroupT>
_CG_QUALIFIER unsigned long long group_size(GroupT const &g) {
return static_cast<unsigned long long>(g.num_threads());
}
#endif
/**
* tiled_partition
*
* The tiled_partition(parent, tilesz) method is a collective operation that
* partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
*
* A total of ((size(parent)+tilesz-1)/tilesz) subgroups will
* be created where threads having identical k = (thread_rank(parent)/tilesz)
* will be members of the same subgroup.
*
* The implementation may cause the calling thread to wait until all the members
* of the parent group have invoked the operation before resuming execution.
*
* Functionality is limited to power-of-two sized subgroup instances of at most
* 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can
* be passed to tiled_partition() in _CG_VERSION 1000.
*/
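/*
* Example (illustrative sketch; the kernel name is hypothetical):
*
*   __global__ void dynamic_tiles() {
*       namespace cg = cooperative_groups;
*       cg::thread_block block = cg::this_thread_block();
*       cg::thread_group tile8 = cg::tiled_partition(block, 8); // tilesz: power of 2, <= 32
*       tile8.sync();
*       unsigned long long r = tile8.thread_rank(); // rank within the 8-thread tile
*   }
*/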
_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz)
{
if (parent.get_type() == details::coalesced_group_id) {
const coalesced_group *_cg = static_cast<const coalesced_group*>(&parent);
return _cg->_get_tiled_threads(tilesz);
}
else {
const thread_block *_tb = static_cast<const thread_block*>(&parent);
return _tb->_get_tiled_threads(tilesz);
}
}
// Thread block type overload: returns a basic thread_group for now (may be specialized later)
_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz)
{
return (parent._get_tiled_threads(tilesz));
}
// Coalesced group type overload: retains its ability to stay coalesced
_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz)
{
return (parent._get_tiled_threads(tilesz));
}
namespace details {
template <unsigned int Size, typename ParentT>
class internal_thread_block_tile : public __single_warp_thread_block_tile<Size, ParentT> {};
template <unsigned int Size, typename ParentT>
_CG_QUALIFIER internal_thread_block_tile<Size, ParentT> tiled_partition_internal() {
return internal_thread_block_tile<Size, ParentT>();
}
template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
_CG_QUALIFIER TyVal multi_warp_collectives_helper(
const GroupT& group,
WarpLambda warp_lambda,
InterWarpLambda inter_warp_lambda) {
return group.template collectives_scheme<TyVal>(warp_lambda, inter_warp_lambda);
}
template <typename T, typename GroupT>
_CG_QUALIFIER T* multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id) {
return group.template get_scratch_location<T>(warp_id);
}
template <typename GroupT>
_CG_QUALIFIER details::barrier_t* multi_warp_sync_location_getter(const GroupT& group) {
return group.get_sync_location();
}
}
/**
* tiled_partition<tilesz>
*
* The tiled_partition<tilesz>(parent) method is a collective operation that
* partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
*
* A total of (size(parent)/tilesz) subgroups will be created;
* therefore, the parent group size must be evenly divisible by tilesz.
* The allowed parent groups are thread_block and thread_block_tile<Size>.
*
* The implementation may cause the calling thread to wait until all the members
* of the parent group have invoked the operation before resuming execution.
*
* Functionality is limited to native hardware sizes, 1/2/4/8/16/32.
* The size(parent) must be greater than the template Size parameter;
* otherwise the results are undefined.
*/
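/*
* Example (illustrative sketch; the kernel name is hypothetical):
*
*   __global__ void static_tiles() {
*       namespace cg = cooperative_groups;
*       cg::thread_block block = cg::this_thread_block();
*       cg::thread_block_tile<16> tile = cg::tiled_partition<16>(block);
*       unsigned int which = tile.meta_group_rank(); // which 16-thread tile this is
*       unsigned int count = tile.meta_group_size(); // how many tiles were created
*   }
*/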
#if defined(_CG_CPP11_FEATURES)
template <unsigned int Size>
class __static_size_multi_warp_tile_base : public __static_size_tile_base<Size>
{
static_assert(details::_is_valid_multi_warp_tile<Size>::value, "Size must be a power of 2 between 64 and 1024");
template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
friend __device__ TyVal details::multi_warp_collectives_helper(
const GroupT& group,
WarpLambda warp_lambda,
InterWarpLambda inter_warp_lambda);
template <typename T, typename GroupT>
friend __device__ T* details::multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id);
template <typename GroupT>
friend __device__ details::barrier_t* details::multi_warp_sync_location_getter(const GroupT& group);
template <unsigned int OtherSize>
friend class __static_size_multi_warp_tile_base;
using WarpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
using ThisType = __static_size_multi_warp_tile_base<Size>;
_CG_STATIC_CONST_DECL int numWarps = Size / 32;
protected:
details::multi_warp_scratch* const tile_memory;
template <typename GroupT>
_CG_QUALIFIER __static_size_multi_warp_tile_base(const GroupT& g) : tile_memory(g.tile_memory) {
#if defined(_CG_HAS_RESERVED_SHARED)
details::sync_warps_reset(get_sync_location(), details::cta::thread_rank());
g.sync();
#endif
}
private:
_CG_QUALIFIER details::barrier_t* get_sync_location() const {
// Different group sizes use different barriers; all groups of a given size share one barrier.
unsigned int sync_id = details::log2(Size / 64);
return &tile_memory->barriers[sync_id];
}
template <typename T>
_CG_QUALIFIER T* get_scratch_location(unsigned int warp_id) const {
unsigned int scratch_id = (details::cta::thread_rank() - thread_rank()) / 32 + warp_id;
return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
}
template <typename T>
_CG_QUALIFIER T* get_scratch_location() const {
unsigned int scratch_id = details::cta::thread_rank() / 32;
return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
}
template <typename TyVal>
_CG_QUALIFIER TyVal shfl_impl(TyVal val, unsigned int src) const {
unsigned int src_warp = src / 32;
auto warp = details::tiled_partition_internal<32, ThisType>();
details::barrier_t* sync_location = get_sync_location();
// Get the scratch slot of the source thread's warp.
TyVal* warp_scratch_location = get_scratch_location<TyVal>(src_warp);
if (warp.meta_group_rank() == src_warp) {
warp.sync();
// Put shuffled value into my warp slot and let my warp arrive at the barrier.
if (thread_rank() == src) {
*warp_scratch_location = val;
}
details::sync_warps_arrive(sync_location, details::cta::thread_rank(), numWarps);
TyVal result = *warp_scratch_location;
details::sync_warps_wait(sync_location, details::cta::thread_rank());
return result;
}
else {
// Wait for the source warp to arrive on the barrier.
details::sync_warps_wait_for_specific_warp(sync_location,
(details::cta::thread_rank() / 32 - warp.meta_group_rank() + src_warp));
TyVal result = *warp_scratch_location;
details::sync_warps(sync_location, details::cta::thread_rank(), numWarps);
return result;
}
}
template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
_CG_QUALIFIER TyVal collectives_scheme(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const {
static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
"Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
auto warp = details::tiled_partition_internal<32, ThisType>();
details::barrier_t* sync_location = get_sync_location();
TyVal* warp_scratch_location = get_scratch_location<TyVal>();
warp_lambda(warp, warp_scratch_location);
if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), numWarps)) {
auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>();
if (subwarp.meta_group_rank() == 0) {
TyVal* thread_scratch_location = get_scratch_location<TyVal>(subwarp.thread_rank());
inter_warp_lambda(subwarp, thread_scratch_location);
}
warp.sync();
details::sync_warps_release(sync_location, warp.thread_rank() == 0, details::cta::thread_rank(), numWarps);
}
TyVal result = *warp_scratch_location;
return result;
}
public:
_CG_STATIC_CONST_DECL unsigned int _group_id = details::multi_tile_group_id;
using __static_size_tile_base<Size>::thread_rank;
template <typename TyVal>
_CG_QUALIFIER TyVal shfl(TyVal val, unsigned int src) const {
static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
"Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
return shfl_impl(val, src);
}
_CG_QUALIFIER void sync() const {
details::sync_warps(get_sync_location(), details::cta::thread_rank(), numWarps);
}
_CG_QUALIFIER int any(int predicate) const {
auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
*warp_scratch_location = __any_sync(0xFFFFFFFF, predicate);
};
auto inter_warp_lambda =
[] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
*thread_scratch_location = __any_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
};
return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
}
_CG_QUALIFIER int all(int predicate) const {
auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
*warp_scratch_location = __all_sync(0xFFFFFFFF, predicate);
};
auto inter_warp_lambda =
[] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
*thread_scratch_location = __all_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
};
return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
}
};
template <unsigned int Size, typename ParentT = void>
class __multi_warp_thread_block_tile :
public __static_size_multi_warp_tile_base<Size>,
public __static_parent_thread_block_tile_base<Size, ParentT>
{
typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
typedef __static_size_multi_warp_tile_base<Size> staticTileBaseT;
protected:
_CG_QUALIFIER __multi_warp_thread_block_tile(const ParentT& g) :
__static_size_multi_warp_tile_base<Size>(g) {}
};
template <unsigned int Size>
class __multi_warp_thread_block_tile<Size, void> : public __static_size_multi_warp_tile_base<Size>
{
const unsigned int metaGroupRank;
const unsigned int metaGroupSize;
protected:
template <unsigned int OtherSize, typename ParentT>
_CG_QUALIFIER __multi_warp_thread_block_tile(const __multi_warp_thread_block_tile<OtherSize, ParentT>& g) :
__static_size_multi_warp_tile_base<Size>(g), metaGroupRank(g.meta_group_rank()), metaGroupSize(g.meta_group_size()) {}
public:
_CG_QUALIFIER unsigned int meta_group_rank() const {
return metaGroupRank;
}
_CG_QUALIFIER unsigned int meta_group_size() const {
return metaGroupSize;
}
};
#endif
template <unsigned int Size, typename ParentT = void>
class thread_block_tile;
namespace details {
template <unsigned int Size, typename ParentT, bool IsMultiWarp>
class thread_block_tile_impl;
template <unsigned int Size, typename ParentT>
class thread_block_tile_impl<Size, ParentT, false>: public __single_warp_thread_block_tile<Size, ParentT>
{
protected:
template <unsigned int OtherSize, typename OtherParentT, bool OtherIsMultiWarp>
_CG_QUALIFIER thread_block_tile_impl(const thread_block_tile_impl<OtherSize, OtherParentT, OtherIsMultiWarp>& g) :
__single_warp_thread_block_tile<Size, ParentT>(g.meta_group_rank(), g.meta_group_size()) {}
_CG_QUALIFIER thread_block_tile_impl(const thread_block& g) :
__single_warp_thread_block_tile<Size, ParentT>() {}
};
#if defined(_CG_CPP11_FEATURES)
template <unsigned int Size, typename ParentT>
class thread_block_tile_impl<Size, ParentT, true> : public __multi_warp_thread_block_tile<Size, ParentT>
{
protected:
template <typename GroupT>
_CG_QUALIFIER thread_block_tile_impl(const GroupT& g) :
__multi_warp_thread_block_tile<Size, ParentT>(g) {}
};
#else
template <unsigned int Size, typename ParentT>
class thread_block_tile_impl<Size, ParentT, true>
{
protected:
template <typename GroupT>
_CG_QUALIFIER thread_block_tile_impl(const GroupT& g) {}
};
#endif
}
template <unsigned int Size, typename ParentT>
class thread_block_tile : public details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>
{
friend _CG_QUALIFIER thread_block_tile<1, void> this_thread();
protected:
_CG_QUALIFIER thread_block_tile(const ParentT& g) :
details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>(g) {}
public:
_CG_QUALIFIER operator thread_block_tile<Size, void>() const {
return thread_block_tile<Size, void>(*this);
}
};
template <unsigned int Size>
class thread_block_tile<Size, void> : public details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>
{
template <unsigned int, typename ParentT>
friend class thread_block_tile;
protected:
template <unsigned int OtherSize, typename OtherParentT>
_CG_QUALIFIER thread_block_tile(const thread_block_tile<OtherSize, OtherParentT>& g) :
details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
public:
template <typename ParentT>
_CG_QUALIFIER thread_block_tile(const thread_block_tile<Size, ParentT>& g) :
details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
};
namespace details {
template <unsigned int Size, typename ParentT>
struct tiled_partition_impl;
template <unsigned int Size>
struct tiled_partition_impl<Size, thread_block> : public thread_block_tile<Size, thread_block> {
_CG_QUALIFIER tiled_partition_impl(const thread_block& g) :
thread_block_tile<Size, thread_block>(g) {}
};
// ParentT = static thread_block_tile<ParentSize, GrandParent> specialization
template <unsigned int Size, unsigned int ParentSize, typename GrandParent>
struct tiled_partition_impl<Size, thread_block_tile<ParentSize, GrandParent> > :
public thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> > {
#ifdef _CG_CPP11_FEATURES
static_assert(Size < ParentSize, "Tile size must be smaller than the parent group size");
#endif
_CG_QUALIFIER tiled_partition_impl(const thread_block_tile<ParentSize, GrandParent>& g) :
thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> >(g) {}
};
}
template <unsigned int Size, typename ParentT>
_CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
{
return details::tiled_partition_impl<Size, ParentT>(g);
}
/**
* thread_block_tile<1, void> this_thread()
*
* Constructs a single-thread group containing only the calling thread
*/
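/*
* Example (illustrative sketch):
*
*   cooperative_groups::thread_block_tile<1> self = cooperative_groups::this_thread();
*   unsigned int rank = self.thread_rank(); // always 0 in a single-thread group
*/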
_CG_QUALIFIER thread_block_tile<1, void> this_thread()
{
// Make thread_block_tile<1, thread_block> parent of the returned group, so it will have its
// meta group rank and size set to 0 and 1 respectively.
return thread_block_tile<1, thread_block_tile<1, thread_block> >(this_thread_block());
}
/**
* <group_type>.sync()
*
* Executes a barrier across the group
*
* Implements both a compiler fence and an architectural fence to prevent
* memory reordering around the barrier.
*/
_CG_QUALIFIER void thread_group::sync() const
{
switch (_data.group.type) {
case details::coalesced_group_id:
cooperative_groups::sync(*static_cast<const coalesced_group*>(this));
break;
case details::thread_block_id:
cooperative_groups::sync(*static_cast<const thread_block*>(this));
break;
case details::grid_group_id:
cooperative_groups::sync(*static_cast<const grid_group*>(this));
break;
#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
case details::multi_grid_group_id:
cooperative_groups::sync(*static_cast<const multi_grid_group*>(this));
break;
#endif
#if defined(_CG_HAS_CLUSTER_GROUP)
case details::cluster_group_id:
cooperative_groups::sync(*static_cast<const cluster_group*>(this));
break;
#endif
default:
break;
}
}
/**
* <group_type>.size()
*
* Returns the total number of threads in the group.
*/
_CG_QUALIFIER unsigned long long thread_group::size() const
{
unsigned long long size = 0;
switch (_data.group.type) {
case details::coalesced_group_id:
size = cooperative_groups::group_size(*static_cast<const coalesced_group*>(this));
break;
case details::thread_block_id:
size = cooperative_groups::group_size(*static_cast<const thread_block*>(this));
break;
case details::grid_group_id:
size = cooperative_groups::group_size(*static_cast<const grid_group*>(this));
break;
#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
case details::multi_grid_group_id:
size = cooperative_groups::group_size(*static_cast<const multi_grid_group*>(this));
break;
#endif
#if defined(_CG_HAS_CLUSTER_GROUP)
case details::cluster_group_id:
size = cooperative_groups::group_size(*static_cast<const cluster_group*>(this));
break;
#endif
default:
break;
}
return size;
}
/**
* <group_type>.thread_rank()
*
* Returns the linearized rank of the calling thread in the interval [0, size()).
*/
_CG_QUALIFIER unsigned long long thread_group::thread_rank() const
{
unsigned long long rank = 0;
switch (_data.group.type) {
case details::coalesced_group_id:
rank = cooperative_groups::thread_rank(*static_cast<const coalesced_group*>(this));
break;
case details::thread_block_id:
rank = cooperative_groups::thread_rank(*static_cast<const thread_block*>(this));
break;
case details::grid_group_id:
rank = cooperative_groups::thread_rank(*static_cast<const grid_group*>(this));
break;
#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
case details::multi_grid_group_id:
rank = cooperative_groups::thread_rank(*static_cast<const multi_grid_group*>(this));
break;
#endif
#if defined(_CG_HAS_CLUSTER_GROUP)
case details::cluster_group_id:
rank = cooperative_groups::thread_rank(*static_cast<const cluster_group*>(this));
break;
#endif
default:
break;
}
return rank;
}
_CG_END_NAMESPACE
#include <cooperative_groups/details/partitioning.h>
#if (!defined(_MSC_VER) || defined(_WIN64))
# include <cooperative_groups/details/invoke.h>
#endif
# endif /* ! (__cplusplus, __CUDACC__) */
#endif /* !_COOPERATIVE_GROUPS_H_ */