/*
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
 * These Licensed Deliverables contained herein are PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
 * only as a commercial end item. Consistent with 48 C.F.R. 12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef _CUDA_PIPELINE_H_
# define _CUDA_PIPELINE_H_
# include "cuda_pipeline_primitives.h"
# if !defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
-std=c++11 compiler option.
# endif
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
# include "cuda_awbarrier.h"
# endif
// Integration with libcu++'s cuda::barrier<cuda::thread_scope_block>.
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
# if defined(_LIBCUDACXX_CUDA_ABI_VERSION)
# define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION _LIBCUDACXX_CUDA_ABI_VERSION
# else
# define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION 4
# endif
# define _LIBCUDACXX_PIPELINE_CONCAT(X, Y) X ## Y
# define _LIBCUDACXX_PIPELINE_CONCAT2(X, Y) _LIBCUDACXX_PIPELINE_CONCAT(X, Y)
# define _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE _LIBCUDACXX_PIPELINE_CONCAT2(__, _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION)
namespace cuda { inline namespace _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE {
struct __block_scope_barrier_base;
}}
# endif
_CUDA_PIPELINE_BEGIN_NAMESPACE
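/*
 * segment<N>(ptr) reinterprets a raw pointer as a pointer to an array of N
 * elements, which is the form the array overload of memcpy_async below
 * expects.
 */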
template<size_t N, typename T>
_CUDA_PIPELINE_QUALIFIER
auto segment(T* ptr) -> T(*)[N];
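/*
 * Per-thread handle over the asynchronous copy pipeline. memcpy_async calls
 * issued through a pipeline object join its current batch; commit() closes
 * that batch and returns its zero-based index, wait(batch) blocks until the
 * batch with that index has completed, and wait_prior<N>() blocks until all
 * but the N most recently committed batches have completed.
 */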
class pipeline {
public:
pipeline(const pipeline&) = delete;
pipeline(pipeline&&) = delete;
pipeline& operator=(const pipeline&) = delete;
pipeline& operator=(pipeline&&) = delete;
_CUDA_PIPELINE_QUALIFIER pipeline();
_CUDA_PIPELINE_QUALIFIER size_t commit();
_CUDA_PIPELINE_QUALIFIER void commit_and_wait();
_CUDA_PIPELINE_QUALIFIER void wait(size_t batch);
template<unsigned N>
_CUDA_PIPELINE_QUALIFIER void wait_prior();
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
_CUDA_PIPELINE_QUALIFIER void arrive_on(awbarrier& barrier);
_CUDA_PIPELINE_QUALIFIER void arrive_on(cuda::__block_scope_barrier_base& barrier);
# endif
private:
size_t current_batch;
};
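/*
 * Scalar overload: asynchronously copies a single T from src to dst as part
 * of the current batch of pipe. Falls back to a synchronous assignment when
 * T is not trivially copyable.
 */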
template<class T>
_CUDA_PIPELINE_QUALIFIER
void memcpy_async(T& dst, const T& src, pipeline& pipe);
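/*
 * Array overload: asynchronously copies SrcN elements of T from *src into
 * *dst and zero-fills any remaining destination elements. The destination
 * array must be exactly 4, 8, or 16 bytes wide, and both pointers must be
 * aligned to that width.
 */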
template<class T, size_t DstN, size_t SrcN>
_CUDA_PIPELINE_QUALIFIER
void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe);
template<size_t N, typename T>
_CUDA_PIPELINE_QUALIFIER
auto segment(T* ptr) -> T(*)[N]
{
    return reinterpret_cast<T(*)[N]>(ptr);
}
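/*
 * A minimal sketch of segment() feeding the array overload of memcpy_async
 * (illustrative only; `smem`, `gmem`, and `pipe` are hypothetical names, and
 * `gmem` is assumed to be 16-byte aligned):
 *
 *     __shared__ __align__(16) int smem[4];
 *     // View both pointers as int(*)[4] so a single 16-byte copy is issued.
 *     memcpy_async(segment<4>(smem), segment<4>(gmem), pipe);
 *     pipe.commit_and_wait();
 */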
_CUDA_PIPELINE_QUALIFIER
pipeline::pipeline()
: current_batch(0)
{
}
_CUDA_PIPELINE_QUALIFIER
size_t pipeline::commit()
{
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit();
return this->current_batch++;
}
_CUDA_PIPELINE_QUALIFIER
void pipeline::commit_and_wait()
{
(void)pipeline::commit();
pipeline::wait_prior<0>();
}
_CUDA_PIPELINE_QUALIFIER
void pipeline::wait(size_t batch)
{
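    // Convert the batch index returned by commit() into the number of more
    // recently committed batches that may remain outstanding; the switch
    // below dispatches to a compile-time wait depth, clamping anything
    // deeper than 8.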
    const size_t prior = this->current_batch > batch ? this->current_batch - batch - 1 : 0;
switch (prior) {
case 0 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); break;
case 1 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); break;
case 2 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); break;
case 3 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); break;
case 4 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); break;
case 5 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); break;
case 6 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); break;
case 7 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); break;
default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); break;
}
}
template<unsigned N>
_CUDA_PIPELINE_QUALIFIER
void pipeline::wait_prior()
{
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<N>();
}
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
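/*
 * Both arrive_on overloads make the thread's arrival on the given barrier
 * contingent on completion of the asynchronous copies it has previously
 * committed through this pipeline: the first targets this header's own
 * awbarrier type, the second targets libcu++'s
 * cuda::barrier<cuda::thread_scope_block> via its forward-declared base.
 */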
_CUDA_PIPELINE_QUALIFIER
void pipeline::arrive_on(awbarrier& barrier)
{
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(&barrier.barrier);
}
_CUDA_PIPELINE_QUALIFIER
void pipeline::arrive_on(cuda::__block_scope_barrier_base& barrier)
{
    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(reinterpret_cast<uint64_t*>(&barrier));
}
# endif
template<class T>
_CUDA_PIPELINE_QUALIFIER
void memcpy_async(T& dst, const T& src, pipeline& pipe)
{
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&src) & (alignof(T) - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&dst) & (alignof(T) - 1)));
if (__is_trivially_copyable(T)) {
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_relaxed<sizeof(T), alignof(T)>(
reinterpret_cast<void*>(&dst), reinterpret_cast<const void*>(&src));
} else {
dst = src;
}
}
template<class T, size_t DstN, size_t SrcN>
_CUDA_PIPELINE_QUALIFIER
void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe)
{
constexpr size_t dst_size = sizeof(*dst);
constexpr size_t src_size = sizeof(*src);
static_assert(dst_size == 4 || dst_size == 8 || dst_size == 16, "Unsupported copy size.");
static_assert(src_size <= dst_size, "Source size must be less than or equal to destination size.");
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (dst_size - 1)));
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (dst_size - 1)));
if (__is_trivially_copyable(T)) {
        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_strict<dst_size, src_size>(
reinterpret_cast<void*>(*dst), reinterpret_cast<const void*>(*src));
} else {
for (size_t i = 0; i < DstN; ++i) {
(*dst)[i] = (i < SrcN) ? (*src)[i] : T();
}
}
}
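/*
 * End-to-end usage sketch: a double-buffered copy/compute loop. Illustrative
 * only; `kernel`, `consume_tile`, `gmem`, and `tiles` are hypothetical names,
 * `gmem` is assumed 16-byte aligned, inter-thread synchronization is omitted
 * for brevity, and the nvcuda::experimental qualification assumes the
 * namespace opened by _CUDA_PIPELINE_BEGIN_NAMESPACE in the companion
 * primitives header.
 *
 *     __global__ void kernel(const int* gmem, size_t tiles) {
 *         using namespace nvcuda::experimental;
 *         __shared__ __align__(16) int buf[2][4];
 *         pipeline pipe;
 *
 *         // Prime the first stage.
 *         memcpy_async(segment<4>(buf[0]), segment<4>(gmem), pipe);
 *         pipe.commit();
 *         for (size_t t = 1; t < tiles; ++t) {
 *             // Stage tile t while tile t - 1 is still in flight.
 *             memcpy_async(segment<4>(buf[t % 2]), segment<4>(gmem + 4 * t), pipe);
 *             pipe.commit();
 *             pipe.wait_prior<1>();            // tile t - 1 is now resident
 *             consume_tile(buf[(t - 1) % 2]);
 *         }
 *         pipe.wait_prior<0>();                // drain the pipeline
 *         consume_tile(buf[(tiles - 1) % 2]);
 *     }
 */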
_CUDA_PIPELINE_END_NAMESPACE
#endif /* !_CUDA_PIPELINE_H_ */