/*
* Copyright 2011-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(_CUPTI_ACTIVITY_H_)
#define _CUPTI_ACTIVITY_H_
#include <cuda.h>
#include <cupti_callbacks.h>
#include <cupti_events.h>
#include <cupti_metrics.h>
#include <cupti_result.h>
#if defined(CUPTI_DIRECTIVE_SUPPORT)
#include <Openacc/cupti_openacc.h>
#include <Openmp/cupti_openmp.h>
#endif
#ifndef CUPTIAPI
#ifdef _WIN32
#define CUPTIAPI __stdcall
#else
#define CUPTIAPI
#endif
#endif
#if defined(__LP64__)
#define CUPTILP64 1
#elif defined(_WIN64)
#define CUPTILP64 1
#else
#undef CUPTILP64
#endif
#define ACTIVITY_RECORD_ALIGNMENT 8
#if defined(_WIN32) // Windows 32- and 64-bit
#define START_PACKED_ALIGNMENT __pragma(pack(push,1)) // exact fit - no padding
#define PACKED_ALIGNMENT __declspec(align(ACTIVITY_RECORD_ALIGNMENT))
#define END_PACKED_ALIGNMENT __pragma(pack(pop))
#elif defined(__GNUC__) // GCC
#define START_PACKED_ALIGNMENT
#define PACKED_ALIGNMENT __attribute__ ((__packed__)) __attribute__ ((aligned (ACTIVITY_RECORD_ALIGNMENT)))
#define END_PACKED_ALIGNMENT
#else // all other compilers
#define START_PACKED_ALIGNMENT
#define PACKED_ALIGNMENT
#define END_PACKED_ALIGNMENT
#endif
#define CUPTI_UNIFIED_MEMORY_CPU_DEVICE_ID ((uint32_t) 0xFFFFFFFFU)
#define CUPTI_INVALID_CONTEXT_ID ((uint32_t) 0xFFFFFFFFU)
#define CUPTI_INVALID_STREAM_ID ((uint32_t) 0xFFFFFFFFU)
#define CUPTI_INVALID_CHANNEL_ID ((uint32_t) 0xFFFFFFFFU)
#if defined(__cplusplus)
extern "C" {
#endif
#if defined(__GNUC__) && defined(CUPTI_LIB)
#pragma GCC visibility push(default)
#endif
#define invalidNumaId ((uint32_t) 0xFFFFFFFF)
/**
* \defgroup CUPTI_ACTIVITY_API CUPTI Activity API
* Functions, types, and enums that implement the CUPTI Activity API.
* @{
*/
/**
* \brief The kinds of activity records.
*
* Each activity record kind represents information about a GPU or an
* activity occurring on a CPU or GPU. Each kind is associated with a
* activity record structure that holds the information associated
* with the kind.
* \see CUpti_Activity
* \see CUpti_ActivityAPI
* \see CUpti_ActivityContext
* \see CUpti_ActivityDevice
* \see CUpti_ActivityDevice2
* \see CUpti_ActivityDevice3
* \see CUpti_ActivityDevice4
* \see CUpti_ActivityDeviceAttribute
* \see CUpti_ActivityEvent
* \see CUpti_ActivityEventInstance
* \see CUpti_ActivityKernel
* \see CUpti_ActivityKernel2
* \see CUpti_ActivityKernel3
* \see CUpti_ActivityKernel4
* \see CUpti_ActivityKernel5
* \see CUpti_ActivityKernel6
* \see CUpti_ActivityKernel7
* \see CUpti_ActivityKernel8
* \see CUpti_ActivityKernel9
* \see CUpti_ActivityCdpKernel
* \see CUpti_ActivityPreemption
* \see CUpti_ActivityMemcpy
* \see CUpti_ActivityMemcpy3
* \see CUpti_ActivityMemcpy4
* \see CUpti_ActivityMemcpy5
* \see CUpti_ActivityMemcpyPtoP
* \see CUpti_ActivityMemcpyPtoP2
* \see CUpti_ActivityMemcpyPtoP3
* \see CUpti_ActivityMemcpyPtoP4
* \see CUpti_ActivityMemset
* \see CUpti_ActivityMemset2
* \see CUpti_ActivityMemset3
* \see CUpti_ActivityMemset4
* \see CUpti_ActivityMetric
* \see CUpti_ActivityMetricInstance
* \see CUpti_ActivityName
* \see CUpti_ActivityMarker
* \see CUpti_ActivityMarker2
* \see CUpti_ActivityMarkerData
* \see CUpti_ActivitySourceLocator
* \see CUpti_ActivityGlobalAccess
* \see CUpti_ActivityGlobalAccess2
* \see CUpti_ActivityGlobalAccess3
* \see CUpti_ActivityBranch
* \see CUpti_ActivityBranch2
* \see CUpti_ActivityOverhead
* \see CUpti_ActivityEnvironment
* \see CUpti_ActivityInstructionExecution
* \see CUpti_ActivityUnifiedMemoryCounter
* \see CUpti_ActivityFunction
* \see CUpti_ActivityModule
* \see CUpti_ActivitySharedAccess
* \see CUpti_ActivityPCSampling
* \see CUpti_ActivityPCSampling2
* \see CUpti_ActivityPCSampling3
* \see CUpti_ActivityPCSamplingRecordInfo
* \see CUpti_ActivityCudaEvent
* \see CUpti_ActivityStream
* \see CUpti_ActivitySynchronization
* \see CUpti_ActivityInstructionCorrelation
* \see CUpti_ActivityExternalCorrelation
* \see CUpti_ActivityUnifiedMemoryCounter2
* \see CUpti_ActivityOpenAccData
* \see CUpti_ActivityOpenAccLaunch
* \see CUpti_ActivityOpenAccOther
* \see CUpti_ActivityOpenMp
* \see CUpti_ActivityNvLink
* \see CUpti_ActivityNvLink2
* \see CUpti_ActivityNvLink3
* \see CUpti_ActivityNvLink4
* \see CUpti_ActivityMemory
* \see CUpti_ActivityPcie
*/
typedef enum {
/**
* The activity record is invalid.
*/
CUPTI_ACTIVITY_KIND_INVALID = 0,
/**
* A host<->host, host<->device, or device<->device memory copy. The
* corresponding activity record structure is \ref
* CUpti_ActivityMemcpy5.
*/
CUPTI_ACTIVITY_KIND_MEMCPY = 1,
/**
* A memory set executing on the GPU. The corresponding activity
* record structure is \ref CUpti_ActivityMemset4.
*/
CUPTI_ACTIVITY_KIND_MEMSET = 2,
/**
* A kernel executing on the GPU. This activity kind may significantly change
* the overall performance characteristics of the application because all
* kernel executions are serialized on the GPU. Other activity kind for kernel
* CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL doesn't break kernel concurrency.
* The corresponding activity record structure is \ref CUpti_ActivityKernel9.
*/
CUPTI_ACTIVITY_KIND_KERNEL = 3,
/**
* A CUDA driver API function execution. The corresponding activity
* record structure is \ref CUpti_ActivityAPI.
*/
CUPTI_ACTIVITY_KIND_DRIVER = 4,
/**
* A CUDA runtime API function execution. The corresponding activity
* record structure is \ref CUpti_ActivityAPI.
*/
CUPTI_ACTIVITY_KIND_RUNTIME = 5,
/**
* An event value. The corresponding activity record structure is
* \ref CUpti_ActivityEvent.
*/
CUPTI_ACTIVITY_KIND_EVENT = 6,
/**
* A metric value. The corresponding activity record structure is
* \ref CUpti_ActivityMetric.
*/
CUPTI_ACTIVITY_KIND_METRIC = 7,
/**
* Information about a device. The corresponding activity record
* structure is \ref CUpti_ActivityDevice4.
*/
CUPTI_ACTIVITY_KIND_DEVICE = 8,
/**
* Information about a context. The corresponding activity record
* structure is \ref CUpti_ActivityContext.
*/
CUPTI_ACTIVITY_KIND_CONTEXT = 9,
/**
* A kernel executing on the GPU. This activity kind doesn't break
* kernel concurrency. The corresponding activity record structure
* is \ref CUpti_ActivityKernel9.
*/
CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL = 10,
/**
* Resource naming done via NVTX APIs for thread, device, context, etc.
* The corresponding activity record structure is \ref CUpti_ActivityName.
*/
CUPTI_ACTIVITY_KIND_NAME = 11,
/**
* Instantaneous, start, or end NVTX marker. The corresponding activity
* record structure is \ref CUpti_ActivityMarker2.
*/
CUPTI_ACTIVITY_KIND_MARKER = 12,
/**
* Extended, optional, data about a marker. The corresponding
* activity record structure is \ref CUpti_ActivityMarkerData.
*/
CUPTI_ACTIVITY_KIND_MARKER_DATA = 13,
/**
* Source information about source level result. The corresponding
* activity record structure is \ref CUpti_ActivitySourceLocator.
*/
CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR = 14,
/**
* Results for source-level global acccess. The
* corresponding activity record structure is \ref
* CUpti_ActivityGlobalAccess3.
*/
CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS = 15,
/**
* Results for source-level branch. The corresponding
* activity record structure is \ref CUpti_ActivityBranch2.
*/
CUPTI_ACTIVITY_KIND_BRANCH = 16,
/**
* Overhead activity records. The
* corresponding activity record structure is
* \ref CUpti_ActivityOverhead.
*/
CUPTI_ACTIVITY_KIND_OVERHEAD = 17,
/**
* A CDP (CUDA Dynamic Parallel) kernel executing on the GPU. The
* corresponding activity record structure is \ref
* CUpti_ActivityCdpKernel. This activity can not be directly
* enabled or disabled. It is enabled and disabled through
* concurrent kernel activity i.e. _CONCURRENT_KERNEL.
*/
CUPTI_ACTIVITY_KIND_CDP_KERNEL = 18,
/**
* Preemption activity record indicating a preemption of a CDP (CUDA
* Dynamic Parallel) kernel executing on the GPU. The corresponding
* activity record structure is \ref CUpti_ActivityPreemption.
*/
CUPTI_ACTIVITY_KIND_PREEMPTION = 19,
/**
* Environment activity records indicating power, clock, thermal,
* etc. levels of the GPU. The corresponding activity record
* structure is \ref CUpti_ActivityEnvironment.
*/
CUPTI_ACTIVITY_KIND_ENVIRONMENT = 20,
/**
* An event value associated with a specific event domain
* instance. The corresponding activity record structure is \ref
* CUpti_ActivityEventInstance.
*/
CUPTI_ACTIVITY_KIND_EVENT_INSTANCE = 21,
/**
* A peer to peer memory copy. The corresponding activity record
* structure is \ref CUpti_ActivityMemcpyPtoP4.
*/
CUPTI_ACTIVITY_KIND_MEMCPY2 = 22,
/**
* A metric value associated with a specific metric domain
* instance. The corresponding activity record structure is \ref
* CUpti_ActivityMetricInstance.
*/
CUPTI_ACTIVITY_KIND_METRIC_INSTANCE = 23,
/**
* Results for source-level instruction execution.
* The corresponding activity record structure is \ref
* CUpti_ActivityInstructionExecution.
*/
CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION = 24,
/**
* Unified Memory counter record. The corresponding activity
* record structure is \ref CUpti_ActivityUnifiedMemoryCounter2.
*/
CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER = 25,
/**
* Device global/function record. The corresponding activity
* record structure is \ref CUpti_ActivityFunction.
*/
CUPTI_ACTIVITY_KIND_FUNCTION = 26,
/**
* CUDA Module record. The corresponding activity
* record structure is \ref CUpti_ActivityModule.
*/
CUPTI_ACTIVITY_KIND_MODULE = 27,
/**
* A device attribute value. The corresponding activity record
* structure is \ref CUpti_ActivityDeviceAttribute.
*/
CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE = 28,
/**
* Results for source-level shared acccess. The
* corresponding activity record structure is \ref
* CUpti_ActivitySharedAccess.
*/
CUPTI_ACTIVITY_KIND_SHARED_ACCESS = 29,
/**
* Enable PC sampling for kernels. This will serialize
* kernels. The corresponding activity record structure
* is \ref CUpti_ActivityPCSampling3.
*/
CUPTI_ACTIVITY_KIND_PC_SAMPLING = 30,
/**
* Summary information about PC sampling records. The
* corresponding activity record structure is \ref
* CUpti_ActivityPCSamplingRecordInfo.
*/
CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO = 31,
/**
* SASS/Source line-by-line correlation record.
* This will generate sass/source correlation for functions that have source
* level analysis or pc sampling results. The records will be generated only
* when either of source level analysis or pc sampling activity is enabled.
* The corresponding activity record structure is \ref
* CUpti_ActivityInstructionCorrelation.
*/
CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION = 32,
/**
* OpenACC data events.
* The corresponding activity record structure is \ref
* CUpti_ActivityOpenAccData.
*/
CUPTI_ACTIVITY_KIND_OPENACC_DATA = 33,
/**
* OpenACC launch events.
* The corresponding activity record structure is \ref
* CUpti_ActivityOpenAccLaunch.
*/
CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH = 34,
/**
* OpenACC other events.
* The corresponding activity record structure is \ref
* CUpti_ActivityOpenAccOther.
*/
CUPTI_ACTIVITY_KIND_OPENACC_OTHER = 35,
/**
* Information about a CUDA event. The
* corresponding activity record structure is \ref
* CUpti_ActivityCudaEvent.
*/
CUPTI_ACTIVITY_KIND_CUDA_EVENT = 36,
/**
* Information about a CUDA stream. The
* corresponding activity record structure is \ref
* CUpti_ActivityStream.
*/
CUPTI_ACTIVITY_KIND_STREAM = 37,
/**
* Records for synchronization management. The
* corresponding activity record structure is \ref
* CUpti_ActivitySynchronization.
*/
CUPTI_ACTIVITY_KIND_SYNCHRONIZATION = 38,
/**
* Records for correlation of different programming APIs. The
* corresponding activity record structure is \ref
* CUpti_ActivityExternalCorrelation.
*/
CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION = 39,
/**
* NVLink information.
* The corresponding activity record structure is \ref
* CUpti_ActivityNvLink4.
*/
CUPTI_ACTIVITY_KIND_NVLINK = 40,
/**
* Instantaneous Event information.
* The corresponding activity record structure is \ref
* CUpti_ActivityInstantaneousEvent.
*/
CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT = 41,
/**
* Instantaneous Event information for a specific event
* domain instance.
* The corresponding activity record structure is \ref
* CUpti_ActivityInstantaneousEventInstance
*/
CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE = 42,
/**
* Instantaneous Metric information
* The corresponding activity record structure is \ref
* CUpti_ActivityInstantaneousMetric.
*/
CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC = 43,
/**
* Instantaneous Metric information for a specific metric
* domain instance.
* The corresponding activity record structure is \ref
* CUpti_ActivityInstantaneousMetricInstance.
*/
CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE = 44,
/**
* Memory activity tracking allocation and freeing of the memory
* The corresponding activity record structure is \ref
* CUpti_ActivityMemory.
*/
CUPTI_ACTIVITY_KIND_MEMORY = 45,
/**
* PCI devices information used for PCI topology.
* The corresponding activity record structure is \ref
* CUpti_ActivityPcie.
*/
CUPTI_ACTIVITY_KIND_PCIE = 46,
/**
* OpenMP parallel events.
* The corresponding activity record structure is \ref
* CUpti_ActivityOpenMp.
*/
CUPTI_ACTIVITY_KIND_OPENMP = 47,
/**
* A CUDA driver kernel launch occurring outside of any
* public API function execution. Tools can handle these
* like records for driver API launch functions, although
* the cbid field is not used here.
* The corresponding activity record structure is \ref
* CUpti_ActivityAPI.
*/
CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API = 48,
/**
* Memory activity tracking allocation and freeing of the memory
* The corresponding activity record structure is \ref
* CUpti_ActivityMemory3.
*/
CUPTI_ACTIVITY_KIND_MEMORY2 = 49,
/**
* Memory pool activity tracking creation, destruction and
* triming of the memory pool.
* The corresponding activity record structure is \ref
* CUpti_ActivityMemoryPool2.
*/
CUPTI_ACTIVITY_KIND_MEMORY_POOL = 50,
/**
* The corresponding activity record structure is
* \ref CUpti_ActivityGraphTrace.
*/
CUPTI_ACTIVITY_KIND_GRAPH_TRACE = 51,
/**
* JIT operation tracking
* The corresponding activity record structure is \ref
* CUpti_ActivityJit.
*/
CUPTI_ACTIVITY_KIND_JIT = 52,
CUPTI_ACTIVITY_KIND_COUNT,
CUPTI_ACTIVITY_KIND_FORCE_INT = 0x7fffffff
} CUpti_ActivityKind;
/**
* \brief The kinds of activity objects.
* \see CUpti_ActivityObjectKindId
*/
typedef enum {
/**
* The object kind is not known.
*/
CUPTI_ACTIVITY_OBJECT_UNKNOWN = 0,
/**
* A process.
*/
CUPTI_ACTIVITY_OBJECT_PROCESS = 1,
/**
* A thread.
*/
CUPTI_ACTIVITY_OBJECT_THREAD = 2,
/**
* A device.
*/
CUPTI_ACTIVITY_OBJECT_DEVICE = 3,
/**
* A context.
*/
CUPTI_ACTIVITY_OBJECT_CONTEXT = 4,
/**
* A stream.
*/
CUPTI_ACTIVITY_OBJECT_STREAM = 5,
CUPTI_ACTIVITY_OBJECT_FORCE_INT = 0x7fffffff
} CUpti_ActivityObjectKind;
/**
* \brief Identifiers for object kinds as specified by
* CUpti_ActivityObjectKind.
* \see CUpti_ActivityObjectKind
*/
typedef union {
/**
* A process object requires that we identify the process ID. A
* thread object requires that we identify both the process and
* thread ID.
*/
struct {
uint32_t processId;
uint32_t threadId;
} pt;
/**
* A device object requires that we identify the device ID. A
* context object requires that we identify both the device and
* context ID. A stream object requires that we identify device,
* context, and stream ID.
*/
struct {
uint32_t deviceId;
uint32_t contextId;
uint32_t streamId;
} dcs;
} CUpti_ActivityObjectKindId;
/**
* \brief The kinds of activity overhead.
*/
typedef enum {
/**
* The overhead kind is not known.
*/
CUPTI_ACTIVITY_OVERHEAD_UNKNOWN = 0,
/**
* Compiler overhead.
*/
CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER = 1,
/**
* Activity buffer flush overhead.
*/
CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH = 1<<16,
/**
* CUPTI instrumentation overhead.
*/
CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION = 2<<16,
/**
* CUPTI resource creation and destruction overhead.
*/
CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE = 3<<16,
CUPTI_ACTIVITY_OVERHEAD_FORCE_INT = 0x7fffffff
} CUpti_ActivityOverheadKind;
/**
* \brief The kind of a compute API.
*/
typedef enum {
/**
* The compute API is not known.
*/
CUPTI_ACTIVITY_COMPUTE_API_UNKNOWN = 0,
/**
* The compute APIs are for CUDA.
*/
CUPTI_ACTIVITY_COMPUTE_API_CUDA = 1,
/**
* The compute APIs are for CUDA running
* in MPS (Multi-Process Service) environment.
*/
CUPTI_ACTIVITY_COMPUTE_API_CUDA_MPS = 2,
CUPTI_ACTIVITY_COMPUTE_API_FORCE_INT = 0x7fffffff
} CUpti_ActivityComputeApiKind;
/**
* \brief Flags associated with activity records.
*
* Activity record flags. Flags can be combined by bitwise OR to
* associated multiple flags with an activity record. Each flag is
* specific to a certain activity kind, as noted below.
*/
typedef enum {
/**
* Indicates the activity record has no flags.
*/
CUPTI_ACTIVITY_FLAG_NONE = 0,
/**
* Indicates the activity represents a device that supports
* concurrent kernel execution. Valid for
* CUPTI_ACTIVITY_KIND_DEVICE.
*/
CUPTI_ACTIVITY_FLAG_DEVICE_CONCURRENT_KERNELS = 1 << 0,
/**
* Indicates if the activity represents a CUdevice_attribute value
* or a CUpti_DeviceAttribute value. Valid for
* CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE.
*/
CUPTI_ACTIVITY_FLAG_DEVICE_ATTRIBUTE_CUDEVICE = 1 << 0,
/**
* Indicates the activity represents an asynchronous memcpy
* operation. Valid for CUPTI_ACTIVITY_KIND_MEMCPY.
*/
CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC = 1 << 0,
/**
* Indicates the activity represents an instantaneous marker. Valid
* for CUPTI_ACTIVITY_KIND_MARKER.
*/
CUPTI_ACTIVITY_FLAG_MARKER_INSTANTANEOUS = 1 << 0,
/**
* Indicates the activity represents a region start marker. Valid
* for CUPTI_ACTIVITY_KIND_MARKER.
*/
CUPTI_ACTIVITY_FLAG_MARKER_START = 1 << 1,
/**
* Indicates the activity represents a region end marker. Valid for
* CUPTI_ACTIVITY_KIND_MARKER.
*/
CUPTI_ACTIVITY_FLAG_MARKER_END = 1 << 2,
/**
* Indicates the activity represents an attempt to acquire a user
* defined synchronization object.
* Valid for CUPTI_ACTIVITY_KIND_MARKER.
*/
CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE = 1 << 3,
/**
* Indicates the activity represents success in acquiring the
* user defined synchronization object.
* Valid for CUPTI_ACTIVITY_KIND_MARKER.
*/
CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_SUCCESS = 1 << 4,
/**
* Indicates the activity represents failure in acquiring the
* user defined synchronization object.
* Valid for CUPTI_ACTIVITY_KIND_MARKER.
*/
CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_FAILED = 1 << 5,
/**
* Indicates the activity represents releasing a reservation on
* user defined synchronization object.
* Valid for CUPTI_ACTIVITY_KIND_MARKER.
*/
CUPTI_ACTIVITY_FLAG_MARKER_SYNC_RELEASE = 1 << 6,
/**
* Indicates the activity represents a marker that does not specify
* a color. Valid for CUPTI_ACTIVITY_KIND_MARKER_DATA.
*/
CUPTI_ACTIVITY_FLAG_MARKER_COLOR_NONE = 1 << 0,
/**
* Indicates the activity represents a marker that specifies a color
* in alpha-red-green-blue format. Valid for
* CUPTI_ACTIVITY_KIND_MARKER_DATA.
*/
CUPTI_ACTIVITY_FLAG_MARKER_COLOR_ARGB = 1 << 1,
/**
* The number of bytes requested by each thread
* Valid for CUpti_ActivityGlobalAccess3.
*/
CUPTI_ACTIVITY_FLAG_GLOBAL_ACCESS_KIND_SIZE_MASK = 0xFF << 0,
/**
* If bit in this flag is set, the access was load, else it is a
* store access. Valid for CUpti_ActivityGlobalAccess3.
*/
CUPTI_ACTIVITY_FLAG_GLOBAL_ACCESS_KIND_LOAD = 1 << 8,
/**
* If this bit in flag is set, the load access was cached else it is
* uncached. Valid for CUpti_ActivityGlobalAccess3.
*/
CUPTI_ACTIVITY_FLAG_GLOBAL_ACCESS_KIND_CACHED = 1 << 9,
/**
* If this bit in flag is set, the metric value overflowed. Valid
* for CUpti_ActivityMetric and CUpti_ActivityMetricInstance.
*/
CUPTI_ACTIVITY_FLAG_METRIC_OVERFLOWED = 1 << 0,
/**
* If this bit in flag is set, the metric value couldn't be
* calculated. This occurs when a value(s) required to calculate the
* metric is missing. Valid for CUpti_ActivityMetric and
* CUpti_ActivityMetricInstance.
*/
CUPTI_ACTIVITY_FLAG_METRIC_VALUE_INVALID = 1 << 1,
/**
* If this bit in flag is set, the source level metric value couldn't be
* calculated. This occurs when a value(s) required to calculate the
* source level metric cannot be evaluated.
* Valid for CUpti_ActivityInstructionExecution.
*/
CUPTI_ACTIVITY_FLAG_INSTRUCTION_VALUE_INVALID = 1 << 0,
/**
* The mask for the instruction class, \ref CUpti_ActivityInstructionClass
* Valid for CUpti_ActivityInstructionExecution and
* CUpti_ActivityInstructionCorrelation
*/
CUPTI_ACTIVITY_FLAG_INSTRUCTION_CLASS_MASK = 0xFF << 1,
/**
* When calling cuptiActivityFlushAll, this flag
* can be set to force CUPTI to flush all records in the buffer, whether
* finished or not
*/
CUPTI_ACTIVITY_FLAG_FLUSH_FORCED = 1 << 0,
/**
* The number of bytes requested by each thread
* Valid for CUpti_ActivitySharedAccess.
*/
CUPTI_ACTIVITY_FLAG_SHARED_ACCESS_KIND_SIZE_MASK = 0xFF << 0,
/**
* If bit in this flag is set, the access was load, else it is a
* store access. Valid for CUpti_ActivitySharedAccess.
*/
CUPTI_ACTIVITY_FLAG_SHARED_ACCESS_KIND_LOAD = 1 << 8,
/**
* Indicates the activity represents an asynchronous memset
* operation. Valid for CUPTI_ACTIVITY_KIND_MEMSET.
*/
CUPTI_ACTIVITY_FLAG_MEMSET_ASYNC = 1 << 0,
/**
* Indicates the activity represents thrashing in CPU.
* Valid for counter of kind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING in
* CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER
*/
CUPTI_ACTIVITY_FLAG_THRASHING_IN_CPU = 1 << 0,
/**
* Indicates the activity represents page throttling in CPU.
* Valid for counter of kind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING in
* CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER
*/
CUPTI_ACTIVITY_FLAG_THROTTLING_IN_CPU = 1 << 0,
CUPTI_ACTIVITY_FLAG_FORCE_INT = 0x7fffffff
} CUpti_ActivityFlag;
/**
* \brief The stall reason for PC sampling activity.
*/
typedef enum {
/**
* Invalid reason
*/
CUPTI_ACTIVITY_PC_SAMPLING_STALL_INVALID = 0,
/**
* No stall, instruction is selected for issue
*/
CUPTI_ACTIVITY_PC_SAMPLING_STALL_NONE = 1,
/**
* Warp is blocked because next instruction is not yet available,
* because of instruction cache miss, or because of branching effects
*/
CUPTI_ACTIVITY_PC_SAMPLING_STALL_INST_FETCH = 2,
/**
* Instruction is waiting on an arithmatic dependency
*/
CUPTI_ACTIVITY_PC_SAMPLING_STALL_EXEC_DEPENDENCY = 3,
/**
* Warp is blocked because it is waiting for a memory access to complete.
*/
CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_DEPENDENCY = 4,
/**
* Texture sub-system is fully utilized or has too many outstanding requests.
*/
CUPTI_ACTIVITY_PC_SAMPLING_STALL_TEXTURE = 5,
/**
* Warp is blocked as it is waiting at __syncthreads() or at memory barrier.
*/
CUPTI_ACTIVITY_PC_SAMPLING_STALL_SYNC = 6,
/**
* Warp is blocked waiting for __constant__ memory and immediate memory access to complete.
*/
CUPTI_ACTIVITY_PC_SAMPLING_STALL_CONSTANT_MEMORY_DEPENDENCY = 7,
/**
* Compute operation cannot be performed due to the required resources not
* being available.
*/
CUPTI_ACTIVITY_PC_SAMPLING_STALL_PIPE_BUSY = 8,
/**
* Warp is blocked because there are too many pending memory operations.
* In Kepler architecture it often indicates high number of memory replays.
*/
CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_THROTTLE = 9,
/**
* Warp was ready to issue, but some other warp issued instead.
*/
CUPTI_ACTIVITY_PC_SAMPLING_STALL_NOT_SELECTED = 10,
/**
* Miscellaneous reasons
*/
CUPTI_ACTIVITY_PC_SAMPLING_STALL_OTHER = 11,
/**
* Sleeping.
*/
CUPTI_ACTIVITY_PC_SAMPLING_STALL_SLEEPING = 12,
CUPTI_ACTIVITY_PC_SAMPLING_STALL_FORCE_INT = 0x7fffffff
} CUpti_ActivityPCSamplingStallReason;
/**
* \brief Sampling period for PC sampling method
*
* Sampling period can be set using \ref cuptiActivityConfigurePCSampling
*/
typedef enum {
/**
* The PC sampling period is not set.
*/
CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_INVALID = 0,
/**
* Minimum sampling period available on the device.
*/
CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MIN = 1,
/**
* Sampling period in lower range.
*/
CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_LOW = 2,
/**
* Medium sampling period.
*/
CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MID = 3,
/**
* Sampling period in higher range.
*/
CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_HIGH = 4,
/**
* Maximum sampling period available on the device.
*/
CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MAX = 5,
CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_FORCE_INT = 0x7fffffff
} CUpti_ActivityPCSamplingPeriod;
/**
* \brief The kind of a memory copy, indicating the source and
* destination targets of the copy.
*
* Each kind represents the source and destination targets of a memory
* copy. Targets are host, device, and array.
*/
typedef enum {
/**
* The memory copy kind is not known.
*/
CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN = 0,
/**
* A host to device memory copy.
*/
CUPTI_ACTIVITY_MEMCPY_KIND_HTOD = 1,
/**
* A device to host memory copy.
*/
CUPTI_ACTIVITY_MEMCPY_KIND_DTOH = 2,
/**
* A host to device array memory copy.
*/
CUPTI_ACTIVITY_MEMCPY_KIND_HTOA = 3,
/**
* A device array to host memory copy.
*/
CUPTI_ACTIVITY_MEMCPY_KIND_ATOH = 4,
/**
* A device array to device array memory copy.
*/
CUPTI_ACTIVITY_MEMCPY_KIND_ATOA = 5,
/**
* A device array to device memory copy.
*/
CUPTI_ACTIVITY_MEMCPY_KIND_ATOD = 6,
/**
* A device to device array memory copy.
*/
CUPTI_ACTIVITY_MEMCPY_KIND_DTOA = 7,
/**
* A device to device memory copy on the same device.
*/
CUPTI_ACTIVITY_MEMCPY_KIND_DTOD = 8,
/**
* A host to host memory copy.
*/
CUPTI_ACTIVITY_MEMCPY_KIND_HTOH = 9,
/**
* A peer to peer memory copy across different devices.
*/
CUPTI_ACTIVITY_MEMCPY_KIND_PTOP = 10,
CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT = 0x7fffffff
} CUpti_ActivityMemcpyKind;
/**
* \brief The kinds of memory accessed by a memory operation/copy.
*
* Each kind represents the type of the memory
* accessed by a memory operation/copy.
*/
typedef enum {
/**
* The memory kind is unknown.
*/
CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN = 0,
/**
* The memory is pageable.
*/
CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE = 1,
/**
* The memory is pinned.
*/
CUPTI_ACTIVITY_MEMORY_KIND_PINNED = 2,
/**
* The memory is on the device.
*/
CUPTI_ACTIVITY_MEMORY_KIND_DEVICE = 3,
/**
* The memory is an array.
*/
CUPTI_ACTIVITY_MEMORY_KIND_ARRAY = 4,
/**
* The memory is managed
*/
CUPTI_ACTIVITY_MEMORY_KIND_MANAGED = 5,
/**
* The memory is device static
*/
CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC = 6,
/**
* The memory is managed static
*/
CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC = 7,
CUPTI_ACTIVITY_MEMORY_KIND_FORCE_INT = 0x7fffffff
} CUpti_ActivityMemoryKind;
/**
* \brief The kind of a preemption activity.
*/
typedef enum {
/**
* The preemption kind is not known.
*/
CUPTI_ACTIVITY_PREEMPTION_KIND_UNKNOWN = 0,
/**
* Preemption to save CDP block.
*/
CUPTI_ACTIVITY_PREEMPTION_KIND_SAVE = 1,
/**
* Preemption to restore CDP block.
*/
CUPTI_ACTIVITY_PREEMPTION_KIND_RESTORE = 2,
CUPTI_ACTIVITY_PREEMPTION_KIND_FORCE_INT = 0x7fffffff
} CUpti_ActivityPreemptionKind;
/**
* \brief The kind of environment data. Used to indicate what type of
* data is being reported by an environment activity record.
*/
typedef enum {
/**
* Unknown data.
*/
CUPTI_ACTIVITY_ENVIRONMENT_UNKNOWN = 0,
/**
* The environment data is related to speed.
*/
CUPTI_ACTIVITY_ENVIRONMENT_SPEED = 1,
/**
* The environment data is related to temperature.
*/
CUPTI_ACTIVITY_ENVIRONMENT_TEMPERATURE = 2,
/**
* The environment data is related to power.
*/
CUPTI_ACTIVITY_ENVIRONMENT_POWER = 3,
/**
* The environment data is related to cooling.
*/
CUPTI_ACTIVITY_ENVIRONMENT_COOLING = 4,
CUPTI_ACTIVITY_ENVIRONMENT_COUNT,
CUPTI_ACTIVITY_ENVIRONMENT_KIND_FORCE_INT = 0x7fffffff
} CUpti_ActivityEnvironmentKind;
/**
* \brief Reasons for clock throttling.
*
* The possible reasons that a clock can be throttled. There can be
* more than one reason that a clock is being throttled so these types
* can be combined by bitwise OR. These are used in the
* clocksThrottleReason field in the Environment Activity Record.
*/
typedef enum {
/**
* Nothing is running on the GPU and the clocks are dropping to idle
* state.
*/
CUPTI_CLOCKS_THROTTLE_REASON_GPU_IDLE = 0x00000001,
/**
* The GPU clocks are limited by a user specified limit.
*/
CUPTI_CLOCKS_THROTTLE_REASON_USER_DEFINED_CLOCKS = 0x00000002,
/**
* A software power scaling algorithm is reducing the clocks below
* requested clocks.
*/
CUPTI_CLOCKS_THROTTLE_REASON_SW_POWER_CAP = 0x00000004,
/**
* Hardware slowdown to reduce the clock by a factor of two or more
* is engaged. This is an indicator of one of the following: 1)
* Temperature is too high, 2) External power brake assertion is
* being triggered (e.g. by the system power supply), 3) Change in
* power state.
*/
CUPTI_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN = 0x00000008,
/**
* Some unspecified factor is reducing the clocks.
*/
CUPTI_CLOCKS_THROTTLE_REASON_UNKNOWN = 0x80000000,
/**
* Throttle reason is not supported for this GPU.
*/
CUPTI_CLOCKS_THROTTLE_REASON_UNSUPPORTED = 0x40000000,
/**
* No clock throttling.
*/
CUPTI_CLOCKS_THROTTLE_REASON_NONE = 0x00000000,
CUPTI_CLOCKS_THROTTLE_REASON_FORCE_INT = 0x7fffffff
} CUpti_EnvironmentClocksThrottleReason;
/**
* \brief Scope of the unified memory counter (deprecated in CUDA 7.0)
*/
typedef enum {
/**
* The unified memory counter scope is not known.
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_UNKNOWN = 0,
/**
* Collect unified memory counter for single process on one device
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE = 1,
/**
* Collect unified memory counter for single process across all devices
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_ALL_DEVICES = 2,
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_COUNT,
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_FORCE_INT = 0x7fffffff
} CUpti_ActivityUnifiedMemoryCounterScope;
/**
* \brief Kind of the Unified Memory counter
*
* Many activities are associated with Unified Memory mechanism; among them
* are tranfer from host to device, device to host, page fault at
* host side.
*/
typedef enum {
/**
* The unified memory counter kind is not known.
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_UNKNOWN = 0,
/**
* Number of bytes transfered from host to device
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD = 1,
/**
* Number of bytes transfered from device to host
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH = 2,
/**
* Number of CPU page faults, this is only supported on 64 bit
* Linux and Mac platforms
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT = 3,
/**
* Number of GPU page faults, this is only supported on devices with
* compute capability 6.0 and higher and 64 bit Linux platforms
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT = 4,
/**
* Thrashing occurs when data is frequently accessed by
* multiple processors and has to be constantly migrated around
* to achieve data locality. In this case the overhead of migration
* may exceed the benefits of locality.
* This is only supported on 64 bit Linux platforms.
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING = 5,
/**
* Throttling is a prevention technique used by the driver to avoid
* further thrashing. Here, the driver doesn't service the fault for
* one of the contending processors for a specific period of time,
* so that the other processor can run at full-speed.
* This is only supported on 64 bit Linux platforms.
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING = 6,
/**
* In case throttling does not help, the driver tries to pin the memory
* to a processor for a specific period of time. One of the contending
* processors will have slow access to the memory, while the other will
* have fast access.
* This is only supported on 64 bit Linux platforms.
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP = 7,
/**
* Number of bytes transferred from one device to another device.
* This is only supported on 64 bit Linux platforms.
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD = 8,
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_COUNT,
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_FORCE_INT = 0x7fffffff
} CUpti_ActivityUnifiedMemoryCounterKind;
/**
* \brief Memory access type for unified memory page faults
*
* This is valid for \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT
* and \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT
*/
typedef enum {
/**
* The unified memory access type is not known
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_UNKNOWN = 0,
/**
* The page fault was triggered by read memory instruction
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_READ = 1,
/**
* The page fault was triggered by write memory instruction
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_WRITE = 2,
/**
* The page fault was triggered by atomic memory instruction
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_ATOMIC = 3,
/**
* The page fault was triggered by memory prefetch operation
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_PREFETCH = 4
} CUpti_ActivityUnifiedMemoryAccessType;
/**
* \brief Migration cause of the Unified Memory counter
*
* This is valid for \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
* \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH
*/
typedef enum {
/**
* The unified memory migration cause is not known
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_UNKNOWN = 0,
/**
* The unified memory migrated due to an explicit call from
* the user e.g. cudaMemPrefetchAsync
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_USER = 1,
/**
* The unified memory migrated to guarantee data coherence
* e.g. CPU/GPU faults on Pascal+ and kernel launch on pre-Pascal GPUs
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_COHERENCE = 2,
/**
* The unified memory was speculatively migrated by the UVM driver
* before being accessed by the destination processor to improve
* performance
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_PREFETCH = 3,
/**
* The unified memory migrated to the CPU because it was evicted to make
* room for another block of memory on the GPU
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_EVICTION = 4,
/**
* The unified memory migrated to another processor because of access counter
* notifications. Only frequently accessed pages are migrated between CPU and GPU, or
* between peer GPUs.
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_ACCESS_COUNTERS = 5,
} CUpti_ActivityUnifiedMemoryMigrationCause;
/**
* \brief Remote memory map cause of the Unified Memory counter
*
* This is valid for \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP
*/
typedef enum {
/**
* The cause of mapping to remote memory was unknown
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_UNKNOWN = 0,
/**
* Mapping to remote memory was added to maintain data coherence.
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_COHERENCE = 1,
/**
* Mapping to remote memory was added to prevent further thrashing
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_THRASHING = 2,
/**
* Mapping to remote memory was added to enforce the hints
* specified by the programmer or by performance heuristics of the
* UVM driver
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_POLICY = 3,
/**
* Mapping to remote memory was added because there is no more
* memory available on the processor and eviction was not
* possible
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_OUT_OF_MEMORY = 4,
/**
* Mapping to remote memory was added after the memory was
* evicted to make room for another block of memory on the GPU
*/
CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_EVICTION = 5,
} CUpti_ActivityUnifiedMemoryRemoteMapCause;
/**
* \brief SASS instruction classification.
*
* The sass instruction are broadly divided into different class. Each enum represents a classification.
*/
typedef enum {
/**
* The instruction class is not known.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_UNKNOWN = 0,
/**
* Represents a 32 bit floating point operation.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_FP_32 = 1,
/**
* Represents a 64 bit floating point operation.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_FP_64 = 2,
/**
* Represents an integer operation.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_INTEGER = 3,
/**
* Represents a bit conversion operation.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_BIT_CONVERSION = 4,
/**
* Represents a control flow instruction.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_CONTROL_FLOW = 5,
/**
* Represents a global load-store instruction.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_GLOBAL = 6,
/**
* Represents a shared load-store instruction.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_SHARED = 7,
/**
* Represents a local load-store instruction.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_LOCAL = 8,
/**
* Represents a generic load-store instruction.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_GENERIC = 9,
/**
* Represents a surface load-store instruction.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_SURFACE = 10,
/**
* Represents a constant load instruction.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_CONSTANT = 11,
/**
* Represents a texture load-store instruction.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_TEXTURE = 12,
/**
* Represents a global atomic instruction.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_GLOBAL_ATOMIC = 13,
/**
* Represents a shared atomic instruction.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_SHARED_ATOMIC = 14,
/**
* Represents a surface atomic instruction.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_SURFACE_ATOMIC = 15,
/**
* Represents a inter-thread communication instruction.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_INTER_THREAD_COMMUNICATION = 16,
/**
* Represents a barrier instruction.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_BARRIER = 17,
/**
* Represents some miscellaneous instructions which do not fit in the above classification.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_MISCELLANEOUS = 18,
/**
* Represents a 16 bit floating point operation.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_FP_16 = 19,
/**
* Represents uniform instruction.
*/
CUPTI_ACTIVITY_INSTRUCTION_CLASS_UNIFORM = 20,
CUPTI_ACTIVITY_INSTRUCTION_CLASS_KIND_FORCE_INT = 0x7fffffff
} CUpti_ActivityInstructionClass;
/**
* \brief Partitioned global caching option
*/
typedef enum {
/**
* Partitioned global cache config unknown.
*/
CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_UNKNOWN = 0,
/**
* Partitioned global cache not supported.
*/
CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_NOT_SUPPORTED = 1,
/**
* Partitioned global cache config off.
*/
CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_OFF = 2,
/**
* Partitioned global cache config on.
*/
CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_ON = 3,
CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_FORCE_INT = 0x7fffffff
} CUpti_ActivityPartitionedGlobalCacheConfig;
/**
* \brief Synchronization type.
*
* The types of synchronization to be used with CUpti_ActivitySynchronization.
*/
typedef enum {
/**
* Unknown data.
*/
CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_UNKNOWN = 0,
/**
* Event synchronize API.
*/
CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_EVENT_SYNCHRONIZE = 1,
/**
* Stream wait event API.
*/
CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT = 2,
/**
* Stream synchronize API.
*/
CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_SYNCHRONIZE = 3,
/**
* Context synchronize API.
*/
CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_CONTEXT_SYNCHRONIZE = 4,
CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_FORCE_INT = 0x7fffffff
} CUpti_ActivitySynchronizationType;
/**
* \brief stream type.
*
* The types of stream to be used with CUpti_ActivityStream.
*/
typedef enum {
/**
* Unknown data.
*/
CUPTI_ACTIVITY_STREAM_CREATE_FLAG_UNKNOWN = 0,
/**
* Default stream.
*/
CUPTI_ACTIVITY_STREAM_CREATE_FLAG_DEFAULT = 1,
/**
* Non-blocking stream.
*/
CUPTI_ACTIVITY_STREAM_CREATE_FLAG_NON_BLOCKING = 2,
/**
* Null stream.
*/
CUPTI_ACTIVITY_STREAM_CREATE_FLAG_NULL = 3,
/**
* Stream create Mask
*/
CUPTI_ACTIVITY_STREAM_CREATE_MASK = 0xFFFF,
CUPTI_ACTIVITY_STREAM_CREATE_FLAG_FORCE_INT = 0x7fffffff
} CUpti_ActivityStreamFlag;
/**
* \brief Link flags.
*
* Describes link properties, to be used with CUpti_ActivityNvLink.
*/
typedef enum {
/**
* The flag is invalid.
*/
CUPTI_LINK_FLAG_INVALID = 0,
/**
* Is peer to peer access supported by this link.
*/
CUPTI_LINK_FLAG_PEER_ACCESS = (1 << 1),
/**
* Is system memory access supported by this link.
*/
CUPTI_LINK_FLAG_SYSMEM_ACCESS = (1 << 2),
/**
* Is peer atomic access supported by this link.
*/
CUPTI_LINK_FLAG_PEER_ATOMICS = (1 << 3),
/**
* Is system memory atomic access supported by this link.
*/
CUPTI_LINK_FLAG_SYSMEM_ATOMICS = (1 << 4),
CUPTI_LINK_FLAG_FORCE_INT = 0x7fffffff
} CUpti_LinkFlag;
/**
* \brief Memory operation types.
*
* Describes the type of memory operation, to be used with CUpti_ActivityMemory3.
*/
typedef enum {
/**
* The operation is invalid.
*/
CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_INVALID = 0,
/**
* Memory is allocated.
*/
CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_ALLOCATION = 1,
/**
* Memory is released.
*/
CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_RELEASE = 2,
CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_FORCE_INT = 0x7fffffff
} CUpti_ActivityMemoryOperationType;
/**
* \brief Memory pool types.
*
* Describes the type of memory pool, to be used with CUpti_ActivityMemory3.
*/
typedef enum {
/**
* The operation is invalid.
*/
CUPTI_ACTIVITY_MEMORY_POOL_TYPE_INVALID = 0,
/**
* Memory pool is local to the process.
*/
CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL = 1,
/**
* Memory pool is imported by the process.
*/
CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED = 2,
CUPTI_ACTIVITY_MEMORY_POOL_TYPE_FORCE_INT = 0x7fffffff
} CUpti_ActivityMemoryPoolType;
/**
* \brief Memory pool operation types.
*
* Describes the type of memory pool operation, to be used with CUpti_ActivityMemoryPool2.
*/
typedef enum {
/**
* The operation is invalid.
*/
CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_INVALID = 0,
/**
* Memory pool is created.
*/
CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_CREATED = 1,
/**
* Memory pool is destroyed.
*/
CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_DESTROYED = 2,
/**
* Memory pool is trimmed.
*/
CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED = 3,
CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_FORCE_INT = 0x7fffffff
} CUpti_ActivityMemoryPoolOperationType;
typedef enum {
CUPTI_CHANNEL_TYPE_INVALID = 0,
CUPTI_CHANNEL_TYPE_COMPUTE = 1,
CUPTI_CHANNEL_TYPE_ASYNC_MEMCPY = 2
} CUpti_ChannelType;
/**
* The source-locator ID that indicates an unknown source
* location. There is not an actual CUpti_ActivitySourceLocator object
* corresponding to this value.
*/
#define CUPTI_SOURCE_LOCATOR_ID_UNKNOWN 0
/**
* An invalid function index ID.
*/
#define CUPTI_FUNCTION_INDEX_ID_INVALID 0
/**
* An invalid/unknown correlation ID. A correlation ID of this value
* indicates that there is no correlation for the activity record.
*/
#define CUPTI_CORRELATION_ID_UNKNOWN 0
/**
* An invalid/unknown grid ID.
*/
#define CUPTI_GRID_ID_UNKNOWN 0LL
/**
* An invalid/unknown timestamp for a start, end, queued, submitted,
* or completed time.
*/
#define CUPTI_TIMESTAMP_UNKNOWN 0LL
/**
* An invalid/unknown value.
*/
#define CUPTI_SYNCHRONIZATION_INVALID_VALUE -1
/**
* An invalid/unknown process id.
*/
#define CUPTI_AUTO_BOOST_INVALID_CLIENT_PID 0
/**
* Invalid/unknown NVLink port number.
*/
#define CUPTI_NVLINK_INVALID_PORT -1
/**
* Maximum NVLink port numbers.
*/
#define CUPTI_MAX_NVLINK_PORTS 32
START_PACKED_ALIGNMENT
/**
* \brief Unified Memory counters configuration structure
*
* This structure controls the enable/disable of the various
* Unified Memory counters consisting of scope, kind and other parameters.
* See function \ref cuptiActivityConfigureUnifiedMemoryCounter
*/
typedef struct PACKED_ALIGNMENT {
/**
* Unified Memory counter Counter scope. (deprecated in CUDA 7.0)
*/
CUpti_ActivityUnifiedMemoryCounterScope scope;
/**
* Unified Memory counter Counter kind
*/
CUpti_ActivityUnifiedMemoryCounterKind kind;
/**
* Device id of the traget device. This is relevant only
* for single device scopes. (deprecated in CUDA 7.0)
*/
uint32_t deviceId;
/**
* Control to enable/disable the counter. To enable the counter
* set it to non-zero value while disable is indicated by zero.
*/
uint32_t enable;
} CUpti_ActivityUnifiedMemoryCounterConfig;
/**
* \brief Device auto boost state structure
*
* This structure defines auto boost state for a device.
* See function \ref cuptiGetAutoBoostState
*/
typedef struct PACKED_ALIGNMENT {
/**
* Returned auto boost state. 1 is returned in case auto boost is enabled, 0
* otherwise
*/
uint32_t enabled;
/**
* Id of process that has set the current boost state. The value will be
* CUPTI_AUTO_BOOST_INVALID_CLIENT_PID if the user does not have the
* permission to query process ids or there is an error in querying the
* process id.
*/
uint32_t pid;
} CUpti_ActivityAutoBoostState;
/**
* \brief PC sampling configuration structure
*
* This structure defines the pc sampling configuration.
*
* See function \ref cuptiActivityConfigurePCSampling
*/
typedef struct PACKED_ALIGNMENT {
/**
* Size of configuration structure.
* CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
* available in the structure. Used to preserve backward compatibility.
*/
uint32_t size;
/**
* There are 5 level provided for sampling period. The level
* internally maps to a period in terms of cycles. Same level can
* map to different number of cycles on different gpus. No of
* cycles will be chosen to minimize information loss. The period
* chosen will be given by samplingPeriodInCycles in
* \ref CUpti_ActivityPCSamplingRecordInfo for each kernel instance.
*/
CUpti_ActivityPCSamplingPeriod samplingPeriod;
/**
* This will override the period set by samplingPeriod. Value 0 in samplingPeriod2 will be
* considered as samplingPeriod2 should not be used and samplingPeriod should be used.
* Valid values for samplingPeriod2 are between 5 to 31 both inclusive.
* This will set the sampling period to (2^samplingPeriod2) cycles.
*/
uint32_t samplingPeriod2;
} CUpti_ActivityPCSamplingConfig;
/**
* \brief The base activity record.
*
* The activity API uses a CUpti_Activity as a generic representation
* for any activity. The 'kind' field is used to determine the
* specific activity kind, and from that the CUpti_Activity object can
* be cast to the specific activity record type appropriate for that kind.
*
* Note that all activity record types are padded and aligned to
* ensure that each member of the record is naturally aligned.
*
* \see CUpti_ActivityKind
*/
typedef struct PACKED_ALIGNMENT {
/**
* The kind of this activity.
*/
CUpti_ActivityKind kind;
} CUpti_Activity;
/**
* \brief The activity record for memory copies. (deprecated)
*
* This activity record represents a memory copy
* (CUPTI_ACTIVITY_KIND_MEMCPY).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
*/
CUpti_ActivityKind kind;
/**
* The kind of the memory copy, stored as a byte to reduce record
* size. \see CUpti_ActivityMemcpyKind
*/
uint8_t copyKind;
/**
* The source memory kind read by the memory copy, stored as a byte
* to reduce record size. \see CUpti_ActivityMemoryKind
*/
uint8_t srcKind;
/**
* The destination memory kind read by the memory copy, stored as a
* byte to reduce record size. \see CUpti_ActivityMemoryKind
*/
uint8_t dstKind;
/**
* The flags associated with the memory copy. \see CUpti_ActivityFlag
*/
uint8_t flags;
/**
* The number of bytes transferred by the memory copy.
*/
uint64_t bytes;
/**
* The start timestamp for the memory copy, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory copy.
*/
uint64_t start;
/**
* The end timestamp for the memory copy, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory copy.
*/
uint64_t end;
/**
* The ID of the device where the memory copy is occurring.
*/
uint32_t deviceId;
/**
* The ID of the context where the memory copy is occurring.
*/
uint32_t contextId;
/**
* The ID of the stream where the memory copy is occurring.
*/
uint32_t streamId;
/**
* The correlation ID of the memory copy. Each memory copy is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver API activity record that launched
* the memory copy.
*/
uint32_t correlationId;
/**
* The runtime correlation ID of the memory copy. Each memory copy
* is assigned a unique runtime correlation ID that is identical to
* the correlation ID in the runtime API activity record that
* launched the memory copy.
*/
uint32_t runtimeCorrelationId;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
} CUpti_ActivityMemcpy;
/**
* \brief The activity record for memory copies. (deprecated in CUDA 11.1)
*
* This activity record represents a memory copy
* (CUPTI_ACTIVITY_KIND_MEMCPY).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
*/
CUpti_ActivityKind kind;
/**
* The kind of the memory copy, stored as a byte to reduce record
* size. \see CUpti_ActivityMemcpyKind
*/
uint8_t copyKind;
/**
* The source memory kind read by the memory copy, stored as a byte
* to reduce record size. \see CUpti_ActivityMemoryKind
*/
uint8_t srcKind;
/**
* The destination memory kind read by the memory copy, stored as a
* byte to reduce record size. \see CUpti_ActivityMemoryKind
*/
uint8_t dstKind;
/**
* The flags associated with the memory copy. \see CUpti_ActivityFlag
*/
uint8_t flags;
/**
* The number of bytes transferred by the memory copy.
*/
uint64_t bytes;
/**
* The start timestamp for the memory copy, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory copy.
*/
uint64_t start;
/**
* The end timestamp for the memory copy, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory copy.
*/
uint64_t end;
/**
* The ID of the device where the memory copy is occurring.
*/
uint32_t deviceId;
/**
* The ID of the context where the memory copy is occurring.
*/
uint32_t contextId;
/**
* The ID of the stream where the memory copy is occurring.
*/
uint32_t streamId;
/**
* The correlation ID of the memory copy. Each memory copy is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver API activity record that launched
* the memory copy.
*/
uint32_t correlationId;
/**
* The runtime correlation ID of the memory copy. Each memory copy
* is assigned a unique runtime correlation ID that is identical to
* the correlation ID in the runtime API activity record that
* launched the memory copy.
*/
uint32_t runtimeCorrelationId;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
/**
* The unique ID of the graph node that executed this memcpy through graph launch.
* This field will be 0 if the memcpy is not done through graph launch.
*/
uint64_t graphNodeId;
} CUpti_ActivityMemcpy3;
/**
* \brief The activity record for memory copies. (deprecated in CUDA 11.6)
*
* This activity record represents a memory copy
* (CUPTI_ACTIVITY_KIND_MEMCPY).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
*/
CUpti_ActivityKind kind;
/**
* The kind of the memory copy, stored as a byte to reduce record
* size. \see CUpti_ActivityMemcpyKind
*/
uint8_t copyKind;
/**
* The source memory kind read by the memory copy, stored as a byte
* to reduce record size. \see CUpti_ActivityMemoryKind
*/
uint8_t srcKind;
/**
* The destination memory kind read by the memory copy, stored as a
* byte to reduce record size. \see CUpti_ActivityMemoryKind
*/
uint8_t dstKind;
/**
* The flags associated with the memory copy. \see CUpti_ActivityFlag
*/
uint8_t flags;
/**
* The number of bytes transferred by the memory copy.
*/
uint64_t bytes;
/**
* The start timestamp for the memory copy, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory copy.
*/
uint64_t start;
/**
* The end timestamp for the memory copy, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory copy.
*/
uint64_t end;
/**
* The ID of the device where the memory copy is occurring.
*/
uint32_t deviceId;
/**
* The ID of the context where the memory copy is occurring.
*/
uint32_t contextId;
/**
* The ID of the stream where the memory copy is occurring.
*/
uint32_t streamId;
/**
* The correlation ID of the memory copy. Each memory copy is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver API activity record that launched
* the memory copy.
*/
uint32_t correlationId;
/**
* The runtime correlation ID of the memory copy. Each memory copy
* is assigned a unique runtime correlation ID that is identical to
* the correlation ID in the runtime API activity record that
* launched the memory copy.
*/
uint32_t runtimeCorrelationId;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
/**
* The unique ID of the graph node that executed this memcpy through graph launch.
* This field will be 0 if the memcpy is not done through graph launch.
*/
uint64_t graphNodeId;
/**
* The unique ID of the graph that executed this memcpy through graph launch.
* This field will be 0 if the memcpy is not done through graph launch.
*/
uint32_t graphId;
/**
* Undefined. Reserved for internal use.
*/
uint32_t padding;
} CUpti_ActivityMemcpy4;
/**
* \brief The activity record for memory copies.
*
* This activity record represents a memory copy
* (CUPTI_ACTIVITY_KIND_MEMCPY).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
*/
CUpti_ActivityKind kind;
/**
* The kind of the memory copy, stored as a byte to reduce record
* size. \see CUpti_ActivityMemcpyKind
*/
uint8_t copyKind;
/**
* The source memory kind read by the memory copy, stored as a byte
* to reduce record size. \see CUpti_ActivityMemoryKind
*/
uint8_t srcKind;
/**
* The destination memory kind read by the memory copy, stored as a
* byte to reduce record size. \see CUpti_ActivityMemoryKind
*/
uint8_t dstKind;
/**
* The flags associated with the memory copy. \see CUpti_ActivityFlag
*/
uint8_t flags;
/**
* The number of bytes transferred by the memory copy.
*/
uint64_t bytes;
/**
* The start timestamp for the memory copy, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory copy.
*/
uint64_t start;
/**
* The end timestamp for the memory copy, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory copy.
*/
uint64_t end;
/**
* The ID of the device where the memory copy is occurring.
*/
uint32_t deviceId;
/**
* The ID of the context where the memory copy is occurring.
*/
uint32_t contextId;
/**
* The ID of the stream where the memory copy is occurring.
*/
uint32_t streamId;
/**
* The correlation ID of the memory copy. Each memory copy is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver API activity record that launched
* the memory copy.
*/
uint32_t correlationId;
/**
* The runtime correlation ID of the memory copy. Each memory copy
* is assigned a unique runtime correlation ID that is identical to
* the correlation ID in the runtime API activity record that
* launched the memory copy.
*/
uint32_t runtimeCorrelationId;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
/**
* The unique ID of the graph node that executed this memcpy through graph launch.
* This field will be 0 if the memcpy is not done through graph launch.
*/
uint64_t graphNodeId;
/**
* The unique ID of the graph that executed this memcpy through graph launch.
* This field will be 0 if the memcpy is not done through graph launch.
*/
uint32_t graphId;
/**
* The ID of the HW channel on which the memory copy is occuring.
*/
uint32_t channelID;
/**
* The type of the channel
*/
CUpti_ChannelType channelType;
/**
* Reserved for internal use.
*/
uint32_t pad2;
} CUpti_ActivityMemcpy5;
/**
* \brief The activity record for peer-to-peer memory copies.
*
* This activity record represents a peer-to-peer memory copy
* (CUPTI_ACTIVITY_KIND_MEMCPY2) but is no longer generated
* by CUPTI. Peer-to-peer memory copy activities are now reported using the
* CUpti_ActivityMemcpyPtoP2 activity record..
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
*/
CUpti_ActivityKind kind;
/**
* The kind of the memory copy, stored as a byte to reduce record
* size. \see CUpti_ActivityMemcpyKind
*/
uint8_t copyKind;
/**
* The source memory kind read by the memory copy, stored as a byte
* to reduce record size. \see CUpti_ActivityMemoryKind
*/
uint8_t srcKind;
/**
* The destination memory kind read by the memory copy, stored as a
* byte to reduce record size. \see CUpti_ActivityMemoryKind
*/
uint8_t dstKind;
/**
* The flags associated with the memory copy. \see
* CUpti_ActivityFlag
*/
uint8_t flags;
/**
* The number of bytes transferred by the memory copy.
*/
uint64_t bytes;
/**
* The start timestamp for the memory copy, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory copy.
*/
uint64_t start;
/**
* The end timestamp for the memory copy, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory copy.
*/
uint64_t end;
/**
* The ID of the device where the memory copy is occurring.
*/
uint32_t deviceId;
/**
* The ID of the context where the memory copy is occurring.
*/
uint32_t contextId;
/**
* The ID of the stream where the memory copy is occurring.
*/
uint32_t streamId;
/**
* The ID of the device where memory is being copied from.
*/
uint32_t srcDeviceId;
/**
* The ID of the context owning the memory being copied from.
*/
uint32_t srcContextId;
/**
* The ID of the device where memory is being copied to.
*/
uint32_t dstDeviceId;
/**
* The ID of the context owning the memory being copied to.
*/
uint32_t dstContextId;
/**
* The correlation ID of the memory copy. Each memory copy is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver and runtime API activity record that
* launched the memory copy.
*/
uint32_t correlationId;
#ifndef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
} CUpti_ActivityMemcpyPtoP;
typedef CUpti_ActivityMemcpyPtoP CUpti_ActivityMemcpy2;
/**
* \brief The activity record for peer-to-peer memory copies.
* (deprecated in CUDA 11.1)
*
* This activity record represents a peer-to-peer memory copy
* (CUPTI_ACTIVITY_KIND_MEMCPY2).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
*/
CUpti_ActivityKind kind;
/**
* The kind of the memory copy, stored as a byte to reduce record
* size. \see CUpti_ActivityMemcpyKind
*/
uint8_t copyKind;
/**
* The source memory kind read by the memory copy, stored as a byte
* to reduce record size. \see CUpti_ActivityMemoryKind
*/
uint8_t srcKind;
/**
* The destination memory kind read by the memory copy, stored as a
* byte to reduce record size. \see CUpti_ActivityMemoryKind
*/
uint8_t dstKind;
/**
* The flags associated with the memory copy. \see
* CUpti_ActivityFlag
*/
uint8_t flags;
/**
* The number of bytes transferred by the memory copy.
*/
uint64_t bytes;
/**
* The start timestamp for the memory copy, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory copy.
*/
uint64_t start;
/**
* The end timestamp for the memory copy, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory copy.
*/
uint64_t end;
/**
* The ID of the device where the memory copy is occurring.
*/
uint32_t deviceId;
/**
* The ID of the context where the memory copy is occurring.
*/
uint32_t contextId;
/**
* The ID of the stream where the memory copy is occurring.
*/
uint32_t streamId;
/**
* The ID of the device where memory is being copied from.
*/
uint32_t srcDeviceId;
/**
* The ID of the context owning the memory being copied from.
*/
uint32_t srcContextId;
/**
* The ID of the device where memory is being copied to.
*/
uint32_t dstDeviceId;
/**
* The ID of the context owning the memory being copied to.
*/
uint32_t dstContextId;
/**
* The correlation ID of the memory copy. Each memory copy is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver and runtime API activity record that
* launched the memory copy.
*/
uint32_t correlationId;
#ifndef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
/**
* The unique ID of the graph node that executed the memcpy through graph launch.
* This field will be 0 if memcpy is not done using graph launch.
*/
uint64_t graphNodeId;
} CUpti_ActivityMemcpyPtoP2;
/**
* \brief The activity record for peer-to-peer memory copies.
* (deprecated in CUDA 11.6)
*
* This activity record represents a peer-to-peer memory copy
* (CUPTI_ACTIVITY_KIND_MEMCPY2).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
*/
CUpti_ActivityKind kind;
/**
* The kind of the memory copy, stored as a byte to reduce record
* size. \see CUpti_ActivityMemcpyKind
*/
uint8_t copyKind;
/**
* The source memory kind read by the memory copy, stored as a byte
* to reduce record size. \see CUpti_ActivityMemoryKind
*/
uint8_t srcKind;
/**
* The destination memory kind read by the memory copy, stored as a
* byte to reduce record size. \see CUpti_ActivityMemoryKind
*/
uint8_t dstKind;
/**
* The flags associated with the memory copy. \see
* CUpti_ActivityFlag
*/
uint8_t flags;
/**
* The number of bytes transferred by the memory copy.
*/
uint64_t bytes;
/**
* The start timestamp for the memory copy, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory copy.
*/
uint64_t start;
/**
* The end timestamp for the memory copy, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory copy.
*/
uint64_t end;
/**
* The ID of the device where the memory copy is occurring.
*/
uint32_t deviceId;
/**
* The ID of the context where the memory copy is occurring.
*/
uint32_t contextId;
/**
* The ID of the stream where the memory copy is occurring.
*/
uint32_t streamId;
/**
* The ID of the device where memory is being copied from.
*/
uint32_t srcDeviceId;
/**
* The ID of the context owning the memory being copied from.
*/
uint32_t srcContextId;
/**
* The ID of the device where memory is being copied to.
*/
uint32_t dstDeviceId;
/**
* The ID of the context owning the memory being copied to.
*/
uint32_t dstContextId;
/**
* The correlation ID of the memory copy. Each memory copy is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver and runtime API activity record that
* launched the memory copy.
*/
uint32_t correlationId;
#ifndef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
/**
* The unique ID of the graph node that executed the memcpy through graph launch.
* This field will be 0 if memcpy is not done using graph launch.
*/
uint64_t graphNodeId;
/**
* The unique ID of the graph that executed this memcpy through graph launch.
* This field will be 0 if the memcpy is not done through graph launch.
*/
uint32_t graphId;
/**
* Undefined. Reserved for internal use.
*/
uint32_t padding;
} CUpti_ActivityMemcpyPtoP3;
/**
* \brief The activity record for peer-to-peer memory copies.
*
* This activity record represents a peer-to-peer memory copy
* (CUPTI_ACTIVITY_KIND_MEMCPY2).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
*/
CUpti_ActivityKind kind;
/**
* The kind of the memory copy, stored as a byte to reduce record
* size. \see CUpti_ActivityMemcpyKind
*/
uint8_t copyKind;
/**
* The source memory kind read by the memory copy, stored as a byte
* to reduce record size. \see CUpti_ActivityMemoryKind
*/
uint8_t srcKind;
/**
* The destination memory kind read by the memory copy, stored as a
* byte to reduce record size. \see CUpti_ActivityMemoryKind
*/
uint8_t dstKind;
/**
* The flags associated with the memory copy. \see
* CUpti_ActivityFlag
*/
uint8_t flags;
/**
* The number of bytes transferred by the memory copy.
*/
uint64_t bytes;
/**
* The start timestamp for the memory copy, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory copy.
*/
uint64_t start;
/**
* The end timestamp for the memory copy, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory copy.
*/
uint64_t end;
/**
* The ID of the device where the memory copy is occurring.
*/
uint32_t deviceId;
/**
* The ID of the context where the memory copy is occurring.
*/
uint32_t contextId;
/**
* The ID of the stream where the memory copy is occurring.
*/
uint32_t streamId;
/**
* The ID of the device where memory is being copied from.
*/
uint32_t srcDeviceId;
/**
* The ID of the context owning the memory being copied from.
*/
uint32_t srcContextId;
/**
* The ID of the device where memory is being copied to.
*/
uint32_t dstDeviceId;
/**
* The ID of the context owning the memory being copied to.
*/
uint32_t dstContextId;
/**
* The correlation ID of the memory copy. Each memory copy is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver and runtime API activity record that
* launched the memory copy.
*/
uint32_t correlationId;
#ifndef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
/**
* The unique ID of the graph node that executed the memcpy through graph launch.
* This field will be 0 if memcpy is not done using graph launch.
*/
uint64_t graphNodeId;
/**
* The unique ID of the graph that executed this memcpy through graph launch.
* This field will be 0 if the memcpy is not done through graph launch.
*/
uint32_t graphId;
/**
* The ID of the HW channel on which the memory copy is occuring.
*/
uint32_t channelID;
/**
* The type of the channel
*/
CUpti_ChannelType channelType;
} CUpti_ActivityMemcpyPtoP4;
/**
* \brief The activity record for memset. (deprecated)
*
* This activity record represents a memory set operation
* (CUPTI_ACTIVITY_KIND_MEMSET).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
*/
CUpti_ActivityKind kind;
/**
* The value being assigned to memory by the memory set.
*/
uint32_t value;
/**
* The number of bytes being set by the memory set.
*/
uint64_t bytes;
/**
* The start timestamp for the memory set, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory set.
*/
uint64_t start;
/**
* The end timestamp for the memory set, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory set.
*/
uint64_t end;
/**
* The ID of the device where the memory set is occurring.
*/
uint32_t deviceId;
/**
* The ID of the context where the memory set is occurring.
*/
uint32_t contextId;
/**
* The ID of the stream where the memory set is occurring.
*/
uint32_t streamId;
/**
* The correlation ID of the memory set. Each memory set is assigned
* a unique correlation ID that is identical to the correlation ID
* in the driver API activity record that launched the memory set.
*/
uint32_t correlationId;
/**
* The flags associated with the memset. \see CUpti_ActivityFlag
*/
uint16_t flags;
/**
* The memory kind of the memory set \see CUpti_ActivityMemoryKind
*/
uint16_t memoryKind;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
} CUpti_ActivityMemset;
/**
* \brief The activity record for memset. (deprecated in CUDA 11.1)
*
* This activity record represents a memory set operation
* (CUPTI_ACTIVITY_KIND_MEMSET).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
*/
CUpti_ActivityKind kind;
/**
* The value being assigned to memory by the memory set.
*/
uint32_t value;
/**
* The number of bytes being set by the memory set.
*/
uint64_t bytes;
/**
* The start timestamp for the memory set, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory set.
*/
uint64_t start;
/**
* The end timestamp for the memory set, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory set.
*/
uint64_t end;
/**
* The ID of the device where the memory set is occurring.
*/
uint32_t deviceId;
/**
* The ID of the context where the memory set is occurring.
*/
uint32_t contextId;
/**
* The ID of the stream where the memory set is occurring.
*/
uint32_t streamId;
/**
* The correlation ID of the memory set. Each memory set is assigned
* a unique correlation ID that is identical to the correlation ID
* in the driver API activity record that launched the memory set.
*/
uint32_t correlationId;
/**
* The flags associated with the memset. \see CUpti_ActivityFlag
*/
uint16_t flags;
/**
* The memory kind of the memory set \see CUpti_ActivityMemoryKind
*/
uint16_t memoryKind;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
/**
* The unique ID of the graph node that executed this memset through graph launch.
* This field will be 0 if the memset is not executed through graph launch.
*/
uint64_t graphNodeId;
} CUpti_ActivityMemset2;
/**
* \brief The activity record for memset. (deprecated in CUDA 11.6)
*
* This activity record represents a memory set operation
* (CUPTI_ACTIVITY_KIND_MEMSET).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
*/
CUpti_ActivityKind kind;
/**
* The value being assigned to memory by the memory set.
*/
uint32_t value;
/**
* The number of bytes being set by the memory set.
*/
uint64_t bytes;
/**
* The start timestamp for the memory set, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory set.
*/
uint64_t start;
/**
* The end timestamp for the memory set, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory set.
*/
uint64_t end;
/**
* The ID of the device where the memory set is occurring.
*/
uint32_t deviceId;
/**
* The ID of the context where the memory set is occurring.
*/
uint32_t contextId;
/**
* The ID of the stream where the memory set is occurring.
*/
uint32_t streamId;
/**
* The correlation ID of the memory set. Each memory set is assigned
* a unique correlation ID that is identical to the correlation ID
* in the driver API activity record that launched the memory set.
*/
uint32_t correlationId;
/**
* The flags associated with the memset. \see CUpti_ActivityFlag
*/
uint16_t flags;
/**
* The memory kind of the memory set \see CUpti_ActivityMemoryKind
*/
uint16_t memoryKind;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
/**
* The unique ID of the graph node that executed this memset through graph launch.
* This field will be 0 if the memset is not executed through graph launch.
*/
uint64_t graphNodeId;
/**
* The unique ID of the graph that executed this memset through graph launch.
* This field will be 0 if the memset is not executed through graph launch.
*/
uint32_t graphId;
/**
* Undefined. Reserved for internal use.
*/
uint32_t padding;
} CUpti_ActivityMemset3;
/**
* \brief The activity record for memset.
*
* This activity record represents a memory set operation
* (CUPTI_ACTIVITY_KIND_MEMSET).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
*/
CUpti_ActivityKind kind;
/**
* The value being assigned to memory by the memory set.
*/
uint32_t value;
/**
* The number of bytes being set by the memory set.
*/
uint64_t bytes;
/**
* The start timestamp for the memory set, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory set.
*/
uint64_t start;
/**
* The end timestamp for the memory set, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the memory set.
*/
uint64_t end;
/**
* The ID of the device where the memory set is occurring.
*/
uint32_t deviceId;
/**
* The ID of the context where the memory set is occurring.
*/
uint32_t contextId;
/**
* The ID of the stream where the memory set is occurring.
*/
uint32_t streamId;
/**
* The correlation ID of the memory set. Each memory set is assigned
* a unique correlation ID that is identical to the correlation ID
* in the driver API activity record that launched the memory set.
*/
uint32_t correlationId;
/**
* The flags associated with the memset. \see CUpti_ActivityFlag
*/
uint16_t flags;
/**
* The memory kind of the memory set \see CUpti_ActivityMemoryKind
*/
uint16_t memoryKind;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
/**
* The unique ID of the graph node that executed this memset through graph launch.
* This field will be 0 if the memset is not executed through graph launch.
*/
uint64_t graphNodeId;
/**
* The unique ID of the graph that executed this memset through graph launch.
* This field will be 0 if the memset is not executed through graph launch.
*/
uint32_t graphId;
/**
* The ID of the HW channel on which the memory set is occuring.
*/
uint32_t channelID;
/**
* The type of the channel
*/
CUpti_ChannelType channelType;
/**
* Undefined. Reserved for internal use
*/
uint32_t pad2;
} CUpti_ActivityMemset4;
/**
* \brief The activity record for memory.
*
* This activity record represents a memory allocation and free operation
* (CUPTI_ACTIVITY_KIND_MEMORY).
* This activity record provides a single record for the memory
* allocation and memory release operations.
*
* Note: It is recommended to move to the new activity record \ref CUpti_ActivityMemory3
* enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY2.
* \ref CUpti_ActivityMemory3 provides separate records for memory
* allocation and memory release operations. This allows to correlate the
* corresponding driver and runtime API activity record with the memory operation.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY
*/
CUpti_ActivityKind kind;
/**
* The memory kind requested by the user
*/
CUpti_ActivityMemoryKind memoryKind;
/**
* The virtual address of the allocation
*/
uint64_t address;
/**
* The number of bytes of memory allocated.
*/
uint64_t bytes;
/**
* The start timestamp for the memory operation, i.e.
* the time when memory was allocated, in ns.
*/
uint64_t start;
/**
* The end timestamp for the memory operation, i.e.
* the time when memory was freed, in ns.
* This will be 0 if memory is not freed in the application
*/
uint64_t end;
/**
* The program counter of the allocation of memory
*/
uint64_t allocPC;
/**
* The program counter of the freeing of memory. This will
* be 0 if memory is not freed in the application
*/
uint64_t freePC;
/**
* The ID of the process to which this record belongs to.
*/
uint32_t processId;
/**
* The ID of the device where the memory allocation is taking place.
*/
uint32_t deviceId;
/**
* The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
*/
uint32_t contextId;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* Variable name. This name is shared across all activity
* records representing the same symbol, and so should not be
* modified.
*/
const char* name;
} CUpti_ActivityMemory;
/**
* \brief The activity record for memory.
*
* This activity record represents a memory allocation and free operation
* (CUPTI_ACTIVITY_KIND_MEMORY2).
* This activity record provides separate records for memory allocation and
* memory release operations.
* This allows to correlate the corresponding driver and runtime API
* activity record with the memory operation.
*
* Note: This activity record is an upgrade over \ref CUpti_ActivityMemory
* enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY.
* \ref CUpti_ActivityMemory provides a single record for the memory
* allocation and memory release operations.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY2
*/
CUpti_ActivityKind kind;
/**
* The memory operation requested by the user, \ref CUpti_ActivityMemoryOperationType.
*/
CUpti_ActivityMemoryOperationType memoryOperationType;
/**
* The memory kind requested by the user, \ref CUpti_ActivityMemoryKind.
*/
CUpti_ActivityMemoryKind memoryKind;
/**
* The correlation ID of the memory operation. Each memory operation is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver and runtime API activity record that
* launched the memory operation.
*/
uint32_t correlationId;
/**
* The virtual address of the allocation.
*/
uint64_t address;
/**
* The number of bytes of memory allocated.
*/
uint64_t bytes;
/**
* The start timestamp for the memory operation, in ns.
*/
uint64_t timestamp;
/**
* The program counter of the memory operation.
*/
uint64_t PC;
/**
* The ID of the process to which this record belongs to.
*/
uint32_t processId;
/**
* The ID of the device where the memory operation is taking place.
*/
uint32_t deviceId;
/**
* The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
*/
uint32_t contextId;
/**
* The ID of the stream. If memory operation is not async, \p streamId is set to CUPTI_INVALID_STREAM_ID.
*/
uint32_t streamId;
/**
* Variable name. This name is shared across all activity
* records representing the same symbol, and so should not be
* modified.
*/
const char* name;
/**
* \p isAsync is set if memory operation happens through async memory APIs.
*/
uint32_t isAsync;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad1;
#endif
/**
* The memory pool configuration used for the memory operations.
*/
struct {
/**
* The type of the memory pool, \ref CUpti_ActivityMemoryPoolType
*/
CUpti_ActivityMemoryPoolType memoryPoolType;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad2;
#endif
/**
* The base address of the memory pool.
*/
uint64_t address;
/**
* The release threshold of the memory pool in bytes. \p releaseThreshold is
* valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
*/
uint64_t releaseThreshold;
union {
/**
* The size of the memory pool in bytes.
* \p size is valid if \p memoryPoolType is
* CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
*/
uint64_t size;
/**
* The processId of the memory pool.
* \p processId is valid if \p memoryPoolType is
* CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED, \ref CUpti_ActivityMemoryPoolType.
*/
uint64_t processId;
} pool;
} memoryPoolConfig;
} CUpti_ActivityMemory2;
/**
* \brief The activity record for memory.
*
* This activity record represents a memory allocation and free operation
* (CUPTI_ACTIVITY_KIND_MEMORY2).
* This activity record provides separate records for memory allocation and
* memory release operations.
* This allows to correlate the corresponding driver and runtime API
* activity record with the memory operation.
*
* Note: This activity record is an upgrade over \ref CUpti_ActivityMemory
* enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY.
* \ref CUpti_ActivityMemory provides a single record for the memory
* allocation and memory release operations.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY2
*/
CUpti_ActivityKind kind;
/**
* The memory operation requested by the user, \ref CUpti_ActivityMemoryOperationType.
*/
CUpti_ActivityMemoryOperationType memoryOperationType;
/**
* The memory kind requested by the user, \ref CUpti_ActivityMemoryKind.
*/
CUpti_ActivityMemoryKind memoryKind;
/**
* The correlation ID of the memory operation. Each memory operation is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver and runtime API activity record that
* launched the memory operation.
*/
uint32_t correlationId;
/**
* The virtual address of the allocation.
*/
uint64_t address;
/**
* The number of bytes of memory allocated.
*/
uint64_t bytes;
/**
* The start timestamp for the memory operation, in ns.
*/
uint64_t timestamp;
/**
* The program counter of the memory operation.
*/
uint64_t PC;
/**
* The ID of the process to which this record belongs to.
*/
uint32_t processId;
/**
* The ID of the device where the memory operation is taking place.
*/
uint32_t deviceId;
/**
* The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
*/
uint32_t contextId;
/**
* The ID of the stream. If memory operation is not async, \p streamId is set to CUPTI_INVALID_STREAM_ID.
*/
uint32_t streamId;
/**
* Variable name. This name is shared across all activity
* records representing the same symbol, and so should not be
* modified.
*/
const char* name;
/**
* \p isAsync is set if memory operation happens through async memory APIs.
*/
uint32_t isAsync;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad1;
#endif
/**
* The memory pool configuration used for the memory operations.
*/
struct PACKED_ALIGNMENT {
/**
* The type of the memory pool, \ref CUpti_ActivityMemoryPoolType
*/
CUpti_ActivityMemoryPoolType memoryPoolType;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad2;
#endif
/**
* The base address of the memory pool.
*/
uint64_t address;
/**
* The release threshold of the memory pool in bytes. \p releaseThreshold is
* valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
*/
uint64_t releaseThreshold;
union {
/**
* The size of the memory pool in bytes.
* \p size is valid if \p memoryPoolType is
* CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
*/
uint64_t size;
/**
* The processId of the memory pool.
* \p processId is valid if \p memoryPoolType is
* CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED, \ref CUpti_ActivityMemoryPoolType.
*/
uint64_t processId;
} pool;
/**
* The utilized size of the memory pool. \p utilizedSize is
* valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
*/
uint64_t utilizedSize;
} memoryPoolConfig;
} CUpti_ActivityMemory3;
/**
* \brief The activity record for memory pool.
*
* This activity record represents a memory pool creation, destruction and
* trimming (CUPTI_ACTIVITY_KIND_MEMORY_POOL).
* This activity record provides separate records for memory pool creation,
* destruction and triming operations.
* This allows to correlate the corresponding driver and runtime API
* activity record with the memory pool operation.
*
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY_POOL
*/
CUpti_ActivityKind kind;
/**
* The memory operation requested by the user, \ref CUpti_ActivityMemoryPoolOperationType.
*/
CUpti_ActivityMemoryPoolOperationType memoryPoolOperationType;
/**
* The type of the memory pool, \ref CUpti_ActivityMemoryPoolType
*/
CUpti_ActivityMemoryPoolType memoryPoolType;
/**
* The correlation ID of the memory pool operation. Each memory pool
* operation is assigned a unique correlation ID that is identical to the
* correlation ID in the driver and runtime API activity record that
* launched the memory operation.
*/
uint32_t correlationId;
/**
* The ID of the process to which this record belongs to.
*/
uint32_t processId;
/**
* The ID of the device where the memory pool is created.
*/
uint32_t deviceId;
/**
* The minimum bytes to keep of the memory pool. \p minBytesToKeep is
* valid for CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED,
* \ref CUpti_ActivityMemoryPoolOperationType
*/
size_t minBytesToKeep;
#ifndef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* The virtual address of the allocation.
*/
uint64_t address;
/**
* The size of the memory pool operation in bytes. \p size is
* valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
*/
uint64_t size;
/**
* The release threshold of the memory pool. \p releaseThreshold is
* valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
*/
uint64_t releaseThreshold;
/**
* The start timestamp for the memory operation, in ns.
*/
uint64_t timestamp;
} CUpti_ActivityMemoryPool;
/**
* \brief The activity record for memory pool.
*
* This activity record represents a memory pool creation, destruction and
* trimming (CUPTI_ACTIVITY_KIND_MEMORY_POOL).
* This activity record provides separate records for memory pool creation,
* destruction and triming operations.
* This allows to correlate the corresponding driver and runtime API
* activity record with the memory pool operation.
*
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY_POOL
*/
CUpti_ActivityKind kind;
/**
* The memory operation requested by the user, \ref CUpti_ActivityMemoryPoolOperationType.
*/
CUpti_ActivityMemoryPoolOperationType memoryPoolOperationType;
/**
* The type of the memory pool, \ref CUpti_ActivityMemoryPoolType
*/
CUpti_ActivityMemoryPoolType memoryPoolType;
/**
* The correlation ID of the memory pool operation. Each memory pool
* operation is assigned a unique correlation ID that is identical to the
* correlation ID in the driver and runtime API activity record that
* launched the memory operation.
*/
uint32_t correlationId;
/**
* The ID of the process to which this record belongs to.
*/
uint32_t processId;
/**
* The ID of the device where the memory pool is created.
*/
uint32_t deviceId;
/**
* The minimum bytes to keep of the memory pool. \p minBytesToKeep is
* valid for CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED,
* \ref CUpti_ActivityMemoryPoolOperationType
*/
size_t minBytesToKeep;
#ifndef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* The virtual address of the allocation.
*/
uint64_t address;
/**
* The size of the memory pool operation in bytes. \p size is
* valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
*/
uint64_t size;
/**
* The release threshold of the memory pool. \p releaseThreshold is
* valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
*/
uint64_t releaseThreshold;
/**
* The start timestamp for the memory operation, in ns.
*/
uint64_t timestamp;
/**
* The utilized size of the memory pool. \p utilizedSize is
* valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
*/
uint64_t utilizedSize;
} CUpti_ActivityMemoryPool2;
/**
* \brief The activity record for kernel. (deprecated)
*
* This activity record represents a kernel execution
* (CUPTI_ACTIVITY_KIND_KERNEL and
* CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
* by CUPTI. Kernel activities are now reported using the
* CUpti_ActivityKernel9 activity record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL
* or CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
*/
CUpti_ActivityKind kind;
/**
* The cache configuration requested by the kernel. The value is one
* of the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t cacheConfigRequested;
/**
* The cache configuration used for the kernel. The value is one of
* the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t cacheConfigExecuted;
/**
* The number of registers required for each thread executing the
* kernel.
*/
uint16_t registersPerThread;
/**
* The start timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t start;
/**
* The end timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t end;
/**
* The ID of the device where the kernel is executing.
*/
uint32_t deviceId;
/**
* The ID of the context where the kernel is executing.
*/
uint32_t contextId;
/**
* The ID of the stream where the kernel is executing.
*/
uint32_t streamId;
/**
* The X-dimension grid size for the kernel.
*/
int32_t gridX;
/**
* The Y-dimension grid size for the kernel.
*/
int32_t gridY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t gridZ;
/**
* The X-dimension block size for the kernel.
*/
int32_t blockX;
/**
* The Y-dimension block size for the kernel.
*/
int32_t blockY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t blockZ;
/**
* The static shared memory allocated for the kernel, in bytes.
*/
int32_t staticSharedMemory;
/**
* The dynamic shared memory reserved for the kernel, in bytes.
*/
int32_t dynamicSharedMemory;
/**
* The amount of local memory reserved for each thread, in bytes.
*/
uint32_t localMemoryPerThread;
/**
* The total amount of local memory reserved for the kernel, in
* bytes.
*/
uint32_t localMemoryTotal;
/**
* The correlation ID of the kernel. Each kernel execution is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver API activity record that launched
* the kernel.
*/
uint32_t correlationId;
/**
* The runtime correlation ID of the kernel. Each kernel execution
* is assigned a unique runtime correlation ID that is identical to
* the correlation ID in the runtime API activity record that
* launched the kernel.
*/
uint32_t runtimeCorrelationId;
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
/**
* The name of the kernel. This name is shared across all activity
* records representing the same kernel, and so should not be
* modified.
*/
const char *name;
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
} CUpti_ActivityKernel;
/**
* \brief The activity record for kernel. (deprecated)
*
* This activity record represents a kernel execution
* (CUPTI_ACTIVITY_KIND_KERNEL and
* CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
* by CUPTI. Kernel activities are now reported using the
* CUpti_ActivityKernel9 activity record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
* CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
*/
CUpti_ActivityKind kind;
union {
uint8_t both;
struct {
/**
* The cache configuration requested by the kernel. The value is one
* of the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t requested:4;
/**
* The cache configuration used for the kernel. The value is one of
* the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t executed:4;
} config;
} cacheConfig;
/**
* The shared memory configuration used for the kernel. The value is one of
* the CUsharedconfig enumeration values from cuda.h.
*/
uint8_t sharedMemoryConfig;
/**
* The number of registers required for each thread executing the
* kernel.
*/
uint16_t registersPerThread;
/**
* The start timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t start;
/**
* The end timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t end;
/**
* The completed timestamp for the kernel execution, in ns. It
* represents the completion of all it's child kernels and the
* kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
* the completion time is unknown.
*/
uint64_t completed;
/**
* The ID of the device where the kernel is executing.
*/
uint32_t deviceId;
/**
* The ID of the context where the kernel is executing.
*/
uint32_t contextId;
/**
* The ID of the stream where the kernel is executing.
*/
uint32_t streamId;
/**
* The X-dimension grid size for the kernel.
*/
int32_t gridX;
/**
* The Y-dimension grid size for the kernel.
*/
int32_t gridY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t gridZ;
/**
* The X-dimension block size for the kernel.
*/
int32_t blockX;
/**
* The Y-dimension block size for the kernel.
*/
int32_t blockY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t blockZ;
/**
* The static shared memory allocated for the kernel, in bytes.
*/
int32_t staticSharedMemory;
/**
* The dynamic shared memory reserved for the kernel, in bytes.
*/
int32_t dynamicSharedMemory;
/**
* The amount of local memory reserved for each thread, in bytes.
*/
uint32_t localMemoryPerThread;
/**
* The total amount of local memory reserved for the kernel, in
* bytes.
*/
uint32_t localMemoryTotal;
/**
* The correlation ID of the kernel. Each kernel execution is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver or runtime API activity record that
* launched the kernel.
*/
uint32_t correlationId;
/**
* The grid ID of the kernel. Each kernel is assigned a unique
* grid ID at runtime.
*/
int64_t gridId;
/**
* The name of the kernel. This name is shared across all activity
* records representing the same kernel, and so should not be
* modified.
*/
const char *name;
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
} CUpti_ActivityKernel2;
/**
* \brief The activity record for a kernel (CUDA 6.5(with sm_52 support) onwards).
* (deprecated in CUDA 9.0)
*
* This activity record represents a kernel execution
* (CUPTI_ACTIVITY_KIND_KERNEL and
* CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
* Kernel activities are now reported using the CUpti_ActivityKernel9 activity
* record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
* CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
*/
CUpti_ActivityKind kind;
union {
uint8_t both;
struct {
/**
* The cache configuration requested by the kernel. The value is one
* of the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t requested:4;
/**
* The cache configuration used for the kernel. The value is one of
* the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t executed:4;
} config;
} cacheConfig;
/**
* The shared memory configuration used for the kernel. The value is one of
* the CUsharedconfig enumeration values from cuda.h.
*/
uint8_t sharedMemoryConfig;
/**
* The number of registers required for each thread executing the
* kernel.
*/
uint16_t registersPerThread;
/**
* The partitioned global caching requested for the kernel. Partitioned
* global caching is required to enable caching on certain chips, such as
* devices with compute capability 5.2.
*/
CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
/**
* The partitioned global caching executed for the kernel. Partitioned
* global caching is required to enable caching on certain chips, such as
* devices with compute capability 5.2. Partitioned global caching can be
* automatically disabled if the occupancy requirement of the launch cannot
* support caching.
*/
CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
/**
* The start timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t start;
/**
* The end timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t end;
/**
* The completed timestamp for the kernel execution, in ns. It
* represents the completion of all it's child kernels and the
* kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
* the completion time is unknown.
*/
uint64_t completed;
/**
* The ID of the device where the kernel is executing.
*/
uint32_t deviceId;
/**
* The ID of the context where the kernel is executing.
*/
uint32_t contextId;
/**
* The ID of the stream where the kernel is executing.
*/
uint32_t streamId;
/**
* The X-dimension grid size for the kernel.
*/
int32_t gridX;
/**
* The Y-dimension grid size for the kernel.
*/
int32_t gridY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t gridZ;
/**
* The X-dimension block size for the kernel.
*/
int32_t blockX;
/**
* The Y-dimension block size for the kernel.
*/
int32_t blockY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t blockZ;
/**
* The static shared memory allocated for the kernel, in bytes.
*/
int32_t staticSharedMemory;
/**
* The dynamic shared memory reserved for the kernel, in bytes.
*/
int32_t dynamicSharedMemory;
/**
* The amount of local memory reserved for each thread, in bytes.
*/
uint32_t localMemoryPerThread;
/**
* The total amount of local memory reserved for the kernel, in
* bytes.
*/
uint32_t localMemoryTotal;
/**
* The correlation ID of the kernel. Each kernel execution is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver or runtime API activity record that
* launched the kernel.
*/
uint32_t correlationId;
/**
* The grid ID of the kernel. Each kernel is assigned a unique
* grid ID at runtime.
*/
int64_t gridId;
/**
* The name of the kernel. This name is shared across all activity
* records representing the same kernel, and so should not be
* modified.
*/
const char *name;
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
} CUpti_ActivityKernel3;
/**
* \brief The type of the CUDA kernel launch.
*/
typedef enum {
/**
* The kernel was launched via a regular kernel call
*/
CUPTI_ACTIVITY_LAUNCH_TYPE_REGULAR = 0,
/**
* The kernel was launched via API \ref cudaLaunchCooperativeKernel() or
* \ref cuLaunchCooperativeKernel()
*/
CUPTI_ACTIVITY_LAUNCH_TYPE_COOPERATIVE_SINGLE_DEVICE = 1,
/**
* The kernel was launched via API \ref cudaLaunchCooperativeKernelMultiDevice() or
* \ref cuLaunchCooperativeKernelMultiDevice()
*/
CUPTI_ACTIVITY_LAUNCH_TYPE_COOPERATIVE_MULTI_DEVICE = 2,
/**
* The kernel was launched as a CBL commandlist
*/
CUPTI_ACTIVITY_LAUNCH_TYPE_CBL_COMMANDLIST = 3,
} CUpti_ActivityLaunchType;
/**
* \brief The activity record for a kernel (CUDA 9.0(with sm_70 support) onwards).
* (deprecated in CUDA 11.0)
*
* This activity record represents a kernel execution
* (CUPTI_ACTIVITY_KIND_KERNEL and
* CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
* Kernel activities are now reported using the CUpti_ActivityKernel9 activity
* record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
* CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
*/
CUpti_ActivityKind kind;
/**
* For devices with compute capability 7.0+ cacheConfig values are not updated
* in case field isSharedMemoryCarveoutRequested is set
*/
union {
uint8_t both;
struct {
/**
* The cache configuration requested by the kernel. The value is one
* of the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t requested:4;
/**
* The cache configuration used for the kernel. The value is one of
* the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t executed:4;
} config;
} cacheConfig;
/**
* The shared memory configuration used for the kernel. The value is one of
* the CUsharedconfig enumeration values from cuda.h.
*/
uint8_t sharedMemoryConfig;
/**
* The number of registers required for each thread executing the
* kernel.
*/
uint16_t registersPerThread;
/**
* The partitioned global caching requested for the kernel. Partitioned
* global caching is required to enable caching on certain chips, such as
* devices with compute capability 5.2.
*/
CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
/**
* The partitioned global caching executed for the kernel. Partitioned
* global caching is required to enable caching on certain chips, such as
* devices with compute capability 5.2. Partitioned global caching can be
* automatically disabled if the occupancy requirement of the launch cannot
* support caching.
*/
CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
/**
* The start timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t start;
/**
* The end timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t end;
/**
* The completed timestamp for the kernel execution, in ns. It
* represents the completion of all it's child kernels and the
* kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
* the completion time is unknown.
*/
uint64_t completed;
/**
* The ID of the device where the kernel is executing.
*/
uint32_t deviceId;
/**
* The ID of the context where the kernel is executing.
*/
uint32_t contextId;
/**
* The ID of the stream where the kernel is executing.
*/
uint32_t streamId;
/**
* The X-dimension grid size for the kernel.
*/
int32_t gridX;
/**
* The Y-dimension grid size for the kernel.
*/
int32_t gridY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t gridZ;
/**
* The X-dimension block size for the kernel.
*/
int32_t blockX;
/**
* The Y-dimension block size for the kernel.
*/
int32_t blockY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t blockZ;
/**
* The static shared memory allocated for the kernel, in bytes.
*/
int32_t staticSharedMemory;
/**
* The dynamic shared memory reserved for the kernel, in bytes.
*/
int32_t dynamicSharedMemory;
/**
* The amount of local memory reserved for each thread, in bytes.
*/
uint32_t localMemoryPerThread;
/**
* The total amount of local memory reserved for the kernel, in
* bytes.
*/
uint32_t localMemoryTotal;
/**
* The correlation ID of the kernel. Each kernel execution is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver or runtime API activity record that
* launched the kernel.
*/
uint32_t correlationId;
/**
* The grid ID of the kernel. Each kernel is assigned a unique
* grid ID at runtime.
*/
int64_t gridId;
/**
* The name of the kernel. This name is shared across all activity
* records representing the same kernel, and so should not be
* modified.
*/
const char *name;
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
/**
* The timestamp when the kernel is queued up in the command buffer, in ns.
* A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
* could not be collected for the kernel. This timestamp is not collected
* by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
* enable collection.
*
* Command buffer is a buffer written by CUDA driver to send commands
* like kernel launch, memory copy etc to the GPU. All launches of CUDA
* kernels are asynchrnous with respect to the host, the host requests
* the launch by writing commands into the command buffer, then returns
* without checking the GPU's progress.
*/
uint64_t queued;
/**
* The timestamp when the command buffer containing the kernel launch
* is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
* indicates that the submitted time could not be collected for the kernel.
* This timestamp is not collected by default. Use API \ref
* cuptiActivityEnableLatencyTimestamps() to enable collection.
*/
uint64_t submitted;
/**
* The indicates if the kernel was executed via a regular launch or via a
* single/multi device cooperative launch. \see CUpti_ActivityLaunchType
*/
uint8_t launchType;
/**
* This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
* updated for the kernel launch
*/
uint8_t isSharedMemoryCarveoutRequested;
/**
* Shared memory carveout value requested for the function in percentage of
* the total resource. The value will be updated only if field
* isSharedMemoryCarveoutRequested is set.
*/
uint8_t sharedMemoryCarveoutRequested;
/**
* Undefined. Reserved for internal use.
*/
uint8_t padding;
/**
* Shared memory size set by the driver.
*/
uint32_t sharedMemoryExecuted;
} CUpti_ActivityKernel4;
/**
* \brief The shared memory limit per block config for a kernel
* This should be used to set 'cudaOccFuncShmemConfig' field in occupancy calculator API
*/
typedef enum {
/** The shared memory limit config is default
*/
CUPTI_FUNC_SHMEM_LIMIT_DEFAULT = 0x00,
/** User has opted for a higher dynamic shared memory limit using function attribute
* 'cudaFuncAttributeMaxDynamicSharedMemorySize' for runtime API or
* CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES for driver API
*/
CUPTI_FUNC_SHMEM_LIMIT_OPTIN = 0x01,
CUPTI_FUNC_SHMEM_LIMIT_FORCE_INT = 0x7fffffff
} CUpti_FuncShmemLimitConfig;
/**
* \brief The activity record for a kernel (CUDA 11.0(with sm_80 support) onwards).
* (deprecated in CUDA 11.2)
* This activity record represents a kernel execution
* (CUPTI_ACTIVITY_KIND_KERNEL and
* CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
* by CUPTI. Kernel activities are now reported using the
* CUpti_ActivityKernel9 activity record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
* CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
*/
CUpti_ActivityKind kind;
/**
* For devices with compute capability 7.0+ cacheConfig values are not updated
* in case field isSharedMemoryCarveoutRequested is set
*/
union {
uint8_t both;
struct {
/**
* The cache configuration requested by the kernel. The value is one
* of the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t requested:4;
/**
* The cache configuration used for the kernel. The value is one of
* the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t executed:4;
} config;
} cacheConfig;
/**
* The shared memory configuration used for the kernel. The value is one of
* the CUsharedconfig enumeration values from cuda.h.
*/
uint8_t sharedMemoryConfig;
/**
* The number of registers required for each thread executing the
* kernel.
*/
uint16_t registersPerThread;
/**
* The partitioned global caching requested for the kernel. Partitioned
* global caching is required to enable caching on certain chips, such as
* devices with compute capability 5.2.
*/
CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
/**
* The partitioned global caching executed for the kernel. Partitioned
* global caching is required to enable caching on certain chips, such as
* devices with compute capability 5.2. Partitioned global caching can be
* automatically disabled if the occupancy requirement of the launch cannot
* support caching.
*/
CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
/**
* The start timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t start;
/**
* The end timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t end;
/**
* The completed timestamp for the kernel execution, in ns. It
* represents the completion of all it's child kernels and the
* kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
* the completion time is unknown.
*/
uint64_t completed;
/**
* The ID of the device where the kernel is executing.
*/
uint32_t deviceId;
/**
* The ID of the context where the kernel is executing.
*/
uint32_t contextId;
/**
* The ID of the stream where the kernel is executing.
*/
uint32_t streamId;
/**
* The X-dimension grid size for the kernel.
*/
int32_t gridX;
/**
* The Y-dimension grid size for the kernel.
*/
int32_t gridY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t gridZ;
/**
* The X-dimension block size for the kernel.
*/
int32_t blockX;
/**
* The Y-dimension block size for the kernel.
*/
int32_t blockY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t blockZ;
/**
* The static shared memory allocated for the kernel, in bytes.
*/
int32_t staticSharedMemory;
/**
* The dynamic shared memory reserved for the kernel, in bytes.
*/
int32_t dynamicSharedMemory;
/**
* The amount of local memory reserved for each thread, in bytes.
*/
uint32_t localMemoryPerThread;
/**
* The total amount of local memory reserved for the kernel, in
* bytes.
*/
uint32_t localMemoryTotal;
/**
* The correlation ID of the kernel. Each kernel execution is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver or runtime API activity record that
* launched the kernel.
*/
uint32_t correlationId;
/**
* The grid ID of the kernel. Each kernel is assigned a unique
* grid ID at runtime.
*/
int64_t gridId;
/**
* The name of the kernel. This name is shared across all activity
* records representing the same kernel, and so should not be
* modified.
*/
const char *name;
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
/**
* The timestamp when the kernel is queued up in the command buffer, in ns.
* A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
* could not be collected for the kernel. This timestamp is not collected
* by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
* enable collection.
*
* Command buffer is a buffer written by CUDA driver to send commands
* like kernel launch, memory copy etc to the GPU. All launches of CUDA
* kernels are asynchrnous with respect to the host, the host requests
* the launch by writing commands into the command buffer, then returns
* without checking the GPU's progress.
*/
uint64_t queued;
/**
* The timestamp when the command buffer containing the kernel launch
* is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
* indicates that the submitted time could not be collected for the kernel.
* This timestamp is not collected by default. Use API \ref
* cuptiActivityEnableLatencyTimestamps() to enable collection.
*/
uint64_t submitted;
/**
* The indicates if the kernel was executed via a regular launch or via a
* single/multi device cooperative launch. \see CUpti_ActivityLaunchType
*/
uint8_t launchType;
/**
* This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
* updated for the kernel launch
*/
uint8_t isSharedMemoryCarveoutRequested;
/**
* Shared memory carveout value requested for the function in percentage of
* the total resource. The value will be updated only if field
* isSharedMemoryCarveoutRequested is set.
*/
uint8_t sharedMemoryCarveoutRequested;
/**
* Undefined. Reserved for internal use.
*/
uint8_t padding;
/**
* Shared memory size set by the driver.
*/
uint32_t sharedMemoryExecuted;
/**
* The unique ID of the graph node that launched this kernel through graph launch APIs.
* This field will be 0 if the kernel is not launched through graph launch APIs.
*/
uint64_t graphNodeId;
/**
* The shared memory limit config for the kernel. This field shows whether user has opted for a
* higher per block limit of dynamic shared memory.
*/
CUpti_FuncShmemLimitConfig shmemLimitConfig;
/**
* The unique ID of the graph that launched this kernel through graph launch APIs.
* This field will be 0 if the kernel is not launched through graph launch APIs.
*/
uint32_t graphId;
} CUpti_ActivityKernel5;
/**
* \brief The activity record for kernel. (deprecated in CUDA 11.6)
*
* This activity record represents a kernel execution
* (CUPTI_ACTIVITY_KIND_KERNEL and
* CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
* by CUPTI. Kernel activities are now reported using the
* CUpti_ActivityKernel9 activity record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
* CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
*/
CUpti_ActivityKind kind;
/**
* For devices with compute capability 7.0+ cacheConfig values are not updated
* in case field isSharedMemoryCarveoutRequested is set
*/
union {
uint8_t both;
struct {
/**
* The cache configuration requested by the kernel. The value is one
* of the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t requested:4;
/**
* The cache configuration used for the kernel. The value is one of
* the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t executed:4;
} config;
} cacheConfig;
/**
* The shared memory configuration used for the kernel. The value is one of
* the CUsharedconfig enumeration values from cuda.h.
*/
uint8_t sharedMemoryConfig;
/**
* The number of registers required for each thread executing the
* kernel.
*/
uint16_t registersPerThread;
/**
* The partitioned global caching requested for the kernel. Partitioned
* global caching is required to enable caching on certain chips, such as
* devices with compute capability 5.2.
*/
CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
/**
* The partitioned global caching executed for the kernel. Partitioned
* global caching is required to enable caching on certain chips, such as
* devices with compute capability 5.2. Partitioned global caching can be
* automatically disabled if the occupancy requirement of the launch cannot
* support caching.
*/
CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
/**
* The start timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t start;
/**
* The end timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t end;
/**
* The completed timestamp for the kernel execution, in ns. It
* represents the completion of all it's child kernels and the
* kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
* the completion time is unknown.
*/
uint64_t completed;
/**
* The ID of the device where the kernel is executing.
*/
uint32_t deviceId;
/**
* The ID of the context where the kernel is executing.
*/
uint32_t contextId;
/**
* The ID of the stream where the kernel is executing.
*/
uint32_t streamId;
/**
* The X-dimension grid size for the kernel.
*/
int32_t gridX;
/**
* The Y-dimension grid size for the kernel.
*/
int32_t gridY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t gridZ;
/**
* The X-dimension block size for the kernel.
*/
int32_t blockX;
/**
* The Y-dimension block size for the kernel.
*/
int32_t blockY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t blockZ;
/**
* The static shared memory allocated for the kernel, in bytes.
*/
int32_t staticSharedMemory;
/**
* The dynamic shared memory reserved for the kernel, in bytes.
*/
int32_t dynamicSharedMemory;
/**
* The amount of local memory reserved for each thread, in bytes.
*/
uint32_t localMemoryPerThread;
/**
* The total amount of local memory reserved for the kernel, in
* bytes.
*/
uint32_t localMemoryTotal;
/**
* The correlation ID of the kernel. Each kernel execution is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver or runtime API activity record that
* launched the kernel.
*/
uint32_t correlationId;
/**
* The grid ID of the kernel. Each kernel is assigned a unique
* grid ID at runtime.
*/
int64_t gridId;
/**
* The name of the kernel. This name is shared across all activity
* records representing the same kernel, and so should not be
* modified.
*/
const char *name;
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
/**
* The timestamp when the kernel is queued up in the command buffer, in ns.
* A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
* could not be collected for the kernel. This timestamp is not collected
* by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
* enable collection.
*
* Command buffer is a buffer written by CUDA driver to send commands
* like kernel launch, memory copy etc to the GPU. All launches of CUDA
* kernels are asynchrnous with respect to the host, the host requests
* the launch by writing commands into the command buffer, then returns
* without checking the GPU's progress.
*/
uint64_t queued;
/**
* The timestamp when the command buffer containing the kernel launch
* is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
* indicates that the submitted time could not be collected for the kernel.
* This timestamp is not collected by default. Use API \ref
* cuptiActivityEnableLatencyTimestamps() to enable collection.
*/
uint64_t submitted;
/**
* The indicates if the kernel was executed via a regular launch or via a
* single/multi device cooperative launch. \see CUpti_ActivityLaunchType
*/
uint8_t launchType;
/**
* This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
* updated for the kernel launch
*/
uint8_t isSharedMemoryCarveoutRequested;
/**
* Shared memory carveout value requested for the function in percentage of
* the total resource. The value will be updated only if field
* isSharedMemoryCarveoutRequested is set.
*/
uint8_t sharedMemoryCarveoutRequested;
/**
* Undefined. Reserved for internal use.
*/
uint8_t padding;
/**
* Shared memory size set by the driver.
*/
uint32_t sharedMemoryExecuted;
/**
* The unique ID of the graph node that launched this kernel through graph launch APIs.
* This field will be 0 if the kernel is not launched through graph launch APIs.
*/
uint64_t graphNodeId;
/**
* The shared memory limit config for the kernel. This field shows whether user has opted for a
* higher per block limit of dynamic shared memory.
*/
CUpti_FuncShmemLimitConfig shmemLimitConfig;
/**
* The unique ID of the graph that launched this kernel through graph launch APIs.
* This field will be 0 if the kernel is not launched through graph launch APIs.
*/
uint32_t graphId;
/**
* The pointer to the access policy window. The structure CUaccessPolicyWindow is
* defined in cuda.h.
*/
CUaccessPolicyWindow *pAccessPolicyWindow;
} CUpti_ActivityKernel6;
/**
* \brief The activity record for kernel. (deprecated in CUDA 11.8)
*
* This activity record represents a kernel execution
* (CUPTI_ACTIVITY_KIND_KERNEL and
* CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
* by CUPTI. Kernel activities are now reported using the
* CUpti_ActivityKernel9 activity record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
* CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
*/
CUpti_ActivityKind kind;
/**
* For devices with compute capability 7.0+ cacheConfig values are not updated
* in case field isSharedMemoryCarveoutRequested is set
*/
union {
uint8_t both;
struct {
/**
* The cache configuration requested by the kernel. The value is one
* of the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t requested:4;
/**
* The cache configuration used for the kernel. The value is one of
* the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t executed:4;
} config;
} cacheConfig;
/**
* The shared memory configuration used for the kernel. The value is one of
* the CUsharedconfig enumeration values from cuda.h.
*/
uint8_t sharedMemoryConfig;
/**
* The number of registers required for each thread executing the
* kernel.
*/
uint16_t registersPerThread;
/**
* The partitioned global caching requested for the kernel. Partitioned
* global caching is required to enable caching on certain chips, such as
* devices with compute capability 5.2.
*/
CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
/**
* The partitioned global caching executed for the kernel. Partitioned
* global caching is required to enable caching on certain chips, such as
* devices with compute capability 5.2. Partitioned global caching can be
* automatically disabled if the occupancy requirement of the launch cannot
* support caching.
*/
CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
/**
* The start timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t start;
/**
* The end timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t end;
/**
* The completed timestamp for the kernel execution, in ns. It
* represents the completion of all it's child kernels and the
* kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
* the completion time is unknown.
*/
uint64_t completed;
/**
* The ID of the device where the kernel is executing.
*/
uint32_t deviceId;
/**
* The ID of the context where the kernel is executing.
*/
uint32_t contextId;
/**
* The ID of the stream where the kernel is executing.
*/
uint32_t streamId;
/**
* The X-dimension grid size for the kernel.
*/
int32_t gridX;
/**
* The Y-dimension grid size for the kernel.
*/
int32_t gridY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t gridZ;
/**
* The X-dimension block size for the kernel.
*/
int32_t blockX;
/**
* The Y-dimension block size for the kernel.
*/
int32_t blockY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t blockZ;
/**
* The static shared memory allocated for the kernel, in bytes.
*/
int32_t staticSharedMemory;
/**
* The dynamic shared memory reserved for the kernel, in bytes.
*/
int32_t dynamicSharedMemory;
/**
* The amount of local memory reserved for each thread, in bytes.
*/
uint32_t localMemoryPerThread;
/**
* The total amount of local memory reserved for the kernel, in
* bytes.
*/
uint32_t localMemoryTotal;
/**
* The correlation ID of the kernel. Each kernel execution is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver or runtime API activity record that
* launched the kernel.
*/
uint32_t correlationId;
/**
* The grid ID of the kernel. Each kernel is assigned a unique
* grid ID at runtime.
*/
int64_t gridId;
/**
* The name of the kernel. This name is shared across all activity
* records representing the same kernel, and so should not be
* modified.
*/
const char *name;
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
/**
* The timestamp when the kernel is queued up in the command buffer, in ns.
* A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
* could not be collected for the kernel. This timestamp is not collected
* by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
* enable collection.
*
* Command buffer is a buffer written by CUDA driver to send commands
* like kernel launch, memory copy etc to the GPU. All launches of CUDA
* kernels are asynchrnous with respect to the host, the host requests
* the launch by writing commands into the command buffer, then returns
* without checking the GPU's progress.
*/
uint64_t queued;
/**
* The timestamp when the command buffer containing the kernel launch
* is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
* indicates that the submitted time could not be collected for the kernel.
* This timestamp is not collected by default. Use API \ref
* cuptiActivityEnableLatencyTimestamps() to enable collection.
*/
uint64_t submitted;
/**
* The indicates if the kernel was executed via a regular launch or via a
* single/multi device cooperative launch. \see CUpti_ActivityLaunchType
*/
uint8_t launchType;
/**
* This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
* updated for the kernel launch
*/
uint8_t isSharedMemoryCarveoutRequested;
/**
* Shared memory carveout value requested for the function in percentage of
* the total resource. The value will be updated only if field
* isSharedMemoryCarveoutRequested is set.
*/
uint8_t sharedMemoryCarveoutRequested;
/**
* Undefined. Reserved for internal use.
*/
uint8_t padding;
/**
* Shared memory size set by the driver.
*/
uint32_t sharedMemoryExecuted;
/**
* The unique ID of the graph node that launched this kernel through graph launch APIs.
* This field will be 0 if the kernel is not launched through graph launch APIs.
*/
uint64_t graphNodeId;
/**
* The shared memory limit config for the kernel. This field shows whether user has opted for a
* higher per block limit of dynamic shared memory.
*/
CUpti_FuncShmemLimitConfig shmemLimitConfig;
/**
* The unique ID of the graph that launched this kernel through graph launch APIs.
* This field will be 0 if the kernel is not launched through graph launch APIs.
*/
uint32_t graphId;
/**
* The pointer to the access policy window. The structure CUaccessPolicyWindow is
* defined in cuda.h.
*/
CUaccessPolicyWindow *pAccessPolicyWindow;
/**
* The ID of the HW channel on which the kernel is launched.
*/
uint32_t channelID;
/**
* The type of the channel
*/
CUpti_ChannelType channelType;
} CUpti_ActivityKernel7;
/**
* \brief The activity record for kernel.
*
* This activity record represents a kernel execution
* (CUPTI_ACTIVITY_KIND_KERNEL and
* CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
* CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
*/
CUpti_ActivityKind kind;
/**
* For devices with compute capability 7.0+ cacheConfig values are not updated
* in case field isSharedMemoryCarveoutRequested is set
*/
union {
uint8_t both;
struct {
/**
* The cache configuration requested by the kernel. The value is one
* of the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t requested:4;
/**
* The cache configuration used for the kernel. The value is one of
* the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t executed:4;
} config;
} cacheConfig;
/**
* The shared memory configuration used for the kernel. The value is one of
* the CUsharedconfig enumeration values from cuda.h.
*/
uint8_t sharedMemoryConfig;
/**
* The number of registers required for each thread executing the
* kernel.
*/
uint16_t registersPerThread;
/**
* The partitioned global caching requested for the kernel. Partitioned
* global caching is required to enable caching on certain chips, such as
* devices with compute capability 5.2.
*/
CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
/**
* The partitioned global caching executed for the kernel. Partitioned
* global caching is required to enable caching on certain chips, such as
* devices with compute capability 5.2. Partitioned global caching can be
* automatically disabled if the occupancy requirement of the launch cannot
* support caching.
*/
CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
/**
* The start timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t start;
/**
* The end timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t end;
/**
* The completed timestamp for the kernel execution, in ns. It
* represents the completion of all it's child kernels and the
* kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
* the completion time is unknown.
*/
uint64_t completed;
/**
* The ID of the device where the kernel is executing.
*/
uint32_t deviceId;
/**
* The ID of the context where the kernel is executing.
*/
uint32_t contextId;
/**
* The ID of the stream where the kernel is executing.
*/
uint32_t streamId;
/**
* The X-dimension grid size for the kernel.
*/
int32_t gridX;
/**
* The Y-dimension grid size for the kernel.
*/
int32_t gridY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t gridZ;
/**
* The X-dimension block size for the kernel.
*/
int32_t blockX;
/**
* The Y-dimension block size for the kernel.
*/
int32_t blockY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t blockZ;
/**
* The static shared memory allocated for the kernel, in bytes.
*/
int32_t staticSharedMemory;
/**
* The dynamic shared memory reserved for the kernel, in bytes.
*/
int32_t dynamicSharedMemory;
/**
* The amount of local memory reserved for each thread, in bytes.
*/
uint32_t localMemoryPerThread;
/**
* The total amount of local memory reserved for the kernel, in
* bytes (deprecated in CUDA 11.8).
* Refer field localMemoryTotal_v2
*/
uint32_t localMemoryTotal;
/**
* The correlation ID of the kernel. Each kernel execution is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver or runtime API activity record that
* launched the kernel.
*/
uint32_t correlationId;
/**
* The grid ID of the kernel. Each kernel is assigned a unique
* grid ID at runtime.
*/
int64_t gridId;
/**
* The name of the kernel. This name is shared across all activity
* records representing the same kernel, and so should not be
* modified.
*/
const char *name;
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
/**
* The timestamp when the kernel is queued up in the command buffer, in ns.
* A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
* could not be collected for the kernel. This timestamp is not collected
* by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
* enable collection.
*
* Command buffer is a buffer written by CUDA driver to send commands
* like kernel launch, memory copy etc to the GPU. All launches of CUDA
* kernels are asynchrnous with respect to the host, the host requests
* the launch by writing commands into the command buffer, then returns
* without checking the GPU's progress.
*/
uint64_t queued;
/**
* The timestamp when the command buffer containing the kernel launch
* is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
* indicates that the submitted time could not be collected for the kernel.
* This timestamp is not collected by default. Use API \ref
* cuptiActivityEnableLatencyTimestamps() to enable collection.
*/
uint64_t submitted;
/**
* The indicates if the kernel was executed via a regular launch or via a
* single/multi device cooperative launch. \see CUpti_ActivityLaunchType
*/
uint8_t launchType;
/**
* This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
* updated for the kernel launch
*/
uint8_t isSharedMemoryCarveoutRequested;
/**
* Shared memory carveout value requested for the function in percentage of
* the total resource. The value will be updated only if field
* isSharedMemoryCarveoutRequested is set.
*/
uint8_t sharedMemoryCarveoutRequested;
/**
* Undefined. Reserved for internal use.
*/
uint8_t padding;
/**
* Shared memory size set by the driver.
*/
uint32_t sharedMemoryExecuted;
/**
* The unique ID of the graph node that launched this kernel through graph launch APIs.
* This field will be 0 if the kernel is not launched through graph launch APIs.
*/
uint64_t graphNodeId;
/**
* The shared memory limit config for the kernel. This field shows whether user has opted for a
* higher per block limit of dynamic shared memory.
*/
CUpti_FuncShmemLimitConfig shmemLimitConfig;
/**
* The unique ID of the graph that launched this kernel through graph launch APIs.
* This field will be 0 if the kernel is not launched through graph launch APIs.
*/
uint32_t graphId;
/**
* The pointer to the access policy window. The structure CUaccessPolicyWindow is
* defined in cuda.h.
*/
CUaccessPolicyWindow *pAccessPolicyWindow;
/**
* The ID of the HW channel on which the kernel is launched.
*/
uint32_t channelID;
/**
* The type of the channel
*/
CUpti_ChannelType channelType;
/**
* The X-dimension cluster size for the kernel.
* Field is valid for devices with compute capability 9.0 and higher
*/
uint32_t clusterX;
/**
* The Y-dimension cluster size for the kernel.
* Field is valid for devices with compute capability 9.0 and higher
*/
uint32_t clusterY;
/**
* The Z-dimension cluster size for the kernel.
* Field is valid for devices with compute capability 9.0 and higher
*/
uint32_t clusterZ;
/**
* The cluster scheduling policy for the kernel. Refer CUclusterSchedulingPolicy
* Field is valid for devices with compute capability 9.0 and higher
*/
uint32_t clusterSchedulingPolicy;
/**
* The total amount of local memory reserved for the kernel, in
* bytes.
*/
uint64_t localMemoryTotal_v2;
} CUpti_ActivityKernel8;
/**
* \brief The activity record for kernel.
*
* This activity record represents a kernel execution
* (CUPTI_ACTIVITY_KIND_KERNEL and
* CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
* CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
*/
CUpti_ActivityKind kind;
/**
* For devices with compute capability 7.0+ cacheConfig values are not updated
* in case field isSharedMemoryCarveoutRequested is set
*/
union {
uint8_t both;
struct {
/**
* The cache configuration requested by the kernel. The value is one
* of the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t requested:4;
/**
* The cache configuration used for the kernel. The value is one of
* the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t executed:4;
} config;
} cacheConfig;
/**
* The shared memory configuration used for the kernel. The value is one of
* the CUsharedconfig enumeration values from cuda.h.
*/
uint8_t sharedMemoryConfig;
/**
* The number of registers required for each thread executing the
* kernel.
*/
uint16_t registersPerThread;
/**
* The partitioned global caching requested for the kernel. Partitioned
* global caching is required to enable caching on certain chips, such as
* devices with compute capability 5.2.
*/
CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
/**
* The partitioned global caching executed for the kernel. Partitioned
* global caching is required to enable caching on certain chips, such as
* devices with compute capability 5.2. Partitioned global caching can be
* automatically disabled if the occupancy requirement of the launch cannot
* support caching.
*/
CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
/**
* The start timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t start;
/**
* The end timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t end;
/**
* The completed timestamp for the kernel execution, in ns. It
* represents the completion of all it's child kernels and the
* kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
* the completion time is unknown.
*/
uint64_t completed;
/**
* The ID of the device where the kernel is executing.
*/
uint32_t deviceId;
/**
* The ID of the context where the kernel is executing.
*/
uint32_t contextId;
/**
* The ID of the stream where the kernel is executing.
*/
uint32_t streamId;
/**
* The X-dimension grid size for the kernel.
*/
int32_t gridX;
/**
* The Y-dimension grid size for the kernel.
*/
int32_t gridY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t gridZ;
/**
* The X-dimension block size for the kernel.
*/
int32_t blockX;
/**
* The Y-dimension block size for the kernel.
*/
int32_t blockY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t blockZ;
/**
* The static shared memory allocated for the kernel, in bytes.
*/
int32_t staticSharedMemory;
/**
* The dynamic shared memory reserved for the kernel, in bytes.
*/
int32_t dynamicSharedMemory;
/**
* The amount of local memory reserved for each thread, in bytes.
*/
uint32_t localMemoryPerThread;
/**
* The total amount of local memory reserved for the kernel, in
* bytes (deprecated in CUDA 11.8).
* Refer field localMemoryTotal_v2
*/
uint32_t localMemoryTotal;
/**
* The correlation ID of the kernel. Each kernel execution is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver or runtime API activity record that
* launched the kernel.
*/
uint32_t correlationId;
/**
* The grid ID of the kernel. Each kernel is assigned a unique
* grid ID at runtime.
*/
int64_t gridId;
/**
* The name of the kernel. This name is shared across all activity
* records representing the same kernel, and so should not be
* modified.
*/
const char *name;
/**
* Undefined. Reserved for internal use.
*/
void *reserved0;
/**
* The timestamp when the kernel is queued up in the command buffer, in ns.
* A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
* could not be collected for the kernel. This timestamp is not collected
* by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
* enable collection.
*
* Command buffer is a buffer written by CUDA driver to send commands
* like kernel launch, memory copy etc to the GPU. All launches of CUDA
* kernels are asynchrnous with respect to the host, the host requests
* the launch by writing commands into the command buffer, then returns
* without checking the GPU's progress.
*/
uint64_t queued;
/**
* The timestamp when the command buffer containing the kernel launch
* is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
* indicates that the submitted time could not be collected for the kernel.
* This timestamp is not collected by default. Use API \ref
* cuptiActivityEnableLatencyTimestamps() to enable collection.
*/
uint64_t submitted;
/**
* The indicates if the kernel was executed via a regular launch or via a
* single/multi device cooperative launch. \see CUpti_ActivityLaunchType
*/
uint8_t launchType;
/**
* This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
* updated for the kernel launch
*/
uint8_t isSharedMemoryCarveoutRequested;
/**
* Shared memory carveout value requested for the function in percentage of
* the total resource. The value will be updated only if field
* isSharedMemoryCarveoutRequested is set.
*/
uint8_t sharedMemoryCarveoutRequested;
/**
* Undefined. Reserved for internal use.
*/
uint8_t padding;
/**
* Shared memory size set by the driver.
*/
uint32_t sharedMemoryExecuted;
/**
* The unique ID of the graph node that launched this kernel through graph launch APIs.
* This field will be 0 if the kernel is not launched through graph launch APIs.
*/
uint64_t graphNodeId;
/**
* The shared memory limit config for the kernel. This field shows whether user has opted for a
* higher per block limit of dynamic shared memory.
*/
CUpti_FuncShmemLimitConfig shmemLimitConfig;
/**
* The unique ID of the graph that launched this kernel through graph launch APIs.
* This field will be 0 if the kernel is not launched through graph launch APIs.
*/
uint32_t graphId;
/**
* The pointer to the access policy window. The structure CUaccessPolicyWindow is
* defined in cuda.h.
*/
CUaccessPolicyWindow *pAccessPolicyWindow;
/**
* The ID of the HW channel on which the kernel is launched.
*/
uint32_t channelID;
/**
* The type of the channel
*/
CUpti_ChannelType channelType;
/**
* The X-dimension cluster size for the kernel.
* Field is valid for devices with compute capability 9.0 and higher
*/
uint32_t clusterX;
/**
* The Y-dimension cluster size for the kernel.
* Field is valid for devices with compute capability 9.0 and higher
*/
uint32_t clusterY;
/**
* The Z-dimension cluster size for the kernel.
* Field is valid for devices with compute capability 9.0 and higher
*/
uint32_t clusterZ;
/**
* The cluster scheduling policy for the kernel. Refer CUclusterSchedulingPolicy
* Field is valid for devices with compute capability 9.0 and higher
*/
uint32_t clusterSchedulingPolicy;
/**
* The total amount of local memory reserved for the kernel, in
* bytes.
*/
uint64_t localMemoryTotal_v2;
/**
* The maximum cluster size for the kernel
*/
uint32_t maxPotentialClusterSize;
/**
* The maximum clusters that could co-exist on the target device for the kernel
*/
uint32_t maxActiveClusters;
} CUpti_ActivityKernel9;
/**
* \brief The activity record for CDP (CUDA Dynamic Parallelism)
* kernel.
*
* This activity record represents a CDP kernel execution.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_CDP_KERNEL
*/
CUpti_ActivityKind kind;
union {
uint8_t both;
struct {
/**
* The cache configuration requested by the kernel. The value is one
* of the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t requested:4;
/**
* The cache configuration used for the kernel. The value is one of
* the CUfunc_cache enumeration values from cuda.h.
*/
uint8_t executed:4;
} config;
} cacheConfig;
/**
* The shared memory configuration used for the kernel. The value is one of
* the CUsharedconfig enumeration values from cuda.h.
*/
uint8_t sharedMemoryConfig;
/**
* The number of registers required for each thread executing the
* kernel.
*/
uint16_t registersPerThread;
/**
* The start timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t start;
/**
* The end timestamp for the kernel execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the kernel.
*/
uint64_t end;
/**
* The ID of the device where the kernel is executing.
*/
uint32_t deviceId;
/**
* The ID of the context where the kernel is executing.
*/
uint32_t contextId;
/**
* The ID of the stream where the kernel is executing.
*/
uint32_t streamId;
/**
* The X-dimension grid size for the kernel.
*/
int32_t gridX;
/**
* The Y-dimension grid size for the kernel.
*/
int32_t gridY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t gridZ;
/**
* The X-dimension block size for the kernel.
*/
int32_t blockX;
/**
* The Y-dimension block size for the kernel.
*/
int32_t blockY;
/**
* The Z-dimension grid size for the kernel.
*/
int32_t blockZ;
/**
* The static shared memory allocated for the kernel, in bytes.
*/
int32_t staticSharedMemory;
/**
* The dynamic shared memory reserved for the kernel, in bytes.
*/
int32_t dynamicSharedMemory;
/**
* The amount of local memory reserved for each thread, in bytes.
*/
uint32_t localMemoryPerThread;
/**
* The total amount of local memory reserved for the kernel, in
* bytes.
*/
uint32_t localMemoryTotal;
/**
* The correlation ID of the kernel. Each kernel execution is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver API activity record that launched
* the kernel.
*/
uint32_t correlationId;
/**
* The grid ID of the kernel. Each kernel execution
* is assigned a unique grid ID.
*/
int64_t gridId;
/**
* The grid ID of the parent kernel.
*/
int64_t parentGridId;
/**
* The timestamp when kernel is queued up, in ns. A value of
* CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time is
* unknown.
*/
uint64_t queued;
/**
* The timestamp when kernel is submitted to the gpu, in ns. A value
* of CUPTI_TIMESTAMP_UNKNOWN indicates that the submission time is
* unknown.
*/
uint64_t submitted;
/**
* The timestamp when kernel is marked as completed, in ns. A value
* of CUPTI_TIMESTAMP_UNKNOWN indicates that the completion time is
* unknown.
*/
uint64_t completed;
/**
* The X-dimension of the parent block.
*/
uint32_t parentBlockX;
/**
* The Y-dimension of the parent block.
*/
uint32_t parentBlockY;
/**
* The Z-dimension of the parent block.
*/
uint32_t parentBlockZ;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* The name of the kernel. This name is shared across all activity
* records representing the same kernel, and so should not be
* modified.
*/
const char *name;
} CUpti_ActivityCdpKernel;
/**
* \brief The activity record for a preemption of a CDP kernel.
*
* This activity record represents a preemption of a CDP kernel.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_PREEMPTION
*/
CUpti_ActivityKind kind;
/**
* kind of the preemption
*/
CUpti_ActivityPreemptionKind preemptionKind;
/**
* The timestamp of the preemption, in ns. A value of 0 indicates
* that timestamp information could not be collected for the
* preemption.
*/
uint64_t timestamp;
/**
* The grid-id of the block that is preempted
*/
int64_t gridId;
/**
* The X-dimension of the block that is preempted
*/
uint32_t blockX;
/**
* The Y-dimension of the block that is preempted
*/
uint32_t blockY;
/**
* The Z-dimension of the block that is preempted
*/
uint32_t blockZ;
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
} CUpti_ActivityPreemption;
/**
* \brief The activity record for a driver or runtime API invocation.
*
* This activity record represents an invocation of a driver or
* runtime API (CUPTI_ACTIVITY_KIND_DRIVER and
* CUPTI_ACTIVITY_KIND_RUNTIME).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_DRIVER,
* CUPTI_ACTIVITY_KIND_RUNTIME, or CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API.
*/
CUpti_ActivityKind kind;
/**
* The ID of the driver or runtime function.
*/
CUpti_CallbackId cbid;
/**
* The start timestamp for the function, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the function.
*/
uint64_t start;
/**
* The end timestamp for the function, in ns. A value of 0 for both
* the start and end timestamps indicates that timestamp information
* could not be collected for the function.
*/
uint64_t end;
/**
* The ID of the process where the driver or runtime CUDA function
* is executing.
*/
uint32_t processId;
/**
* The ID of the thread where the driver or runtime CUDA function is
* executing.
*/
uint32_t threadId;
/**
* The correlation ID of the driver or runtime CUDA function. Each
* function invocation is assigned a unique correlation ID that is
* identical to the correlation ID in the memcpy, memset, or kernel
* activity record that is associated with this function.
*/
uint32_t correlationId;
/**
* The return value for the function. For a CUDA driver function
* with will be a CUresult value, and for a CUDA runtime function
* this will be a cudaError_t value.
*/
uint32_t returnValue;
} CUpti_ActivityAPI;
/**
* \brief The activity record for a CUPTI event.
*
* This activity record represents a CUPTI event value
* (CUPTI_ACTIVITY_KIND_EVENT). This activity record kind is not
* produced by the activity API but is included for completeness and
* ease-of-use. Profile frameworks built on top of CUPTI that collect
* event data may choose to use this type to store the collected event
* data.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_EVENT.
*/
CUpti_ActivityKind kind;
/**
* The event ID.
*/
CUpti_EventID id;
/**
* The event value.
*/
uint64_t value;
/**
* The event domain ID.
*/
CUpti_EventDomainID domain;
/**
* The correlation ID of the event. Use of this ID is user-defined,
* but typically this ID value will equal the correlation ID of the
* kernel for which the event was gathered.
*/
uint32_t correlationId;
} CUpti_ActivityEvent;
/**
* \brief The activity record for a CUPTI event with instance
* information.
*
* This activity record represents the a CUPTI event value for a
* specific event domain instance
* (CUPTI_ACTIVITY_KIND_EVENT_INSTANCE). This activity record kind is
* not produced by the activity API but is included for completeness
* and ease-of-use. Profile frameworks built on top of CUPTI that
* collect event data may choose to use this type to store the
* collected event data. This activity record should be used when
* event domain instance information needs to be associated with the
* event.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be
* CUPTI_ACTIVITY_KIND_EVENT_INSTANCE.
*/
CUpti_ActivityKind kind;
/**
* The event ID.
*/
CUpti_EventID id;
/**
* The event domain ID.
*/
CUpti_EventDomainID domain;
/**
* The event domain instance.
*/
uint32_t instance;
/**
* The event value.
*/
uint64_t value;
/**
* The correlation ID of the event. Use of this ID is user-defined,
* but typically this ID value will equal the correlation ID of the
* kernel for which the event was gathered.
*/
uint32_t correlationId;
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
} CUpti_ActivityEventInstance;
/**
* \brief The activity record for a CUPTI metric.
*
* This activity record represents the collection of a CUPTI metric
* value (CUPTI_ACTIVITY_KIND_METRIC). This activity record kind is not
* produced by the activity API but is included for completeness and
* ease-of-use. Profile frameworks built on top of CUPTI that collect
* metric data may choose to use this type to store the collected metric
* data.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_METRIC.
*/
CUpti_ActivityKind kind;
/**
* The metric ID.
*/
CUpti_MetricID id;
/**
* The metric value.
*/
CUpti_MetricValue value;
/**
* The correlation ID of the metric. Use of this ID is user-defined,
* but typically this ID value will equal the correlation ID of the
* kernel for which the metric was gathered.
*/
uint32_t correlationId;
/**
* The properties of this metric. \see CUpti_ActivityFlag
*/
uint8_t flags;
/**
* Undefined. Reserved for internal use.
*/
uint8_t pad[3];
} CUpti_ActivityMetric;
/**
* \brief The activity record for a CUPTI metric with instance
* information.
*
* This activity record represents a CUPTI metric value
* for a specific metric domain instance
* (CUPTI_ACTIVITY_KIND_METRIC_INSTANCE). This activity record kind
* is not produced by the activity API but is included for
* completeness and ease-of-use. Profile frameworks built on top of
* CUPTI that collect metric data may choose to use this type to store
* the collected metric data. This activity record should be used when
* metric domain instance information needs to be associated with the
* metric.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be
* CUPTI_ACTIVITY_KIND_METRIC_INSTANCE.
*/
CUpti_ActivityKind kind;
/**
* The metric ID.
*/
CUpti_MetricID id;
/**
* The metric value.
*/
CUpti_MetricValue value;
/**
* The metric domain instance.
*/
uint32_t instance;
/**
* The correlation ID of the metric. Use of this ID is user-defined,
* but typically this ID value will equal the correlation ID of the
* kernel for which the metric was gathered.
*/
uint32_t correlationId;
/**
* The properties of this metric. \see CUpti_ActivityFlag
*/
uint8_t flags;
/**
* Undefined. Reserved for internal use.
*/
uint8_t pad[7];
} CUpti_ActivityMetricInstance;
/**
* \brief The activity record for source locator.
*
* This activity record represents a source locator
* (CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR.
*/
CUpti_ActivityKind kind;
/**
* The ID for the source path, will be used in all the source level
* results.
*/
uint32_t id;
/**
* The line number in the source .
*/
uint32_t lineNumber;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* The path for the file.
*/
const char *fileName;
} CUpti_ActivitySourceLocator;
/**
* \brief The activity record for source-level global
* access. (deprecated)
*
* This activity records the locations of the global
* accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
* Global access activities are now reported using the
* CUpti_ActivityGlobalAccess3 activity record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
*/
CUpti_ActivityKind kind;
/**
* The properties of this global access.
*/
CUpti_ActivityFlag flags;
/**
* The ID for source locator.
*/
uint32_t sourceLocatorId;
/**
* The correlation ID of the kernel to which this result is associated.
*/
uint32_t correlationId;
/**
* The pc offset for the access.
*/
uint32_t pcOffset;
/**
* The number of times this instruction was executed per warp. It will be incremented
* when at least one of thread among warp is active with predicate and condition code
* evaluating to true.
*/
uint32_t executed;
/**
* This increments each time when this instruction is executed by number
* of threads that executed this instruction with predicate and condition code evaluating to true.
*/
uint64_t threadsExecuted;
/**
* The total number of 32 bytes transactions to L2 cache generated by this access
*/
uint64_t l2_transactions;
} CUpti_ActivityGlobalAccess;
/**
* \brief The activity record for source-level global
* access. (deprecated in CUDA 9.0)
*
* This activity records the locations of the global
* accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
* Global access activities are now reported using the
* CUpti_ActivityGlobalAccess3 activity record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
*/
CUpti_ActivityKind kind;
/**
* The properties of this global access.
*/
CUpti_ActivityFlag flags;
/**
* The ID for source locator.
*/
uint32_t sourceLocatorId;
/**
* The correlation ID of the kernel to which this result is associated.
*/
uint32_t correlationId;
/**
* Correlation ID with global/device function name
*/
uint32_t functionId;
/**
* The pc offset for the access.
*/
uint32_t pcOffset;
/**
* This increments each time when this instruction is executed by number
* of threads that executed this instruction with predicate and condition code evaluating to true.
*/
uint64_t threadsExecuted;
/**
* The total number of 32 bytes transactions to L2 cache generated by this access
*/
uint64_t l2_transactions;
/**
* The minimum number of L2 transactions possible based on the access pattern.
*/
uint64_t theoreticalL2Transactions;
/**
* The number of times this instruction was executed per warp. It will be incremented
* when at least one of thread among warp is active with predicate and condition code
* evaluating to true.
*/
uint32_t executed;
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
} CUpti_ActivityGlobalAccess2;
/**
* \brief The activity record for source-level global
* access.
*
* This activity records the locations of the global
* accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
*/
CUpti_ActivityKind kind;
/**
* The properties of this global access.
*/
CUpti_ActivityFlag flags;
/**
* The ID for source locator.
*/
uint32_t sourceLocatorId;
/**
* The correlation ID of the kernel to which this result is associated.
*/
uint32_t correlationId;
/**
* Correlation ID with global/device function name
*/
uint32_t functionId;
/**
* The number of times this instruction was executed per warp. It will be incremented
* when at least one of thread among warp is active with predicate and condition code
* evaluating to true.
*/
uint32_t executed;
/**
* The pc offset for the access.
*/
uint64_t pcOffset;
/**
* This increments each time when this instruction is executed by number of
* threads that executed this instruction with predicate and condition code
* evaluating to true.
*/
uint64_t threadsExecuted;
/**
* The total number of 32 bytes transactions to L2 cache generated by this
access
*/
uint64_t l2_transactions;
/**
* The minimum number of L2 transactions possible based on the access pattern.
*/
uint64_t theoreticalL2Transactions;
} CUpti_ActivityGlobalAccess3;
/**
* \brief The activity record for source level result
* branch. (deprecated)
*
* This activity record the locations of the branches in the
* source (CUPTI_ACTIVITY_KIND_BRANCH).
* Branch activities are now reported using the
* CUpti_ActivityBranch2 activity record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_BRANCH.
*/
CUpti_ActivityKind kind;
/**
* The ID for source locator.
*/
uint32_t sourceLocatorId;
/**
* The correlation ID of the kernel to which this result is associated.
*/
uint32_t correlationId;
/**
* The pc offset for the branch.
*/
uint32_t pcOffset;
/**
* The number of times this instruction was executed per warp. It will be incremented
* regardless of predicate or condition code.
*/
uint32_t executed;
/**
* Number of times this branch diverged
*/
uint32_t diverged;
/**
* This increments each time when this instruction is executed by number
* of threads that executed this instruction
*/
uint64_t threadsExecuted;
} CUpti_ActivityBranch;
/**
* \brief The activity record for source level result
* branch.
*
* This activity record the locations of the branches in the
* source (CUPTI_ACTIVITY_KIND_BRANCH).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_BRANCH.
*/
CUpti_ActivityKind kind;
/**
* The ID for source locator.
*/
uint32_t sourceLocatorId;
/**
* The correlation ID of the kernel to which this result is associated.
*/
uint32_t correlationId;
/**
* Correlation ID with global/device function name
*/
uint32_t functionId;
/**
* The pc offset for the branch.
*/
uint32_t pcOffset;
/**
* Number of times this branch diverged
*/
uint32_t diverged;
/**
* This increments each time when this instruction is executed by number
* of threads that executed this instruction
*/
uint64_t threadsExecuted;
/**
* The number of times this instruction was executed per warp. It will be incremented
* regardless of predicate or condition code.
*/
uint32_t executed;
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
} CUpti_ActivityBranch2;
/**
* \brief The activity record for a device. (deprecated)
*
* This activity record represents information about a GPU device
* (CUPTI_ACTIVITY_KIND_DEVICE).
* Device activity is now reported using the
* CUpti_ActivityDevice4 activity record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
*/
CUpti_ActivityKind kind;
/**
* The flags associated with the device. \see CUpti_ActivityFlag
*/
CUpti_ActivityFlag flags;
/**
* The global memory bandwidth available on the device, in
* kBytes/sec.
*/
uint64_t globalMemoryBandwidth;
/**
* The amount of global memory on the device, in bytes.
*/
uint64_t globalMemorySize;
/**
* The amount of constant memory on the device, in bytes.
*/
uint32_t constantMemorySize;
/**
* The size of the L2 cache on the device, in bytes.
*/
uint32_t l2CacheSize;
/**
* The number of threads per warp on the device.
*/
uint32_t numThreadsPerWarp;
/**
* The core clock rate of the device, in kHz.
*/
uint32_t coreClockRate;
/**
* Number of memory copy engines on the device.
*/
uint32_t numMemcpyEngines;
/**
* Number of multiprocessors on the device.
*/
uint32_t numMultiprocessors;
/**
* The maximum "instructions per cycle" possible on each device
* multiprocessor.
*/
uint32_t maxIPC;
/**
* Maximum number of warps that can be present on a multiprocessor
* at any given time.
*/
uint32_t maxWarpsPerMultiprocessor;
/**
* Maximum number of blocks that can be present on a multiprocessor
* at any given time.
*/
uint32_t maxBlocksPerMultiprocessor;
/**
* Maximum number of registers that can be allocated to a block.
*/
uint32_t maxRegistersPerBlock;
/**
* Maximum amount of shared memory that can be assigned to a block,
* in bytes.
*/
uint32_t maxSharedMemoryPerBlock;
/**
* Maximum number of threads allowed in a block.
*/
uint32_t maxThreadsPerBlock;
/**
* Maximum allowed X dimension for a block.
*/
uint32_t maxBlockDimX;
/**
* Maximum allowed Y dimension for a block.
*/
uint32_t maxBlockDimY;
/**
* Maximum allowed Z dimension for a block.
*/
uint32_t maxBlockDimZ;
/**
* Maximum allowed X dimension for a grid.
*/
uint32_t maxGridDimX;
/**
* Maximum allowed Y dimension for a grid.
*/
uint32_t maxGridDimY;
/**
* Maximum allowed Z dimension for a grid.
*/
uint32_t maxGridDimZ;
/**
* Compute capability for the device, major number.
*/
uint32_t computeCapabilityMajor;
/**
* Compute capability for the device, minor number.
*/
uint32_t computeCapabilityMinor;
/**
* The device ID.
*/
uint32_t id;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* The device name. This name is shared across all activity records
* representing instances of the device, and so should not be
* modified.
*/
const char *name;
} CUpti_ActivityDevice;
/**
* \brief The activity record for a device. (deprecated)
*
* This activity record represents information about a GPU device
* (CUPTI_ACTIVITY_KIND_DEVICE).
* Device activity is now reported using the
* CUpti_ActivityDevice4 activity record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
*/
CUpti_ActivityKind kind;
/**
* The flags associated with the device. \see CUpti_ActivityFlag
*/
CUpti_ActivityFlag flags;
/**
* The global memory bandwidth available on the device, in
* kBytes/sec.
*/
uint64_t globalMemoryBandwidth;
/**
* The amount of global memory on the device, in bytes.
*/
uint64_t globalMemorySize;
/**
* The amount of constant memory on the device, in bytes.
*/
uint32_t constantMemorySize;
/**
* The size of the L2 cache on the device, in bytes.
*/
uint32_t l2CacheSize;
/**
* The number of threads per warp on the device.
*/
uint32_t numThreadsPerWarp;
/**
* The core clock rate of the device, in kHz.
*/
uint32_t coreClockRate;
/**
* Number of memory copy engines on the device.
*/
uint32_t numMemcpyEngines;
/**
* Number of multiprocessors on the device.
*/
uint32_t numMultiprocessors;
/**
* The maximum "instructions per cycle" possible on each device
* multiprocessor.
*/
uint32_t maxIPC;
/**
* Maximum number of warps that can be present on a multiprocessor
* at any given time.
*/
uint32_t maxWarpsPerMultiprocessor;
/**
* Maximum number of blocks that can be present on a multiprocessor
* at any given time.
*/
uint32_t maxBlocksPerMultiprocessor;
/**
* Maximum amount of shared memory available per multiprocessor, in bytes.
*/
uint32_t maxSharedMemoryPerMultiprocessor;
/**
* Maximum number of 32-bit registers available per multiprocessor.
*/
uint32_t maxRegistersPerMultiprocessor;
/**
* Maximum number of registers that can be allocated to a block.
*/
uint32_t maxRegistersPerBlock;
/**
* Maximum amount of shared memory that can be assigned to a block,
* in bytes.
*/
uint32_t maxSharedMemoryPerBlock;
/**
* Maximum number of threads allowed in a block.
*/
uint32_t maxThreadsPerBlock;
/**
* Maximum allowed X dimension for a block.
*/
uint32_t maxBlockDimX;
/**
* Maximum allowed Y dimension for a block.
*/
uint32_t maxBlockDimY;
/**
* Maximum allowed Z dimension for a block.
*/
uint32_t maxBlockDimZ;
/**
* Maximum allowed X dimension for a grid.
*/
uint32_t maxGridDimX;
/**
* Maximum allowed Y dimension for a grid.
*/
uint32_t maxGridDimY;
/**
* Maximum allowed Z dimension for a grid.
*/
uint32_t maxGridDimZ;
/**
* Compute capability for the device, major number.
*/
uint32_t computeCapabilityMajor;
/**
* Compute capability for the device, minor number.
*/
uint32_t computeCapabilityMinor;
/**
* The device ID.
*/
uint32_t id;
/**
* ECC enabled flag for device
*/
uint32_t eccEnabled;
/**
* The device UUID. This value is the globally unique immutable
* alphanumeric identifier of the device.
*/
CUuuid uuid;
#ifndef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* The device name. This name is shared across all activity records
* representing instances of the device, and so should not be
* modified.
*/
const char *name;
} CUpti_ActivityDevice2;
/**
* \brief The activity record for a device. (CUDA 7.0 onwards)
*
* This activity record represents information about a GPU device
* (CUPTI_ACTIVITY_KIND_DEVICE).
* Device activity is now reported using the
* CUpti_ActivityDevice4 activity record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
*/
CUpti_ActivityKind kind;
/**
* The flags associated with the device. \see CUpti_ActivityFlag
*/
CUpti_ActivityFlag flags;
/**
* The global memory bandwidth available on the device, in
* kBytes/sec.
*/
uint64_t globalMemoryBandwidth;
/**
* The amount of global memory on the device, in bytes.
*/
uint64_t globalMemorySize;
/**
* The amount of constant memory on the device, in bytes.
*/
uint32_t constantMemorySize;
/**
* The size of the L2 cache on the device, in bytes.
*/
uint32_t l2CacheSize;
/**
* The number of threads per warp on the device.
*/
uint32_t numThreadsPerWarp;
/**
* The core clock rate of the device, in kHz.
*/
uint32_t coreClockRate;
/**
* Number of memory copy engines on the device.
*/
uint32_t numMemcpyEngines;
/**
* Number of multiprocessors on the device.
*/
uint32_t numMultiprocessors;
/**
* The maximum "instructions per cycle" possible on each device
* multiprocessor.
*/
uint32_t maxIPC;
/**
* Maximum number of warps that can be present on a multiprocessor
* at any given time.
*/
uint32_t maxWarpsPerMultiprocessor;
/**
* Maximum number of blocks that can be present on a multiprocessor
* at any given time.
*/
uint32_t maxBlocksPerMultiprocessor;
/**
* Maximum amount of shared memory available per multiprocessor, in bytes.
*/
uint32_t maxSharedMemoryPerMultiprocessor;
/**
* Maximum number of 32-bit registers available per multiprocessor.
*/
uint32_t maxRegistersPerMultiprocessor;
/**
* Maximum number of registers that can be allocated to a block.
*/
uint32_t maxRegistersPerBlock;
/**
* Maximum amount of shared memory that can be assigned to a block,
* in bytes.
*/
uint32_t maxSharedMemoryPerBlock;
/**
* Maximum number of threads allowed in a block.
*/
uint32_t maxThreadsPerBlock;
/**
* Maximum allowed X dimension for a block.
*/
uint32_t maxBlockDimX;
/**
* Maximum allowed Y dimension for a block.
*/
uint32_t maxBlockDimY;
/**
* Maximum allowed Z dimension for a block.
*/
uint32_t maxBlockDimZ;
/**
* Maximum allowed X dimension for a grid.
*/
uint32_t maxGridDimX;
/**
* Maximum allowed Y dimension for a grid.
*/
uint32_t maxGridDimY;
/**
* Maximum allowed Z dimension for a grid.
*/
uint32_t maxGridDimZ;
/**
* Compute capability for the device, major number.
*/
uint32_t computeCapabilityMajor;
/**
* Compute capability for the device, minor number.
*/
uint32_t computeCapabilityMinor;
/**
* The device ID.
*/
uint32_t id;
/**
* ECC enabled flag for device
*/
uint32_t eccEnabled;
/**
* The device UUID. This value is the globally unique immutable
* alphanumeric identifier of the device.
*/
CUuuid uuid;
#ifndef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* The device name. This name is shared across all activity records
* representing instances of the device, and so should not be
* modified.
*/
const char *name;
/**
* Flag to indicate whether the device is visible to CUDA. Users can
* set the device visibility using CUDA_VISIBLE_DEVICES environment
*/
uint8_t isCudaVisible;
uint8_t reserved[7];
} CUpti_ActivityDevice3;
/**
* \brief The activity record for a device. (CUDA 11.6 onwards)
*
* This activity record represents information about a GPU device
* (CUPTI_ACTIVITY_KIND_DEVICE).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
*/
CUpti_ActivityKind kind;
/**
* The flags associated with the device. \see CUpti_ActivityFlag
*/
CUpti_ActivityFlag flags;
/**
* The global memory bandwidth available on the device, in
* kBytes/sec.
*/
uint64_t globalMemoryBandwidth;
/**
* The amount of global memory on the device, in bytes.
*/
uint64_t globalMemorySize;
/**
* The amount of constant memory on the device, in bytes.
*/
uint32_t constantMemorySize;
/**
* The size of the L2 cache on the device, in bytes.
*/
uint32_t l2CacheSize;
/**
* The number of threads per warp on the device.
*/
uint32_t numThreadsPerWarp;
/**
* The core clock rate of the device, in kHz.
*/
uint32_t coreClockRate;
/**
* Number of memory copy engines on the device.
*/
uint32_t numMemcpyEngines;
/**
* Number of multiprocessors on the device.
*/
uint32_t numMultiprocessors;
/**
* The maximum "instructions per cycle" possible on each device
* multiprocessor.
*/
uint32_t maxIPC;
/**
* Maximum number of warps that can be present on a multiprocessor
* at any given time.
*/
uint32_t maxWarpsPerMultiprocessor;
/**
* Maximum number of blocks that can be present on a multiprocessor
* at any given time.
*/
uint32_t maxBlocksPerMultiprocessor;
/**
* Maximum amount of shared memory available per multiprocessor, in bytes.
*/
uint32_t maxSharedMemoryPerMultiprocessor;
/**
* Maximum number of 32-bit registers available per multiprocessor.
*/
uint32_t maxRegistersPerMultiprocessor;
/**
* Maximum number of registers that can be allocated to a block.
*/
uint32_t maxRegistersPerBlock;
/**
* Maximum amount of shared memory that can be assigned to a block,
* in bytes.
*/
uint32_t maxSharedMemoryPerBlock;
/**
* Maximum number of threads allowed in a block.
*/
uint32_t maxThreadsPerBlock;
/**
* Maximum allowed X dimension for a block.
*/
uint32_t maxBlockDimX;
/**
* Maximum allowed Y dimension for a block.
*/
uint32_t maxBlockDimY;
/**
* Maximum allowed Z dimension for a block.
*/
uint32_t maxBlockDimZ;
/**
* Maximum allowed X dimension for a grid.
*/
uint32_t maxGridDimX;
/**
* Maximum allowed Y dimension for a grid.
*/
uint32_t maxGridDimY;
/**
* Maximum allowed Z dimension for a grid.
*/
uint32_t maxGridDimZ;
/**
* Compute capability for the device, major number.
*/
uint32_t computeCapabilityMajor;
/**
* Compute capability for the device, minor number.
*/
uint32_t computeCapabilityMinor;
/**
* The device ID.
*/
uint32_t id;
/**
* ECC enabled flag for device
*/
uint32_t eccEnabled;
/**
* The device UUID. This value is the globally unique immutable
* alphanumeric identifier of the device.
*/
CUuuid uuid;
#ifndef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* The device name. This name is shared across all activity records
* representing instances of the device, and so should not be
* modified.
*/
const char *name;
/**
* Flag to indicate whether the device is visible to CUDA. Users can
* set the device visibility using CUDA_VISIBLE_DEVICES environment
*/
uint8_t isCudaVisible;
/**
* MIG enabled flag for device
*/
uint8_t isMigEnabled;
uint8_t reserved[6];
/**
* GPU Instance id for MIG enabled devices.
* If mig mode is disabled value is set to UINT32_MAX
*/
uint32_t gpuInstanceId;
/**
* Compute Instance id for MIG enabled devices.
* If mig mode is disabled value is set to UINT32_MAX
*/
uint32_t computeInstanceId;
/**
* The MIG UUID. This value is the globally unique immutable
* alphanumeric identifier of the device.
*/
CUuuid migUuid;
} CUpti_ActivityDevice4;
/**
* \brief The activity record for a device attribute.
*
* This activity record represents information about a GPU device:
* either a CUpti_DeviceAttribute or CUdevice_attribute value
* (CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be
* CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE.
*/
CUpti_ActivityKind kind;
/**
* The flags associated with the device. \see CUpti_ActivityFlag
*/
CUpti_ActivityFlag flags;
/**
* The ID of the device that this attribute applies to.
*/
uint32_t deviceId;
/**
* The attribute, either a CUpti_DeviceAttribute or
* CUdevice_attribute. Flag
* CUPTI_ACTIVITY_FLAG_DEVICE_ATTRIBUTE_CUDEVICE is used to indicate
* what kind of attribute this is. If
* CUPTI_ACTIVITY_FLAG_DEVICE_ATTRIBUTE_CUDEVICE is 1 then
* CUdevice_attribute field is value, otherwise
* CUpti_DeviceAttribute field is valid.
*/
union {
CUdevice_attribute cu;
CUpti_DeviceAttribute cupti;
} attribute;
/**
* The value for the attribute. See CUpti_DeviceAttribute and
* CUdevice_attribute for the type of the value for a given
* attribute.
*/
union {
double vDouble;
uint32_t vUint32;
uint64_t vUint64;
int32_t vInt32;
int64_t vInt64;
} value;
} CUpti_ActivityDeviceAttribute;
/**
* \brief The activity record for a context.
*
* This activity record represents information about a context
* (CUPTI_ACTIVITY_KIND_CONTEXT).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_CONTEXT.
*/
CUpti_ActivityKind kind;
/**
* The context ID.
*/
uint32_t contextId;
/**
* The device ID.
*/
uint32_t deviceId;
/**
* The compute API kind. \see CUpti_ActivityComputeApiKind
*/
uint16_t computeApiKind;
/**
* The ID for the NULL stream in this context
*/
uint16_t nullStreamId;
} CUpti_ActivityContext;
/**
* \brief The activity record providing a name.
*
* This activity record provides a name for a device, context, thread,
* etc. and other resource naming done via NVTX APIs
* (CUPTI_ACTIVITY_KIND_NAME).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_NAME.
*/
CUpti_ActivityKind kind;
/**
* The kind of activity object being named.
*/
CUpti_ActivityObjectKind objectKind;
/**
* The identifier for the activity object. 'objectKind' indicates
* which ID is valid for this record.
*/
CUpti_ActivityObjectKindId objectId;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* The name.
*/
const char *name;
} CUpti_ActivityName;
/**
* \brief The activity record providing a marker which is an
* instantaneous point in time. (deprecated in CUDA 8.0)
*
* The marker is specified with a descriptive name and unique id
* (CUPTI_ACTIVITY_KIND_MARKER).
* Marker activity is now reported using the
* CUpti_ActivityMarker2 activity record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MARKER.
*/
CUpti_ActivityKind kind;
/**
* The flags associated with the marker. \see CUpti_ActivityFlag
*/
CUpti_ActivityFlag flags;
/**
* The timestamp for the marker, in ns. A value of 0 indicates that
* timestamp information could not be collected for the marker.
*/
uint64_t timestamp;
/**
* The marker ID.
*/
uint32_t id;
/**
* The kind of activity object associated with this marker.
*/
CUpti_ActivityObjectKind objectKind;
/**
* The identifier for the activity object associated with this
* marker. 'objectKind' indicates which ID is valid for this record.
*/
CUpti_ActivityObjectKindId objectId;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* The marker name for an instantaneous or start marker. This will
* be NULL for an end marker.
*/
const char *name;
} CUpti_ActivityMarker;
/**
* \brief The activity record providing a marker which is an
* instantaneous point in time.
*
* The marker is specified with a descriptive name and unique id
* (CUPTI_ACTIVITY_KIND_MARKER).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MARKER.
*/
CUpti_ActivityKind kind;
/**
* The flags associated with the marker. \see CUpti_ActivityFlag
*/
CUpti_ActivityFlag flags;
/**
* The timestamp for the marker, in ns. A value of 0 indicates that
* timestamp information could not be collected for the marker.
*/
uint64_t timestamp;
/**
* The marker ID.
*/
uint32_t id;
/**
* The kind of activity object associated with this marker.
*/
CUpti_ActivityObjectKind objectKind;
/**
* The identifier for the activity object associated with this
* marker. 'objectKind' indicates which ID is valid for this record.
*/
CUpti_ActivityObjectKindId objectId;
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
/**
* The marker name for an instantaneous or start marker. This will
* be NULL for an end marker.
*/
const char *name;
/**
* The name of the domain to which this marker belongs to.
* This will be NULL for default domain.
*/
const char *domain;
} CUpti_ActivityMarker2;
/**
* \brief The activity record providing detailed information for a marker.
*
* The marker data contains color, payload, and category.
* (CUPTI_ACTIVITY_KIND_MARKER_DATA).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be
* CUPTI_ACTIVITY_KIND_MARKER_DATA.
*/
CUpti_ActivityKind kind;
/**
* The flags associated with the marker. \see CUpti_ActivityFlag
*/
CUpti_ActivityFlag flags;
/**
* The marker ID.
*/
uint32_t id;
/**
* Defines the payload format for the value associated with the marker.
*/
CUpti_MetricValueKind payloadKind;
/**
* The payload value.
*/
CUpti_MetricValue payload;
/**
* The color for the marker.
*/
uint32_t color;
/**
* The category for the marker.
*/
uint32_t category;
} CUpti_ActivityMarkerData;
/**
* \brief The activity record for CUPTI and driver overheads.
*
* This activity record provides CUPTI and driver overhead information
* (CUPTI_ACTIVITY_OVERHEAD).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_OVERHEAD.
*/
CUpti_ActivityKind kind;
/**
* The kind of overhead, CUPTI, DRIVER, COMPILER etc.
*/
CUpti_ActivityOverheadKind overheadKind;
/**
* The kind of activity object that the overhead is associated with.
*/
CUpti_ActivityObjectKind objectKind;
/**
* The identifier for the activity object. 'objectKind' indicates
* which ID is valid for this record.
*/
CUpti_ActivityObjectKindId objectId;
/**
* The start timestamp for the overhead, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the overhead.
*/
uint64_t start;
/**
* The end timestamp for the overhead, in ns. A value of 0 for both
* the start and end timestamps indicates that timestamp information
* could not be collected for the overhead.
*/
uint64_t end;
} CUpti_ActivityOverhead;
/**
* \brief The activity record for CUPTI environmental data.
*
* This activity record provides CUPTI environmental data, include
* power, clocks, and thermals. This information is sampled at
* various rates and returned in this activity record. The consumer
* of the record needs to check the environmentKind field to figure
* out what kind of environmental record this is.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_ENVIRONMENT.
*/
CUpti_ActivityKind kind;
/**
* The ID of the device
*/
uint32_t deviceId;
/**
* The timestamp when this sample was retrieved, in ns. A value of 0
* indicates that timestamp information could not be collected for
* the marker.
*/
uint64_t timestamp;
/**
* The kind of data reported in this record.
*/
CUpti_ActivityEnvironmentKind environmentKind;
union {
/**
* Data returned for CUPTI_ACTIVITY_ENVIRONMENT_SPEED environment
* kind.
*/
struct {
/**
* The SM frequency in MHz
*/
uint32_t smClock;
/**
* The memory frequency in MHz
*/
uint32_t memoryClock;
/**
* The PCIe link generation.
*/
uint32_t pcieLinkGen;
/**
* The PCIe link width.
*/
uint32_t pcieLinkWidth;
/**
* The clocks throttle reasons.
*/
CUpti_EnvironmentClocksThrottleReason clocksThrottleReasons;
} speed;
/**
* Data returned for CUPTI_ACTIVITY_ENVIRONMENT_TEMPERATURE
* environment kind.
*/
struct {
/**
* The GPU temperature in degrees C.
*/
uint32_t gpuTemperature;
} temperature;
/**
* Data returned for CUPTI_ACTIVITY_ENVIRONMENT_POWER environment
* kind.
*/
struct {
/**
* The power in milliwatts consumed by GPU and associated
* circuitry.
*/
uint32_t power;
/**
* The power in milliwatts that will trigger power management
* algorithm.
*/
uint32_t powerLimit;
} power;
/**
* Data returned for CUPTI_ACTIVITY_ENVIRONMENT_COOLING
* environment kind.
*/
struct {
/**
* The fan speed as percentage of maximum.
*/
uint32_t fanSpeed;
} cooling;
} data;
} CUpti_ActivityEnvironment;
/**
* \brief The activity record for source-level instruction execution.
*
* This activity records result for source level instruction execution.
* (CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION.
*/
CUpti_ActivityKind kind;
/**
* The properties of this instruction execution.
*/
CUpti_ActivityFlag flags;
/**
* The ID for source locator.
*/
uint32_t sourceLocatorId;
/**
* The correlation ID of the kernel to which this result is associated.
*/
uint32_t correlationId;
/**
* Correlation ID with global/device function name
*/
uint32_t functionId;
/**
* The pc offset for the instruction.
*/
uint32_t pcOffset;
/**
* This increments each time when this instruction is executed by number
* of threads that executed this instruction, regardless of predicate or condition code.
*/
uint64_t threadsExecuted;
/**
* This increments each time when this instruction is executed by number
* of threads that executed this instruction with predicate and condition code evaluating to true.
*/
uint64_t notPredOffThreadsExecuted;
/**
* The number of times this instruction was executed per warp. It will be incremented
* regardless of predicate or condition code.
*/
uint32_t executed;
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
} CUpti_ActivityInstructionExecution;
/**
* \brief The activity record for PC sampling. (deprecated in CUDA 8.0)
*
* This activity records information obtained by sampling PC
* (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
* PC sampling activities are now reported using the
* CUpti_ActivityPCSampling2 activity record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
*/
CUpti_ActivityKind kind;
/**
* The properties of this instruction.
*/
CUpti_ActivityFlag flags;
/**
* The ID for source locator.
*/
uint32_t sourceLocatorId;
/**
* The correlation ID of the kernel to which this result is associated.
*/
uint32_t correlationId;
/**
* Correlation ID with global/device function name
*/
uint32_t functionId;
/**
* The pc offset for the instruction.
*/
uint32_t pcOffset;
/**
* Number of times the PC was sampled with the stallReason in the record.
* The same PC can be sampled with different stall reasons.
*/
uint32_t samples;
/**
* Current stall reason. Includes one of the reasons from
* \ref CUpti_ActivityPCSamplingStallReason
*/
CUpti_ActivityPCSamplingStallReason stallReason;
} CUpti_ActivityPCSampling;
/**
* \brief The activity record for PC sampling. (deprecated in CUDA 9.0)
*
* This activity records information obtained by sampling PC
* (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
* PC sampling activities are now reported using the
* CUpti_ActivityPCSampling3 activity record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
*/
CUpti_ActivityKind kind;
/**
* The properties of this instruction.
*/
CUpti_ActivityFlag flags;
/**
* The ID for source locator.
*/
uint32_t sourceLocatorId;
/**
* The correlation ID of the kernel to which this result is associated.
*/
uint32_t correlationId;
/**
* Correlation ID with global/device function name
*/
uint32_t functionId;
/**
* The pc offset for the instruction.
*/
uint32_t pcOffset;
/**
* Number of times the PC was sampled with the stallReason in the record.
* These samples indicate that no instruction was issued in that cycle from
* the warp scheduler from where the warp was sampled.
* Field is valid for devices with compute capability 6.0 and higher
*/
uint32_t latencySamples;
/**
* Number of times the PC was sampled with the stallReason in the record.
* The same PC can be sampled with different stall reasons. The count includes
* latencySamples.
*/
uint32_t samples;
/**
* Current stall reason. Includes one of the reasons from
* \ref CUpti_ActivityPCSamplingStallReason
*/
CUpti_ActivityPCSamplingStallReason stallReason;
uint32_t pad;
} CUpti_ActivityPCSampling2;
/**
* \brief The activity record for PC sampling.
*
* This activity records information obtained by sampling PC
* (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
*/
CUpti_ActivityKind kind;
/**
* The properties of this instruction.
*/
CUpti_ActivityFlag flags;
/**
* The ID for source locator.
*/
uint32_t sourceLocatorId;
/**
* The correlation ID of the kernel to which this result is associated.
*/
uint32_t correlationId;
/**
* Correlation ID with global/device function name
*/
uint32_t functionId;
/**
* Number of times the PC was sampled with the stallReason in the record.
* These samples indicate that no instruction was issued in that cycle from
* the warp scheduler from where the warp was sampled.
* Field is valid for devices with compute capability 6.0 and higher
*/
uint32_t latencySamples;
/**
* Number of times the PC was sampled with the stallReason in the record.
* The same PC can be sampled with different stall reasons. The count includes
* latencySamples.
*/
uint32_t samples;
/**
* Current stall reason. Includes one of the reasons from
* \ref CUpti_ActivityPCSamplingStallReason
*/
CUpti_ActivityPCSamplingStallReason stallReason;
/**
* The pc offset for the instruction.
*/
uint64_t pcOffset;
} CUpti_ActivityPCSampling3;
/**
* \brief The activity record for record status for PC sampling.
*
* This activity records information obtained by sampling PC
* (CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO.
*/
CUpti_ActivityKind kind;
/**
* The correlation ID of the kernel to which this result is associated.
*/
uint32_t correlationId;
/**
* Number of times the PC was sampled for this kernel instance including all
* dropped samples.
*/
uint64_t totalSamples;
/**
* Number of samples that were dropped by hardware due to backpressure/overflow.
*/
uint64_t droppedSamples;
/**
* Sampling period in terms of number of cycles .
*/
uint64_t samplingPeriodInCycles;
} CUpti_ActivityPCSamplingRecordInfo;
/**
* \brief The activity record for Unified Memory counters (deprecated in CUDA 7.0)
*
* This activity record represents a Unified Memory counter
* (CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER
*/
CUpti_ActivityKind kind;
/**
* The Unified Memory counter kind. See \ref CUpti_ActivityUnifiedMemoryCounterKind
*/
CUpti_ActivityUnifiedMemoryCounterKind counterKind;
/**
* Scope of the Unified Memory counter. See \ref CUpti_ActivityUnifiedMemoryCounterScope
*/
CUpti_ActivityUnifiedMemoryCounterScope scope;
/**
* The ID of the device involved in the memory transfer operation.
* It is not relevant if the scope of the counter is global (all devices).
*/
uint32_t deviceId;
/**
* Value of the counter
*
*/
uint64_t value;
/**
* The timestamp when this sample was retrieved, in ns. A value of 0
* indicates that timestamp information could not be collected
*/
uint64_t timestamp;
/**
* The ID of the process to which this record belongs to. In case of
* global scope, processId is undefined.
*/
uint32_t processId;
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
} CUpti_ActivityUnifiedMemoryCounter;
/**
* \brief The activity record for Unified Memory counters (CUDA 7.0 and beyond)
*
* This activity record represents a Unified Memory counter
* (CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER
*/
CUpti_ActivityKind kind;
/**
* The Unified Memory counter kind
*/
CUpti_ActivityUnifiedMemoryCounterKind counterKind;
/**
* Value of the counter
* For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD,
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH,
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THREASHING and
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP, it is the size of the
* memory region in bytes.
* For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT, it
* is the number of page fault groups for the same page.
* For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT,
* it is the program counter for the instruction that caused fault.
*/
uint64_t value;
/**
* The start timestamp of the counter, in ns.
* For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH, timestamp is
* captured when activity starts on GPU.
* For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT and
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT, timestamp is
* captured when CUDA driver started processing the fault.
* For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING, timestamp
* is captured when CUDA driver detected thrashing of memory region.
* For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING,
* timestamp is captured when throttling opeeration was started by CUDA driver.
* For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP,
* timestamp is captured when CUDA driver has pushed all required operations
* to the processor specified by dstId.
*/
uint64_t start;
/**
* The end timestamp of the counter, in ns.
* Ignore this field if counterKind is
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT or
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP.
* For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH, timestamp is
* captured when activity finishes on GPU.
* For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT, timestamp is
* captured when CUDA driver queues the replay of faulting memory accesses on the GPU
* For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING, timestamp
* is captured when throttling operation was finished by CUDA driver
*/
uint64_t end;
/**
* This is the virtual base address of the page/s being transferred. For cpu and
* gpu faults, the virtual address for the page that faulted.
*/
uint64_t address;
/**
* The ID of the source CPU/device involved in the memory transfer, page fault, thrashing,
* throttling or remote map operation. For counterKind
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING, it is a bitwise ORing of the
* device IDs fighting for the memory region. Ignore this field if counterKind is
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT
*/
uint32_t srcId;
/**
* The ID of the destination CPU/device involved in the memory transfer or remote map
* operation. Ignore this field if counterKind is
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT or
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT or
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING
*/
uint32_t dstId;
/**
* The ID of the stream causing the transfer.
* This value of this field is invalid.
*/
uint32_t streamId;
/**
* The ID of the process to which this record belongs to.
*/
uint32_t processId;
/**
* The flags associated with this record. See enums \ref CUpti_ActivityUnifiedMemoryAccessType
* if counterKind is CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT
* and \ref CUpti_ActivityUnifiedMemoryMigrationCause if counterKind is
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD or
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD
* and \ref CUpti_ActivityUnifiedMemoryRemoteMapCause if counterKind is
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP and \ref CUpti_ActivityFlag
* if counterKind is CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
* CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING
*/
uint32_t flags;
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
} CUpti_ActivityUnifiedMemoryCounter2;
/**
* \brief The activity record for global/device functions.
*
* This activity records function name and corresponding module
* information.
* (CUPTI_ACTIVITY_KIND_FUNCTION).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_FUNCTION.
*/
CUpti_ActivityKind kind;
/**
* ID to uniquely identify the record
*/
uint32_t id;
/**
* The ID of the context where the function is launched.
*/
uint32_t contextId;
/**
* The module ID in which this global/device function is present.
*/
uint32_t moduleId;
/**
* The function's unique symbol index in the module.
*/
uint32_t functionIndex;
#ifdef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* The name of the function. This name is shared across all activity
* records representing the same kernel, and so should not be
* modified.
*/
const char *name;
} CUpti_ActivityFunction;
/**
* \brief The activity record for a CUDA module.
*
* This activity record represents a CUDA module
* (CUPTI_ACTIVITY_KIND_MODULE). This activity record kind is not
* produced by the activity API but is included for completeness and
* ease-of-use. Profile frameworks built on top of CUPTI that collect
* module data from the module callback may choose to use this type to
* store the collected module data.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_MODULE.
*/
CUpti_ActivityKind kind;
/**
* The ID of the context where the module is loaded.
*/
uint32_t contextId;
/**
* The module ID.
*/
uint32_t id;
/**
* The cubin size.
*/
uint32_t cubinSize;
#ifndef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
#endif
/**
* The pointer to cubin.
*/
const void *cubin;
} CUpti_ActivityModule;
/**
* \brief The activity record for source-level shared
* access.
*
* This activity records the locations of the shared
* accesses in the source
* (CUPTI_ACTIVITY_KIND_SHARED_ACCESS).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_SHARED_ACCESS.
*/
CUpti_ActivityKind kind;
/**
* The properties of this shared access.
*/
CUpti_ActivityFlag flags;
/**
* The ID for source locator.
*/
uint32_t sourceLocatorId;
/**
* The correlation ID of the kernel to which this result is associated.
*/
uint32_t correlationId;
/**
* Correlation ID with global/device function name
*/
uint32_t functionId;
/**
* The pc offset for the access.
*/
uint32_t pcOffset;
/**
* This increments each time when this instruction is executed by number
* of threads that executed this instruction with predicate and condition code evaluating to true.
*/
uint64_t threadsExecuted;
/**
* The total number of shared memory transactions generated by this access
*/
uint64_t sharedTransactions;
/**
* The minimum number of shared memory transactions possible based on the access pattern.
*/
uint64_t theoreticalSharedTransactions;
/**
* The number of times this instruction was executed per warp. It will be incremented
* when at least one of thread among warp is active with predicate and condition code
* evaluating to true.
*/
uint32_t executed;
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
} CUpti_ActivitySharedAccess;
/**
* \brief The activity record for CUDA event.
*
* This activity is used to track recorded events.
* (CUPTI_ACTIVITY_KIND_CUDA_EVENT).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_CUDA_EVENT.
*/
CUpti_ActivityKind kind;
/**
* The correlation ID of the API to which this result is associated.
*/
uint32_t correlationId;
/**
* The ID of the context where the event was recorded.
*/
uint32_t contextId;
/**
* The compute stream where the event was recorded.
*/
uint32_t streamId;
/**
* A unique event ID to identify the event record.
*/
uint32_t eventId;
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
} CUpti_ActivityCudaEvent;
/**
* \brief The activity record for CUDA stream.
*
* This activity is used to track created streams.
* (CUPTI_ACTIVITY_KIND_STREAM).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_STREAM.
*/
CUpti_ActivityKind kind;
/**
* The ID of the context where the stream was created.
*/
uint32_t contextId;
/**
* A unique stream ID to identify the stream.
*/
uint32_t streamId;
/**
* The clamped priority for the stream.
*/
uint32_t priority;
/**
* Flags associated with the stream.
*/
CUpti_ActivityStreamFlag flag;
/**
* The correlation ID of the API to which this result is associated.
*/
uint32_t correlationId;
} CUpti_ActivityStream;
/**
* \brief The activity record for synchronization management.
*
* This activity is used to track various CUDA synchronization APIs.
* (CUPTI_ACTIVITY_KIND_SYNCHRONIZATION).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_SYNCHRONIZATION.
*/
CUpti_ActivityKind kind;
/**
* The type of record.
*/
CUpti_ActivitySynchronizationType type;
/**
* The start timestamp for the function, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the function.
*/
uint64_t start;
/**
* The end timestamp for the function, in ns. A value of 0 for both
* the start and end timestamps indicates that timestamp information
* could not be collected for the function.
*/
uint64_t end;
/**
* The correlation ID of the API to which this result is associated.
*/
uint32_t correlationId;
/**
* The ID of the context for which the synchronization API is called.
* In case of context synchronization API it is the context id for which the API is called.
* In case of stream/event synchronization it is the ID of the context where the stream/event was created.
*/
uint32_t contextId;
/**
* The compute stream for which the synchronization API is called.
* A CUPTI_SYNCHRONIZATION_INVALID_VALUE value indicate the field is not applicable for this record.
* Not valid for cuCtxSynchronize, cuEventSynchronize.
*/
uint32_t streamId;
/**
* The event ID for which the synchronization API is called.
* A CUPTI_SYNCHRONIZATION_INVALID_VALUE value indicate the field is not applicable for this record.
* Not valid for cuCtxSynchronize, cuStreamSynchronize.
*/
uint32_t cudaEventId;
} CUpti_ActivitySynchronization;
/**
* \brief The activity record for source-level sass/source
* line-by-line correlation.
*
* This activity records source level sass/source correlation
* information.
* (CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION.
*/
CUpti_ActivityKind kind;
/**
* The properties of this instruction.
*/
CUpti_ActivityFlag flags;
/**
* The ID for source locator.
*/
uint32_t sourceLocatorId;
/**
* Correlation ID with global/device function name
*/
uint32_t functionId;
/**
* The pc offset for the instruction.
*/
uint32_t pcOffset;
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad;
} CUpti_ActivityInstructionCorrelation;
/**
* \brief The OpenAcc event kind for OpenAcc activity records.
*
* \see CUpti_ActivityKindOpenAcc
*/
typedef enum {
CUPTI_OPENACC_EVENT_KIND_INVALID = 0,
CUPTI_OPENACC_EVENT_KIND_DEVICE_INIT = 1,
CUPTI_OPENACC_EVENT_KIND_DEVICE_SHUTDOWN = 2,
CUPTI_OPENACC_EVENT_KIND_RUNTIME_SHUTDOWN = 3,
CUPTI_OPENACC_EVENT_KIND_ENQUEUE_LAUNCH = 4,
CUPTI_OPENACC_EVENT_KIND_ENQUEUE_UPLOAD = 5,
CUPTI_OPENACC_EVENT_KIND_ENQUEUE_DOWNLOAD = 6,
CUPTI_OPENACC_EVENT_KIND_WAIT = 7,
CUPTI_OPENACC_EVENT_KIND_IMPLICIT_WAIT = 8,
CUPTI_OPENACC_EVENT_KIND_COMPUTE_CONSTRUCT = 9,
CUPTI_OPENACC_EVENT_KIND_UPDATE = 10,
CUPTI_OPENACC_EVENT_KIND_ENTER_DATA = 11,
CUPTI_OPENACC_EVENT_KIND_EXIT_DATA = 12,
CUPTI_OPENACC_EVENT_KIND_CREATE = 13,
CUPTI_OPENACC_EVENT_KIND_DELETE = 14,
CUPTI_OPENACC_EVENT_KIND_ALLOC = 15,
CUPTI_OPENACC_EVENT_KIND_FREE = 16,
CUPTI_OPENACC_EVENT_KIND_FORCE_INT = 0x7fffffff
} CUpti_OpenAccEventKind;
/**
* \brief The OpenAcc parent construct kind for OpenAcc activity records.
*/
typedef enum {
CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN = 0,
CUPTI_OPENACC_CONSTRUCT_KIND_PARALLEL = 1,
CUPTI_OPENACC_CONSTRUCT_KIND_KERNELS = 2,
CUPTI_OPENACC_CONSTRUCT_KIND_LOOP = 3,
CUPTI_OPENACC_CONSTRUCT_KIND_DATA = 4,
CUPTI_OPENACC_CONSTRUCT_KIND_ENTER_DATA = 5,
CUPTI_OPENACC_CONSTRUCT_KIND_EXIT_DATA = 6,
CUPTI_OPENACC_CONSTRUCT_KIND_HOST_DATA = 7,
CUPTI_OPENACC_CONSTRUCT_KIND_ATOMIC = 8,
CUPTI_OPENACC_CONSTRUCT_KIND_DECLARE = 9,
CUPTI_OPENACC_CONSTRUCT_KIND_INIT = 10,
CUPTI_OPENACC_CONSTRUCT_KIND_SHUTDOWN = 11,
CUPTI_OPENACC_CONSTRUCT_KIND_SET = 12,
CUPTI_OPENACC_CONSTRUCT_KIND_UPDATE = 13,
CUPTI_OPENACC_CONSTRUCT_KIND_ROUTINE = 14,
CUPTI_OPENACC_CONSTRUCT_KIND_WAIT = 15,
CUPTI_OPENACC_CONSTRUCT_KIND_RUNTIME_API = 16,
CUPTI_OPENACC_CONSTRUCT_KIND_FORCE_INT = 0x7fffffff
} CUpti_OpenAccConstructKind;
typedef enum {
CUPTI_OPENMP_EVENT_KIND_INVALID = 0,
CUPTI_OPENMP_EVENT_KIND_PARALLEL = 1,
CUPTI_OPENMP_EVENT_KIND_TASK = 2,
CUPTI_OPENMP_EVENT_KIND_THREAD = 3,
CUPTI_OPENMP_EVENT_KIND_IDLE = 4,
CUPTI_OPENMP_EVENT_KIND_WAIT_BARRIER = 5,
CUPTI_OPENMP_EVENT_KIND_WAIT_TASKWAIT = 6,
CUPTI_OPENMP_EVENT_KIND_FORCE_INT = 0x7fffffff
} CUpti_OpenMpEventKind;
/**
* \brief The base activity record for OpenAcc records.
*
* The OpenACC activity API part uses a CUpti_ActivityOpenAcc as a generic
* representation for any OpenACC activity. The 'kind' field is used to determine the
* specific activity kind, and from that the CUpti_ActivityOpenAcc object can
* be cast to the specific OpenACC activity record type appropriate for that kind.
*
* Note that all OpenACC activity record types are padded and aligned to
* ensure that each member of the record is naturally aligned.
*
* \see CUpti_ActivityKind
*/
typedef struct PACKED_ALIGNMENT {
/**
* The kind of this activity.
*/
CUpti_ActivityKind kind;
/**
* CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind)
*/
CUpti_OpenAccEventKind eventKind;
/**
* CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind)
*
* Note that for applications using PGI OpenACC runtime < 16.1, this
* will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
*/
CUpti_OpenAccConstructKind parentConstruct;
/**
* Version number
*/
uint32_t version;
/**
* 1 for any implicit event, such as an implicit wait at a synchronous data construct
* 0 otherwise
*/
uint32_t implicit;
/**
* Device type
*/
uint32_t deviceType;
/**
* Device number
*/
uint32_t deviceNumber;
/**
* ThreadId
*/
uint32_t threadId;
/**
* Value of async() clause of the corresponding directive
*/
uint64_t async;
/**
* Internal asynchronous queue number used
*/
uint64_t asyncMap;
/**
* The line number of the directive or program construct or the starting line
* number of the OpenACC construct corresponding to the event.
* A zero value means the line number is not known.
*/
uint32_t lineNo;
/**
* For an OpenACC construct, this contains the line number of the end
* of the construct. A zero value means the line number is not known.
*/
uint32_t endLineNo;
/**
* The line number of the first line of the function named in funcName.
* A zero value means the line number is not known.
*/
uint32_t funcLineNo;
/**
* The last line number of the function named in funcName.
* A zero value means the line number is not known.
*/
uint32_t funcEndLineNo;
/**
* CUPTI start timestamp
*/
uint64_t start;
/**
* CUPTI end timestamp
*/
uint64_t end;
/**
* CUDA device id
* Valid only if deviceType is acc_device_nvidia.
*/
uint32_t cuDeviceId;
/**
* CUDA context id
* Valid only if deviceType is acc_device_nvidia.
*/
uint32_t cuContextId;
/**
* CUDA stream id
* Valid only if deviceType is acc_device_nvidia.
*/
uint32_t cuStreamId;
/**
* The ID of the process where the OpenACC activity is executing.
*/
uint32_t cuProcessId;
/**
* The ID of the thread where the OpenACC activity is executing.
*/
uint32_t cuThreadId;
/**
* The OpenACC correlation ID.
* Valid only if deviceType is acc_device_nvidia.
* If not 0, it uniquely identifies this record. It is identical to the
* externalId in the preceeding external correlation record of type
* CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
*/
uint32_t externalId;
/*
* A pointer to null-terminated string containing the name of or path to
* the source file, if known, or a null pointer if not.
*/
const char *srcFile;
/*
* A pointer to a null-terminated string containing the name of the
* function in which the event occurred.
*/
const char *funcName;
} CUpti_ActivityOpenAcc;
/**
* \brief The activity record for OpenACC data.
*
* (CUPTI_ACTIVITY_KIND_OPENACC_DATA).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_OPENACC_DATA.
*/
CUpti_ActivityKind kind;
/**
* CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind)
*/
CUpti_OpenAccEventKind eventKind;
/*
* CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind)
*
* Note that for applications using PGI OpenACC runtime < 16.1, this
* will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
*/
CUpti_OpenAccConstructKind parentConstruct;
/*
* Version number
*/
uint32_t version;
/*
* 1 for any implicit event, such as an implicit wait at a synchronous data construct
* 0 otherwise
*/
uint32_t implicit;
/*
* Device type
*/
uint32_t deviceType;
/*
* Device number
*/
uint32_t deviceNumber;
/**
* ThreadId
*/
uint32_t threadId;
/*
* Value of async() clause of the corresponding directive
*/
uint64_t async;
/*
* Internal asynchronous queue number used
*/
uint64_t asyncMap;
/*
* The line number of the directive or program construct or the starting line
* number of the OpenACC construct corresponding to the event.
* A negative or zero value means the line number is not known.
*/
uint32_t lineNo;
/*
* For an OpenACC construct, this contains the line number of the end
* of the construct. A negative or zero value means the line number is not known.
*/
uint32_t endLineNo;
/*
* The line number of the first line of the function named in func_name.
* A negative or zero value means the line number is not known.
*/
uint32_t funcLineNo;
/*
* The last line number of the function named in func_name.
* A negative or zero value means the line number is not known.
*/
uint32_t funcEndLineNo;
/**
* CUPTI start timestamp
*/
uint64_t start;
/**
* CUPTI end timestamp
*/
uint64_t end;
/**
* CUDA device id
* Valid only if deviceType is acc_device_nvidia.
*/
uint32_t cuDeviceId;
/**
* CUDA context id
* Valid only if deviceType is acc_device_nvidia.
*/
uint32_t cuContextId;
/**
* CUDA stream id
* Valid only if deviceType is acc_device_nvidia.
*/
uint32_t cuStreamId;
/**
* The ID of the process where the OpenACC activity is executing.
*/
uint32_t cuProcessId;
/**
* The ID of the thread where the OpenACC activity is executing.
*/
uint32_t cuThreadId;
/**
* The OpenACC correlation ID.
* Valid only if deviceType is acc_device_nvidia.
* If not 0, it uniquely identifies this record. It is identical to the
* externalId in the preceeding external correlation record of type
* CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
*/
uint32_t externalId;
/*
* A pointer to null-terminated string containing the name of or path to
* the source file, if known, or a null pointer if not.
*/
const char *srcFile;
/*
* A pointer to a null-terminated string containing the name of the
* function in which the event occurred.
*/
const char *funcName;
/* --- end of common CUpti_ActivityOpenAcc part --- */
/**
* Number of bytes
*/
uint64_t bytes;
/**
* Host pointer if available
*/
uint64_t hostPtr;
/**
* Device pointer if available
*/
uint64_t devicePtr;
#ifndef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad1;
#endif
/*
* A pointer to null-terminated string containing the name of the variable
* for which this event is triggered, if known, or a null pointer if not.
*/
const char *varName;
} CUpti_ActivityOpenAccData;
/**
* \brief The activity record for OpenACC launch.
*
* (CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH.
*/
CUpti_ActivityKind kind;
/**
* CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind)
*/
CUpti_OpenAccEventKind eventKind;
/**
* CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind)
*
* Note that for applications using PGI OpenACC runtime < 16.1, this
* will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
*/
CUpti_OpenAccConstructKind parentConstruct;
/**
* Version number
*/
uint32_t version;
/**
* 1 for any implicit event, such as an implicit wait at a synchronous data construct
* 0 otherwise
*/
uint32_t implicit;
/**
* Device type
*/
uint32_t deviceType;
/**
* Device number
*/
uint32_t deviceNumber;
/**
* ThreadId
*/
uint32_t threadId;
/**
* Value of async() clause of the corresponding directive
*/
uint64_t async;
/**
* Internal asynchronous queue number used
*/
uint64_t asyncMap;
/**
* The line number of the directive or program construct or the starting line
* number of the OpenACC construct corresponding to the event.
* A negative or zero value means the line number is not known.
*/
uint32_t lineNo;
/**
* For an OpenACC construct, this contains the line number of the end
* of the construct. A negative or zero value means the line number is not known.
*/
uint32_t endLineNo;
/**
* The line number of the first line of the function named in func_name.
* A negative or zero value means the line number is not known.
*/
uint32_t funcLineNo;
/**
* The last line number of the function named in func_name.
* A negative or zero value means the line number is not known.
*/
uint32_t funcEndLineNo;
/**
* CUPTI start timestamp
*/
uint64_t start;
/**
* CUPTI end timestamp
*/
uint64_t end;
/**
* CUDA device id
* Valid only if deviceType is acc_device_nvidia.
*/
uint32_t cuDeviceId;
/**
* CUDA context id
* Valid only if deviceType is acc_device_nvidia.
*/
uint32_t cuContextId;
/**
* CUDA stream id
* Valid only if deviceType is acc_device_nvidia.
*/
uint32_t cuStreamId;
/**
* The ID of the process where the OpenACC activity is executing.
*/
uint32_t cuProcessId;
/**
* The ID of the thread where the OpenACC activity is executing.
*/
uint32_t cuThreadId;
/**
* The OpenACC correlation ID.
* Valid only if deviceType is acc_device_nvidia.
* If not 0, it uniquely identifies this record. It is identical to the
* externalId in the preceeding external correlation record of type
* CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
*/
uint32_t externalId;
/**
* A pointer to null-terminated string containing the name of or path to
* the source file, if known, or a null pointer if not.
*/
const char *srcFile;
/**
* A pointer to a null-terminated string containing the name of the
* function in which the event occurred.
*/
const char *funcName;
/* --- end of common CUpti_ActivityOpenAcc part --- */
/**
* The number of gangs created for this kernel launch
*/
uint64_t numGangs;
/**
* The number of workers created for this kernel launch
*/
uint64_t numWorkers;
/**
* The number of vector lanes created for this kernel launch
*/
uint64_t vectorLength;
#ifndef CUPTILP64
/**
* Undefined. Reserved for internal use.
*/
uint32_t pad1;
#endif
/**
* A pointer to null-terminated string containing the name of the
* kernel being launched, if known, or a null pointer if not.
*/
const char *kernelName;
} CUpti_ActivityOpenAccLaunch;
/**
* \brief The activity record for OpenACC other.
*
* (CUPTI_ACTIVITY_KIND_OPENACC_OTHER).
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_OPENACC_OTHER.
*/
CUpti_ActivityKind kind;
/**
* CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind)
*/
CUpti_OpenAccEventKind eventKind;
/**
* CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind)
*
* Note that for applications using PGI OpenACC runtime < 16.1, this
* will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
*/
CUpti_OpenAccConstructKind parentConstruct;
/**
* Version number
*/
uint32_t version;
/**
* 1 for any implicit event, such as an implicit wait at a synchronous data construct
* 0 otherwise
*/
uint32_t implicit;
/**
* Device type
*/
uint32_t deviceType;
/**
* Device number
*/
uint32_t deviceNumber;
/**
* ThreadId
*/
uint32_t threadId;
/**
* Value of async() clause of the corresponding directive
*/
uint64_t async;
/**
* Internal asynchronous queue number used
*/
uint64_t asyncMap;
/**
* The line number of the directive or program construct or the starting line
* number of the OpenACC construct corresponding to the event.
* A negative or zero value means the line number is not known.
*/
uint32_t lineNo;
/**
* For an OpenACC construct, this contains the line number of the end
* of the construct. A negative or zero value means the line number is not known.
*/
uint32_t endLineNo;
/**
* The line number of the first line of the function named in func_name.
* A negative or zero value means the line number is not known.
*/
uint32_t funcLineNo;
/**
* The last line number of the function named in func_name.
* A negative or zero value means the line number is not known.
*/
uint32_t funcEndLineNo;
/**
* CUPTI start timestamp
*/
uint64_t start;
/**
* CUPTI end timestamp
*/
uint64_t end;
/**
* CUDA device id
* Valid only if deviceType is acc_device_nvidia.
*/
uint32_t cuDeviceId;
/**
* CUDA context id
* Valid only if deviceType is acc_device_nvidia.
*/
uint32_t cuContextId;
/**
* CUDA stream id
* Valid only if deviceType is acc_device_nvidia.
*/
uint32_t cuStreamId;
/**
* The ID of the process where the OpenACC activity is executing.
*/
uint32_t cuProcessId;
/**
* The ID of the thread where the OpenACC activity is executing.
*/
uint32_t cuThreadId;
/**
* The OpenACC correlation ID.
* Valid only if deviceType is acc_device_nvidia.
* If not 0, it uniquely identifies this record. It is identical to the
* externalId in the preceeding external correlation record of type
* CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
*/
uint32_t externalId;
/**
* A pointer to null-terminated string containing the name of or path to
* the source file, if known, or a null pointer if not.
*/
const char *srcFile;
/**
* A pointer to a null-terminated string containing the name of the
* function in which the event occurred.
*/
const char *funcName;
/* --- end of common CUpti_ActivityOpenAcc part --- */
} CUpti_ActivityOpenAccOther;
/**
* \brief The base activity record for OpenMp records.
*
* \see CUpti_ActivityKind
*/
typedef struct PACKED_ALIGNMENT {
/**
* The kind of this activity.
*/
CUpti_ActivityKind kind;
/**
* CUPTI OpenMP event kind (\see CUpti_OpenMpEventKind)
*/
CUpti_OpenMpEventKind eventKind;
/**
* Version number
*/
uint32_t version;
/**
* ThreadId
*/
uint32_t threadId;
/**
* CUPTI start timestamp
*/
uint64_t start;
/**
* CUPTI end timestamp
*/
uint64_t end;
/**
* The ID of the process where the OpenMP activity is executing.
*/
uint32_t cuProcessId;
/**
* The ID of the thread where the OpenMP activity is executing.
*/
uint32_t cuThreadId;
} CUpti_ActivityOpenMp;
/**
* \brief The kind of external APIs supported for correlation.
*
* Custom correlation kinds are reserved for usage in external tools.
*
* \see CUpti_ActivityExternalCorrelation
*/
typedef enum {
CUPTI_EXTERNAL_CORRELATION_KIND_INVALID = 0,
/**
* The external API is unknown to CUPTI
*/
CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN = 1,
/**
* The external API is OpenACC
*/
CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC = 2,
/**
* The external API is custom0
*/
CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0 = 3,
/**
* The external API is custom1
*/
CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1 = 4,
/**
* The external API is custom2
*/
CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM2 = 5,
/**
* Add new kinds before this line
*/
CUPTI_EXTERNAL_CORRELATION_KIND_SIZE,
CUPTI_EXTERNAL_CORRELATION_KIND_FORCE_INT = 0x7fffffff
} CUpti_ExternalCorrelationKind;
/**
* \brief The activity record for correlation with external records
*
* This activity record correlates native CUDA records (e.g. CUDA Driver API,
* kernels, memcpys, ...) with records from external APIs such as OpenACC.
* (CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION).
*
* \see CUpti_ActivityKind
*/
typedef struct PACKED_ALIGNMENT {
/**
* The kind of this activity.
*/
CUpti_ActivityKind kind;
/**
* The kind of external API this record correlated to.
*/
CUpti_ExternalCorrelationKind externalKind;
/**
* The correlation ID of the associated non-CUDA API record.
* The exact field in the associated external record depends
* on that record's activity kind (\see externalKind).
*/
uint64_t externalId;
/**
* The correlation ID of the associated CUDA driver or runtime API record.
*/
uint32_t correlationId;
/**
* Undefined. Reserved for internal use.
*/
uint32_t reserved;
} CUpti_ActivityExternalCorrelation;
/**
* \brief The device type for device connected to NVLink.
*/
typedef enum {
CUPTI_DEV_TYPE_INVALID = 0,
/**
* The device type is GPU.
*/
CUPTI_DEV_TYPE_GPU = 1,
/**
* The device type is NVLink processing unit in CPU.
*/
CUPTI_DEV_TYPE_NPU = 2,
CUPTI_DEV_TYPE_FORCE_INT = 0x7fffffff
} CUpti_DevType;
/**
* \brief NVLink information. (deprecated in CUDA 9.0)
*
* This structure gives capabilities of each logical NVLink connection between two devices,
* gpu<->gpu or gpu<->CPU which can be used to understand the topology.
* NVLink information are now reported using the
* CUpti_ActivityNvLink2 activity record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
*/
CUpti_ActivityKind kind;
/**
* NVLink version.
*/
uint32_t nvlinkVersion;
/**
* Type of device 0 \ref CUpti_DevType
*/
CUpti_DevType typeDev0;
/**
* Type of device 1 \ref CUpti_DevType
*/
CUpti_DevType typeDev1;
/**
* If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice4.
* If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
*/
union {
CUuuid uuidDev;
struct {
/**
* Index of the NPU. First index will always be zero.
*/
uint32_t index;
/**
* Domain ID of NPU. On Linux, this can be queried using lspci.
*/
uint32_t domainId;
} npu;
} idDev0;
/**
* If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice4.
* If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
*/
union {
CUuuid uuidDev;
struct {
/**
* Index of the NPU. First index will always be zero.
*/
uint32_t index;
/**
* Domain ID of NPU. On Linux, this can be queried using lspci.
*/
uint32_t domainId;
} npu;
} idDev1;
/**
* Flag gives capabilities of the link \see CUpti_LinkFlag
*/
uint32_t flag;
/**
* Number of physical NVLinks present between two devices.
*/
uint32_t physicalNvLinkCount;
/**
* Port numbers for maximum 4 NVLinks connected to device 0.
* If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
* In case of invalid/unknown port number, this field will be set
* to value CUPTI_NVLINK_INVALID_PORT.
* This will be used to correlate the metric values to individual
* physical link and attribute traffic to the logical NVLink in
* the topology.
*/
int8_t portDev0[4];
/**
* Port numbers for maximum 4 NVLinks connected to device 1.
* If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
* In case of invalid/unknown port number, this field will be set
* to value CUPTI_NVLINK_INVALID_PORT.
* This will be used to correlate the metric values to individual
* physical link and attribute traffic to the logical NVLink in
* the topology.
*/
int8_t portDev1[4];
/**
* Banwidth of NVLink in kbytes/sec
*/
uint64_t bandwidth;
} CUpti_ActivityNvLink;
/**
* \brief NVLink information. (deprecated in CUDA 10.0)
*
* This structure gives capabilities of each logical NVLink connection between two devices,
* gpu<->gpu or gpu<->CPU which can be used to understand the topology.
* NvLink information are now reported using the
* CUpti_ActivityNvLink4 activity record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
*/
CUpti_ActivityKind kind;
/**
* NvLink version.
*/
uint32_t nvlinkVersion;
/**
* Type of device 0 \ref CUpti_DevType
*/
CUpti_DevType typeDev0;
/**
* Type of device 1 \ref CUpti_DevType
*/
CUpti_DevType typeDev1;
/**
* If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice4.
* If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
*/
union {
CUuuid uuidDev;
struct {
/**
* Index of the NPU. First index will always be zero.
*/
uint32_t index;
/**
* Domain ID of NPU. On Linux, this can be queried using lspci.
*/
uint32_t domainId;
} npu;
} idDev0;
/**
* If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice4.
* If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
*/
union {
CUuuid uuidDev;
struct {
/**
* Index of the NPU. First index will always be zero.
*/
uint32_t index;
/**
* Domain ID of NPU. On Linux, this can be queried using lspci.
*/
uint32_t domainId;
} npu;
} idDev1;
/**
* Flag gives capabilities of the link \see CUpti_LinkFlag
*/
uint32_t flag;
/**
* Number of physical NVLinks present between two devices.
*/
uint32_t physicalNvLinkCount;
/**
* Port numbers for maximum 16 NVLinks connected to device 0.
* If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
* In case of invalid/unknown port number, this field will be set
* to value CUPTI_NVLINK_INVALID_PORT.
* This will be used to correlate the metric values to individual
* physical link and attribute traffic to the logical NVLink in
* the topology.
*/
int8_t portDev0[CUPTI_MAX_NVLINK_PORTS];
/**
* Port numbers for maximum 16 NVLinks connected to device 1.
* If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
* In case of invalid/unknown port number, this field will be set
* to value CUPTI_NVLINK_INVALID_PORT.
* This will be used to correlate the metric values to individual
* physical link and attribute traffic to the logical NVLink in
* the topology.
*/
int8_t portDev1[CUPTI_MAX_NVLINK_PORTS];
/**
* Banwidth of NVLink in kbytes/sec
*/
uint64_t bandwidth;
} CUpti_ActivityNvLink2;
/**
* \brief NVLink information.
*
* This structure gives capabilities of each logical NVLink connection between two devices,
* gpu<->gpu or gpu<->CPU which can be used to understand the topology.
* NvLink information are now reported using the
* CUpti_ActivityNvLink4 activity record.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
*/
CUpti_ActivityKind kind;
/**
* NvLink version.
*/
uint32_t nvlinkVersion;
/**
* Type of device 0 \ref CUpti_DevType
*/
CUpti_DevType typeDev0;
/**
* Type of device 1 \ref CUpti_DevType
*/
CUpti_DevType typeDev1;
/**
* If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice4.
* If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
*/
union {
CUuuid uuidDev;
struct {
/**
* Index of the NPU. First index will always be zero.
*/
uint32_t index;
/**
* Domain ID of NPU. On Linux, this can be queried using lspci.
*/
uint32_t domainId;
} npu;
} idDev0;
/**
* If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice4.
* If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
*/
union {
CUuuid uuidDev;
struct {
/**
* Index of the NPU. First index will always be zero.
*/
uint32_t index;
/**
* Domain ID of NPU. On Linux, this can be queried using lspci.
*/
uint32_t domainId;
} npu;
} idDev1;
/**
* Flag gives capabilities of the link \see CUpti_LinkFlag
*/
uint32_t flag;
/**
* Number of physical NVLinks present between two devices.
*/
uint32_t physicalNvLinkCount;
/**
* Port numbers for maximum 16 NVLinks connected to device 0.
* If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
* In case of invalid/unknown port number, this field will be set
* to value CUPTI_NVLINK_INVALID_PORT.
* This will be used to correlate the metric values to individual
* physical link and attribute traffic to the logical NVLink in
* the topology.
*/
int8_t portDev0[CUPTI_MAX_NVLINK_PORTS];
/**
* Port numbers for maximum 16 NVLinks connected to device 1.
* If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
* In case of invalid/unknown port number, this field will be set
* to value CUPTI_NVLINK_INVALID_PORT.
* This will be used to correlate the metric values to individual
* physical link and attribute traffic to the logical NVLink in
* the topology.
*/
int8_t portDev1[CUPTI_MAX_NVLINK_PORTS];
/**
* Banwidth of NVLink in kbytes/sec
*/
uint64_t bandwidth;
/**
* NVSwitch is connected as an intermediate node.
*/
uint8_t nvswitchConnected;
/**
* Undefined. reserved for internal use
*/
uint8_t pad[7];
} CUpti_ActivityNvLink3;
/**
* \brief NVLink information.
*
* This structure gives capabilities of each logical NVLink connection between two devices,
* gpu<->gpu or gpu<->CPU which can be used to understand the topology.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
*/
CUpti_ActivityKind kind;
/**
* NvLink version.
*/
uint32_t nvlinkVersion;
/**
* Type of device 0 \ref CUpti_DevType
*/
CUpti_DevType typeDev0;
/**
* Type of device 1 \ref CUpti_DevType
*/
CUpti_DevType typeDev1;
/**
* If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice4.
* If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
*/
union {
CUuuid uuidDev;
struct {
/**
* Index of the NPU. First index will always be zero.
*/
uint32_t index;
/**
* Domain ID of NPU. On Linux, this can be queried using lspci.
*/
uint32_t domainId;
} npu;
} idDev0;
/**
* If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice4.
* If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
*/
union {
CUuuid uuidDev;
struct {
/**
* Index of the NPU. First index will always be zero.
*/
uint32_t index;
/**
* Domain ID of NPU. On Linux, this can be queried using lspci.
*/
uint32_t domainId;
} npu;
} idDev1;
/**
* Flag gives capabilities of the link \see CUpti_LinkFlag
*/
uint32_t flag;
/**
* Number of physical NVLinks present between two devices.
*/
uint32_t physicalNvLinkCount;
/**
* Port numbers for maximum 32 NVLinks connected to device 0.
* If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
* In case of invalid/unknown port number, this field will be set
* to value CUPTI_NVLINK_INVALID_PORT.
* This will be used to correlate the metric values to individual
* physical link and attribute traffic to the logical NVLink in
* the topology.
*/
int8_t portDev0[CUPTI_MAX_NVLINK_PORTS];
/**
* Port numbers for maximum 32 NVLinks connected to device 1.
* If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
* In case of invalid/unknown port number, this field will be set
* to value CUPTI_NVLINK_INVALID_PORT.
* This will be used to correlate the metric values to individual
* physical link and attribute traffic to the logical NVLink in
* the topology.
*/
int8_t portDev1[CUPTI_MAX_NVLINK_PORTS];
/**
* Banwidth of NVLink in kbytes/sec
*/
uint64_t bandwidth;
/**
* NVSwitch is connected as an intermediate node.
*/
uint8_t nvswitchConnected;
/**
* Undefined. reserved for internal use
*/
uint8_t pad[7];
} CUpti_ActivityNvLink4;
#define CUPTI_MAX_GPUS 32
/**
* Field to differentiate whether PCIE Activity record
* is of a GPU or a PCI Bridge
*/
typedef enum {
/**
* PCIE GPU record
*/
CUPTI_PCIE_DEVICE_TYPE_GPU = 0,
/**
* PCIE Bridge record
*/
CUPTI_PCIE_DEVICE_TYPE_BRIDGE = 1,
CUPTI_PCIE_DEVICE_TYPE_FORCE_INT = 0x7fffffff
} CUpti_PcieDeviceType;
/**
* \brief PCI devices information required to construct topology
*
* This structure gives capabilities of GPU and PCI bridge connected to the PCIE bus
* which can be used to understand the topology.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_PCIE.
*/
CUpti_ActivityKind kind;
/**
* Type of device in topology, \ref CUpti_PcieDeviceType. If type is
* CUPTI_PCIE_DEVICE_TYPE_GPU use devId for id and gpuAttr and if type is
* CUPTI_PCIE_DEVICE_TYPE_BRIDGE use bridgeId for id and bridgeAttr.
*/
CUpti_PcieDeviceType type;
/**
* A unique identifier for GPU or Bridge in Topology
*/
union {
/**
* GPU device ID
*/
CUdevice devId;
/**
* A unique identifier for Bridge in the Topology
*/
uint32_t bridgeId;
} id;
/**
* Domain for the GPU or Bridge, required to identify which PCIE bus it belongs to in
* multiple NUMA systems.
*/
uint32_t domain;
/**
* PCIE Generation of GPU or Bridge.
*/
uint16_t pcieGeneration;
/**
* Link rate of the GPU or bridge in gigatransfers per second (GT/s)
*/
uint16_t linkRate;
/**
* Link width of the GPU or bridge
*/
uint16_t linkWidth;
/**
* Upstream bus ID for the GPU or PCI bridge. Required to identify which bus it is
* connected to in the topology.
*/
uint16_t upstreamBus;
/**
* Attributes for more information about GPU (gpuAttr) or PCI Bridge (bridgeAttr)
*/
union {
struct {
/**
* UUID for the device. \ref CUpti_ActivityDevice4.
*/
CUuuid uuidDev;
/**
* CUdevice with which this device has P2P capability.
* This can also be obtained by querying cuDeviceCanAccessPeer or
* cudaDeviceCanAccessPeer APIs
*/
CUdevice peerDev[CUPTI_MAX_GPUS];
} gpuAttr;
struct {
/**
* The downstream bus number, used to search downstream devices/bridges connected
* to this bridge.
*/
uint16_t secondaryBus;
/**
* Device ID of the bridge
*/
uint16_t deviceId;
/**
* Vendor ID of the bridge
*/
uint16_t vendorId;
/**
* Padding for alignment
*/
uint16_t pad0;
} bridgeAttr;
} attr;
} CUpti_ActivityPcie;
/**
* \brief PCIE Generation.
*
* Enumeration of PCIE Generation for
* pcie activity attribute pcieGeneration
*/
typedef enum {
/**
* PCIE Generation 1
*/
CUPTI_PCIE_GEN_GEN1 = 1,
/**
* PCIE Generation 2
*/
CUPTI_PCIE_GEN_GEN2 = 2,
/**
* PCIE Generation 3
*/
CUPTI_PCIE_GEN_GEN3 = 3,
/**
* PCIE Generation 4
*/
CUPTI_PCIE_GEN_GEN4 = 4,
/**
* PCIE Generation 5
*/
CUPTI_PCIE_GEN_GEN5 = 5,
CUPTI_PCIE_GEN_FORCE_INT = 0x7fffffff
} CUpti_PcieGen;
/**
* \brief The activity record for an instantaneous CUPTI event.
*
* This activity record represents a CUPTI event value
* (CUPTI_ACTIVITY_KIND_EVENT) sampled at a particular instant.
* This activity record kind is not produced by the activity API but is
* included for completeness and ease-of-use. Profiler frameworks built on
* top of CUPTI that collect event data at a particular time may choose to
* use this type to store the collected event data.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT.
*/
CUpti_ActivityKind kind;
/**
* The event ID.
*/
CUpti_EventID id;
/**
* The event value.
*/
uint64_t value;
/**
* The timestamp at which event is sampled
*/
uint64_t timestamp;
/**
* The device id
*/
uint32_t deviceId;
/**
* Undefined. reserved for internal use
*/
uint32_t reserved;
} CUpti_ActivityInstantaneousEvent;
/**
* \brief The activity record for an instantaneous CUPTI event
* with event domain instance information.
*
* This activity record represents the a CUPTI event value for a
* specific event domain instance
* (CUPTI_ACTIVITY_KIND_EVENT_INSTANCE) sampled at a particular instant.
* This activity record kind is not produced by the activity API but is
* included for completeness and ease-of-use. Profiler frameworks built on
* top of CUPTI that collect event data may choose to use this type to store the
* collected event data. This activity record should be used when
* event domain instance information needs to be associated with the
* event.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE.
*/
CUpti_ActivityKind kind;
/**
* The event ID.
*/
CUpti_EventID id;
/**
* The event value.
*/
uint64_t value;
/**
* The timestamp at which event is sampled
*/
uint64_t timestamp;
/**
* The device id
*/
uint32_t deviceId;
/**
* The event domain instance
*/
uint8_t instance;
/**
* Undefined. reserved for internal use
*/
uint8_t pad[3];
} CUpti_ActivityInstantaneousEventInstance;
/**
* \brief The activity record for an instantaneous CUPTI metric.
*
* This activity record represents the collection of a CUPTI metric
* value (CUPTI_ACTIVITY_KIND_METRIC) at a particular instance.
* This activity record kind is not produced by the activity API but
* is included for completeness and ease-of-use. Profiler frameworks built
* on top of CUPTI that collect metric data may choose to use this type to
* store the collected metric data.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC.
*/
CUpti_ActivityKind kind;
/**
* The metric ID.
*/
CUpti_MetricID id;
/**
* The metric value.
*/
CUpti_MetricValue value;
/**
* The timestamp at which metric is sampled
*/
uint64_t timestamp;
/**
* The device id
*/
uint32_t deviceId;
/**
* The properties of this metric. \see CUpti_ActivityFlag
*/
uint8_t flags;
/**
* Undefined. reserved for internal use
*/
uint8_t pad[3];
} CUpti_ActivityInstantaneousMetric;
/**
* \brief The instantaneous activity record for a CUPTI metric with instance
* information.
* This activity record represents a CUPTI metric value
* for a specific metric domain instance
* (CUPTI_ACTIVITY_KIND_METRIC_INSTANCE) sampled at a particular time. This
* activity record kind is not produced by the activity API but is included for
* completeness and ease-of-use. Profiler frameworks built on top of
* CUPTI that collect metric data may choose to use this type to store
* the collected metric data. This activity record should be used when
* metric domain instance information needs to be associated with the
* metric.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE.
*/
CUpti_ActivityKind kind;
/**
* The metric ID.
*/
CUpti_MetricID id;
/**
* The metric value.
*/
CUpti_MetricValue value;
/**
* The timestamp at which metric is sampled
*/
uint64_t timestamp;
/**
* The device id
*/
uint32_t deviceId;
/**
* The properties of this metric. \see CUpti_ActivityFlag
*/
uint8_t flags;
/**
* The metric domain instance
*/
uint8_t instance;
/**
* Undefined. reserved for internal use
*/
uint8_t pad[2];
} CUpti_ActivityInstantaneousMetricInstance;
/**
* \brief The types of JIT entry.
*
* To be used in CUpti_ActivityJit.
*/
typedef enum {
CUPTI_ACTIVITY_JIT_ENTRY_INVALID= 0,
/**
* PTX to CUBIN.
*/
CUPTI_ACTIVITY_JIT_ENTRY_PTX_TO_CUBIN = 1,
/**
* NVVM-IR to PTX
*/
CUPTI_ACTIVITY_JIT_ENTRY_NVVM_IR_TO_PTX = 2,
CUPTI_ACTIVITY_JIT_ENTRY_TYPE_FORCE_INT = 0x7fffffff
} CUpti_ActivityJitEntryType;
/**
* \brief The types of JIT compilation operations.
*
* To be used in CUpti_ActivityJit.
*/
typedef enum {
CUPTI_ACTIVITY_JIT_OPERATION_INVALID = 0,
/**
* Loaded from the compute cache.
*/
CUPTI_ACTIVITY_JIT_OPERATION_CACHE_LOAD = 1,
/**
* Stored in the compute cache.
*/
CUPTI_ACTIVITY_JIT_OPERATION_CACHE_STORE = 2,
/**
* JIT compilation.
*/
CUPTI_ACTIVITY_JIT_OPERATION_COMPILE = 3,
CUPTI_ACTIVITY_JIT_OPERATION_TYPE_FORCE_INT = 0x7fffffff
} CUpti_ActivityJitOperationType;
/**
* \brief The activity record for JIT operations.
* This activity represents the JIT operations (compile, load, store) of a CUmodule
* from the Compute Cache.
* Gives the exact hashed path of where the cached module is loaded from,
* or where the module will be stored after Just-In-Time (JIT) compilation.
*/
typedef struct PACKED_ALIGNMENT {
/**
* The activity record kind must be CUPTI_ACTIVITY_KIND_JIT.
*/
CUpti_ActivityKind kind;
/**
* The JIT entry type.
*/
CUpti_ActivityJitEntryType jitEntryType;
/**
* The JIT operation type.
*/
CUpti_ActivityJitOperationType jitOperationType;
/**
* The device ID.
*/
uint32_t deviceId;
/**
* The start timestamp for the JIT operation, in ns. A value of 0 for
* both the start and end timestamps indicates that timestamp
* information could not be collected for the JIT operation.
*/
uint64_t start;
/**
* The end timestamp for the JIT operation, in ns. A value of 0 for both
* the start and end timestamps indicates that timestamp information
* could not be collected for the JIT operation.
*/
uint64_t end;
/**
* The correlation ID of the JIT operation to which
* records belong to. Each JIT operation is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver or runtime API activity record that
* launched the JIT operation.
*/
uint32_t correlationId;
/**
* Internal use.
*/
uint32_t padding;
/**
* The correlation ID to correlate JIT compilation, load and store operations.
* Each JIT compilation unit is assigned a unique correlation ID
* at the time of the JIT compilation. This correlation id can be used
* to find the matching JIT cache load/store records.
*/
uint64_t jitOperationCorrelationId;
/**
* The size of compute cache.
*/
uint64_t cacheSize;
/**
* The path where the fat binary is cached.
*/
const char* cachePath;
} CUpti_ActivityJit;
/**
* \brief The activity record for trace of graph execution.
*
* This activity record represents execution for a graph without giving visibility
* about the execution of its nodes. This is intended to reduce overheads in tracing
* each node. The activity kind is CUPTI_ACTIVITY_KIND_GRAPH_TRACE
*/
typedef struct {
/**
* The activity record kind, must be CUPTI_ACTIVITY_KIND_GRAPH_TRACE
*/
CUpti_ActivityKind kind;
/**
* The correlation ID of the graph launch. Each graph launch is
* assigned a unique correlation ID that is identical to the
* correlation ID in the driver API activity record that launched
* the graph.
*/
uint32_t correlationId;
/**
* The start timestamp for the graph execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the graph.
*/
uint64_t start;
/**
* The end timestamp for the graph execution, in ns. A value of 0
* for both the start and end timestamps indicates that timestamp
* information could not be collected for the graph.
*/
uint64_t end;
/**
* The ID of the device where the graph execution is occurring.
*/
uint32_t deviceId;
/**
* The unique ID of the graph that is launched.
*/
uint32_t graphId;
/**
* The ID of the context where the graph is being launched.
*/
uint32_t contextId;
/**
* The ID of the stream where the graph is being launched.
*/
uint32_t streamId;
/**
* This field is reserved for internal use
*/
void *reserved;
} CUpti_ActivityGraphTrace;
END_PACKED_ALIGNMENT
/**
* \brief Activity attributes.
*
* These attributes are used to control the behavior of the activity
* API.
*/
typedef enum {
/**
* The device memory size (in bytes) reserved for storing profiling data for concurrent
* kernels (activity kind \ref CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL), memcopies and memsets
* for each buffer on a context. The value is a size_t.
*
* There is a limit on how many device buffers can be allocated per context. User
* can query and set this limit using the attribute
* \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT.
* CUPTI doesn't pre-allocate all the buffers, it pre-allocates only those many
* buffers as set by the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_PRE_ALLOCATE_VALUE.
* When all of the data in a buffer is consumed, it is added in the reuse pool, and
* CUPTI picks a buffer from this pool when a new buffer is needed. Thus memory
* footprint does not scale with the kernel count. Applications with the high density
* of kernels, memcopies and memsets might result in having CUPTI to allocate more device buffers.
* CUPTI allocates another buffer only when it runs out of the buffers in the
* reuse pool.
*
* Since buffer allocation happens in the main application thread, this might result
* in stalls in the critical path. CUPTI pre-allocates 3 buffers of the same size to
* mitigate this issue. User can query and set the pre-allocation limit using the
* attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_PRE_ALLOCATE_VALUE.
*
* Having larger buffer size leaves less device memory for the application.
* Having smaller buffer size increases the risk of dropping timestamps for
* records if too many kernels or memcopies or memsets are launched at one time.
*
* This value only applies to new buffer allocations. Set this value before initializing
* CUDA or before creating a context to ensure it is considered for the following allocations.
*
* The default value is 3200000 (~3MB) which can accommodate profiling data
* up to 100,000 kernels, memcopies and memsets combined.
*
* Note: Starting with the CUDA 12.0 Update 1 release, CUPTI allocates profiling buffer in the
* device memory by default as this might help in improving the performance of the
* tracing run. Refer to the description of the attribute
* \ref CUPTI_ACTIVITY_ATTR_MEM_ALLOCATION_TYPE_HOST_PINNED for more details.
* Size of the memory and maximum number of pools are still controlled by the attributes
* \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE and \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT.
*
* Note: The actual amount of device memory per buffer reserved by CUPTI might be larger.
*/
CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE = 0,
/**
* The device memory size (in bytes) reserved for storing profiling
* data for CDP operations for each buffer on a context. The
* value is a size_t.
*
* Having larger buffer size means less flush operations but
* consumes more device memory. This value only applies to new
* allocations.
*
* Set this value before initializing CUDA or before creating a
* context to ensure it is considered for the following allocations.
*
* The default value is 8388608 (8MB).
*
* Note: The actual amount of device memory per context reserved by
* CUPTI might be larger.
*/
CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP = 1,
/**
* The maximum number of device memory buffers per context. The value is a size_t.
*
* For an application with high rate of kernel launches, memcopies and memsets having a bigger pool
* limit helps in timestamp collection for all these activties at the expense of a larger memory footprint.
* Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE
* for more details.
*
* Setting this value will not modify the number of memory buffers
* currently stored.
*
* Set this value before initializing CUDA to ensure the limit is
* not exceeded.
*
* The default value is 250.
*/
CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT = 2,
/**
* The profiling semaphore pool size reserved for storing profiling data for
* serialized kernels tracing (activity kind \ref CUPTI_ACTIVITY_KIND_KERNEL)
* for each context. The value is a size_t.
*
* There is a limit on how many semaphore pools can be allocated per context. User
* can query and set this limit using the attribute
* \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT.
* CUPTI doesn't pre-allocate all the semaphore pools, it pre-allocates only those many
* semaphore pools as set by the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_PRE_ALLOCATE_VALUE.
* When all of the data in a semaphore pool is consumed, it is added in the reuse pool, and
* CUPTI picks a semaphore pool from the reuse pool when a new semaphore pool is needed. Thus memory
* footprint does not scale with the kernel count. Applications with the high density
* of kernels might result in having CUPTI to allocate more semaphore pools.
* CUPTI allocates another semaphore pool only when it runs out of the semaphore pools in the
* reuse pool.
*
* Since semaphore pool allocation happens in the main application thread, this might result
* in stalls in the critical path. CUPTI pre-allocates 3 semaphore pools of the same size to
* mitigate this issue. User can query and set the pre-allocation limit using the
* attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_PRE_ALLOCATE_VALUE.
*
* Having larger semaphore pool size leaves less device memory for the application.
* Having smaller semaphore pool size increases the risk of dropping timestamps for
* kernel records if too many kernels are issued/launched at one time.
*
* This value only applies to new semaphore pool allocations. Set this value before initializing
* CUDA or before creating a context to ensure it is considered for the following allocations.
*
* The default value is 25000 which can accommodate profiling data for upto 25,000 kernels.
*
*/
CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE = 3,
/**
* The maximum number of profiling semaphore pools per context. The value is a size_t.
*
* For an application with high rate of kernel launches, having a bigger
* pool limit helps in timestamp collection for all the kernels, at the
* expense of a larger device memory footprint.
* Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE
* for more details.
*
* Set this value before initializing CUDA to ensure the limit is not exceeded.
*
* The default value is 250.
*/
CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT = 4,
/**
* The flag to indicate whether user should provide activity buffer of zero value.
* The value is a uint8_t.
*
* If the value of this attribute is non-zero, user should provide
* a zero value buffer in the \ref CUpti_BuffersCallbackRequestFunc.
* If the user does not provide a zero value buffer after setting this to non-zero,
* the activity buffer may contain some uninitialized values when CUPTI returns it in
* \ref CUpti_BuffersCallbackCompleteFunc
*
* If the value of this attribute is zero, CUPTI will initialize the user buffer
* received in the \ref CUpti_BuffersCallbackRequestFunc to zero before filling it.
* If the user sets this to zero, a few stalls may appear in critical path because CUPTI
* will zero out the buffer in the main thread.
* Set this value before returning from \ref CUpti_BuffersCallbackRequestFunc to
* ensure it is considered for all the subsequent user buffers.
*
* The default value is 0.
*/
CUPTI_ACTIVITY_ATTR_ZEROED_OUT_ACTIVITY_BUFFER = 5,
/**
* Number of device buffers to pre-allocate for a context during the initialization phase.
* The value is a size_t.
*
* Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE
* for details.
*
* This value must be less than the maximum number of device buffers set using
* the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT
*
* Set this value before initializing CUDA or before creating a context to ensure it
* is considered by the CUPTI.
*
* The default value is set to 3 to ping pong between these buffers (if possible).
*/
CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_PRE_ALLOCATE_VALUE = 6,
/**
* Number of profiling semaphore pools to pre-allocate for a context during the
* initialization phase. The value is a size_t.
*
* Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE
* for details.
*
* This value must be less than the maximum number of profiling semaphore pools set
* using the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT
*
* Set this value before initializing CUDA or before creating a context to ensure it
* is considered by the CUPTI.
*
* The default value is set to 3 to ping pong between these pools (if possible).
*/
CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_PRE_ALLOCATE_VALUE = 7,
/**
* Allocate page-locked (pinned) host memory for storing profiling data for concurrent
* kernels, memcopies and memsets for each buffer on a context. The value is a uint8_t.
*
* Starting with the CUDA 11.2 release, CUPTI allocates profiling buffer in the pinned host
* memory by default as this might help in improving the performance of the tracing run.
* Allocating excessive amounts of pinned memory may degrade system performance, since it
* reduces the amount of memory available to the system for paging. For this reason user
* might want to change the location from pinned host memory to device memory by setting
* value of this attribute to 0.
*
* The default value is 1.
*/
CUPTI_ACTIVITY_ATTR_MEM_ALLOCATION_TYPE_HOST_PINNED = 8,
CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_FORCE_INT = 0x7fffffff
} CUpti_ActivityAttribute;
/**
* \brief Thread-Id types.
*
* CUPTI uses different methods to obtain the thread-id depending on the
* support and the underlying platform. This enum documents these methods
* for each type. APIs \ref cuptiSetThreadIdType and \ref cuptiGetThreadIdType
* can be used to set and get the thread-id type.
*/
typedef enum {
/**
* Default type
* Windows uses API GetCurrentThreadId()
* Linux/Mac/Android/QNX use POSIX pthread API pthread_self()
*/
CUPTI_ACTIVITY_THREAD_ID_TYPE_DEFAULT = 0,
/**
* This type is based on the system API available on the underlying platform
* and thread-id obtained is supposed to be unique for the process lifetime.
* Windows uses API GetCurrentThreadId()
* Linux uses syscall SYS_gettid
* Mac uses syscall SYS_thread_selfid
* Android/QNX use gettid()
*/
CUPTI_ACTIVITY_THREAD_ID_TYPE_SYSTEM = 1,
CUPTI_ACTIVITY_THREAD_ID_TYPE_FORCE_INT = 0x7fffffff
} CUpti_ActivityThreadIdType;
/**
* \brief Get the CUPTI timestamp.
*
* Returns a timestamp normalized to correspond with the start and end
* timestamps reported in the CUPTI activity records. The timestamp is
* reported in nanoseconds.
*
* \param timestamp Returns the CUPTI timestamp
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p timestamp is NULL
*/
CUptiResult CUPTIAPI cuptiGetTimestamp(uint64_t *timestamp);
/**
* \brief Get the ID of a context.
*
* Get the ID of a context.
*
* \param context The context
* \param contextId Returns a process-unique ID for the context
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
* \retval CUPTI_ERROR_INVALID_CONTEXT The context is NULL or not valid.
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p contextId is NULL
*/
CUptiResult CUPTIAPI cuptiGetContextId(CUcontext context, uint32_t *contextId);
/**
* \brief Get the ID of a stream.
*
* Get the ID of a stream. The stream ID is unique within a context
* (i.e. all streams within a context will have unique stream
* IDs).
*
* \param context If non-NULL then the stream is checked to ensure
* that it belongs to this context. Typically this parameter should be
* null.
* \param stream The stream
* \param streamId Returns a context-unique ID for the stream
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
* \retval CUPTI_ERROR_INVALID_STREAM if unable to get stream ID, or
* if \p context is non-NULL and \p stream does not belong to the
* context
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p streamId is NULL
*
* **DEPRECATED** This method is deprecated as of CUDA 8.0.
* Use method cuptiGetStreamIdEx instead.
*/
CUptiResult CUPTIAPI cuptiGetStreamId(CUcontext context, CUstream stream, uint32_t *streamId);
/**
* \brief Get the ID of a stream.
*
* Get the ID of a stream. The stream ID is unique within a context
* (i.e. all streams within a context will have unique stream
* IDs).
*
* \param context If non-NULL then the stream is checked to ensure
* that it belongs to this context. Typically this parameter should be
* null.
* \param stream The stream
* \param perThreadStream Flag to indicate if program is compiled for per-thread streams
* \param streamId Returns a context-unique ID for the stream
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
* \retval CUPTI_ERROR_INVALID_STREAM if unable to get stream ID, or
* if \p context is non-NULL and \p stream does not belong to the
* context
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p streamId is NULL
*/
CUptiResult CUPTIAPI cuptiGetStreamIdEx(CUcontext context, CUstream stream, uint8_t perThreadStream, uint32_t *streamId);
/**
* \brief Get the ID of a device
*
* If \p context is NULL, returns the ID of the device that contains
* the currently active context. If \p context is non-NULL, returns
* the ID of the device which contains that context. Operates in a
* similar manner to cudaGetDevice() or cuCtxGetDevice() but may be
* called from within callback functions.
*
* \param context The context, or NULL to indicate the current context.
* \param deviceId Returns the ID of the device that is current for
* the calling thread.
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
* \retval CUPTI_ERROR_INVALID_DEVICE if unable to get device ID
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p deviceId is NULL
*/
CUptiResult CUPTIAPI cuptiGetDeviceId(CUcontext context, uint32_t *deviceId);
/**
* \brief Get the unique ID of a graph node
*
* Returns the unique ID of the CUDA graph node.
*
* \param node The graph node.
* \param nodeId Returns the unique ID of the node
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p node is NULL
*/
CUptiResult CUPTIAPI cuptiGetGraphNodeId(CUgraphNode node, uint64_t *nodeId);
/**
* \brief Get the unique ID of graph
*
* Returns the unique ID of CUDA graph.
*
* \param graph The graph.
* \param pId Returns the unique ID of the graph
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p graph is NULL
*/
CUptiResult CUPTIAPI cuptiGetGraphId(CUgraph graph, uint32_t *pId);
/**
* \brief Enable collection of a specific kind of activity record.
*
* Enable collection of a specific kind of activity record. Multiple
* kinds can be enabled by calling this function multiple times. By
* default all activity kinds are disabled for collection.
*
* \param kind The kind of activity record to collect
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
* \retval CUPTI_ERROR_NOT_COMPATIBLE if the activity kind cannot be enabled
* \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
*/
CUptiResult CUPTIAPI cuptiActivityEnable(CUpti_ActivityKind kind);
/**
* \brief Enable collection of a specific kind of activity record. For certain activity kinds
* it dumps existing records.
*
* In general, the behavior of this API is similar to the API \ref cuptiActivityEnable i.e. it
* enables the collection of a specific kind of activity record.
* Additionally, this API can help in dumping the records for activities which happened in
* the past before enabling the corresponding activity kind.
* The API allows to get records for the current resource allocations done in CUDA
* For CUPTI_ACTIVITY_KIND_DEVICE, existing device records are dumped
* For CUPTI_ACTIVITY_KIND_CONTEXT, existing context records are dumped
* For CUPTI_ACTIVITY_KIND_STREAM, existing stream records are dumped
* For CUPTI_ACTIVITY_KIND_ NVLINK, existing NVLINK records are dumped
* For CUPTI_ACTIVITY_KIND_PCIE, existing PCIE records are dumped
* For other activities, the behavior is similar to the API \ref cuptiActivityEnable
*
* Device records are emitted in CUPTI on CUDA driver initialization. Those records
* can only be retrieved by the user if CUPTI is attached before CUDA initialization.
* Context and stream records are emitted on context and stream creation.
* The use case of the API is to provide the records for CUDA resources
* (contexs/streams/devices) that are currently active if user late attaches CUPTI.
*
* Before calling this function, the user must register buffer callbacks
* to get the activity records by calling \ref cuptiActivityRegisterCallbacks.
* If the user does not register the buffers and calls API \ref cuptiActivityEnableAndDump,
* then CUPTI will enable the activity kind but not provide any records for that
* activity kind.
*
* \param kind The kind of activity record to collect
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
* \retval CUPTI_ERROR_UNKNOWN if buffer is not initialized.
* \retval CUPTI_ERROR_NOT_COMPATIBLE if the activity kind cannot be enabled
* \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
*/
CUptiResult CUPTIAPI cuptiActivityEnableAndDump(CUpti_ActivityKind kind);
/**
* \brief Disable collection of a specific kind of activity record.
*
* Disable collection of a specific kind of activity record. Multiple
* kinds can be disabled by calling this function multiple times. By
* default all activity kinds are disabled for collection.
*
* \param kind The kind of activity record to stop collecting
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
* \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
*/
CUptiResult CUPTIAPI cuptiActivityDisable(CUpti_ActivityKind kind);
/**
* \brief Enable collection of a specific kind of activity record for
* a context.
*
* Enable collection of a specific kind of activity record for a
* context. This setting done by this API will supersede the global
* settings for activity records enabled by \ref cuptiActivityEnable.
* Multiple kinds can be enabled by calling this function multiple
* times.
*
* \param context The context for which activity is to be enabled
* \param kind The kind of activity record to collect
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
* \retval CUPTI_ERROR_NOT_COMPATIBLE if the activity kind cannot be enabled
* \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
*/
CUptiResult CUPTIAPI cuptiActivityEnableContext(CUcontext context, CUpti_ActivityKind kind);
/**
* \brief Disable collection of a specific kind of activity record for
* a context.
*
* Disable collection of a specific kind of activity record for a context.
* This setting done by this API will supersede the global settings
* for activity records.
* Multiple kinds can be enabled by calling this function multiple times.
*
* \param context The context for which activity is to be disabled
* \param kind The kind of activity record to stop collecting
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
* \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
*/
CUptiResult CUPTIAPI cuptiActivityDisableContext(CUcontext context, CUpti_ActivityKind kind);
/**
* \brief Get the number of activity records that were dropped of
* insufficient buffer space.
*
* Get the number of records that were dropped because of insufficient
* buffer space. The dropped count includes records that could not be
* recorded because CUPTI did not have activity buffer space available
* for the record (because the CUpti_BuffersCallbackRequestFunc
* callback did not return an empty buffer of sufficient size) and
* also CDP records that could not be record because the device-size
* buffer was full (size is controlled by the
* CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP attribute). The dropped
* count maintained for the queue is reset to zero when this function
* is called.
*
* \param context The context, or NULL to get dropped count from global queue
* \param streamId The stream ID
* \param dropped The number of records that were dropped since the last call
* to this function.
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p dropped is NULL
*/
CUptiResult CUPTIAPI cuptiActivityGetNumDroppedRecords(CUcontext context, uint32_t streamId,
size_t *dropped);
/**
* \brief Iterate over the activity records in a buffer.
*
* This is a helper function to iterate over the activity records in a
* buffer. A buffer of activity records is typically obtained by
* receiving a CUpti_BuffersCallbackCompleteFunc callback.
*
* An example of typical usage:
* \code
* CUpti_Activity *record = NULL;
* CUptiResult status = CUPTI_SUCCESS;
* do {
* status = cuptiActivityGetNextRecord(buffer, validSize, &record);
* if(status == CUPTI_SUCCESS) {
* // Use record here...
* }
* else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED)
* break;
* else {
* goto Error;
* }
* } while (1);
* \endcode
*
* \param buffer The buffer containing activity records
* \param record Inputs the previous record returned by
* cuptiActivityGetNextRecord and returns the next activity record
* from the buffer. If input value is NULL, returns the first activity
* record in the buffer. Records of kind CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL
* may contain invalid (0) timestamps, indicating that no timing information could
* be collected for lack of device memory.
* \param validBufferSizeBytes The number of valid bytes in the buffer.
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
* \retval CUPTI_ERROR_MAX_LIMIT_REACHED if no more records in the buffer
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p buffer is NULL.
*/
CUptiResult CUPTIAPI cuptiActivityGetNextRecord(uint8_t* buffer, size_t validBufferSizeBytes,
CUpti_Activity **record);
/**
* \brief Function type for callback used by CUPTI to request an empty
* buffer for storing activity records.
*
* This callback function signals the CUPTI client that an activity
* buffer is needed by CUPTI. The activity buffer is used by CUPTI to
* store activity records. The callback function can decline the
* request by setting \p *buffer to NULL. In this case CUPTI may drop
* activity records.
*
* \param buffer Returns the new buffer. If set to NULL then no buffer
* is returned.
* \param size Returns the size of the returned buffer.
* \param maxNumRecords Returns the maximum number of records that
* should be placed in the buffer. If 0 then the buffer is filled with
* as many records as possible. If > 0 the buffer is filled with at
* most that many records before it is returned.
*/
typedef void (CUPTIAPI *CUpti_BuffersCallbackRequestFunc)(
uint8_t **buffer,
size_t *size,
size_t *maxNumRecords);
/**
* \brief Function type for callback used by CUPTI to return a buffer
* of activity records.
*
* This callback function returns to the CUPTI client a buffer
* containing activity records. The buffer contains \p validSize
* bytes of activity records which should be read using
* cuptiActivityGetNextRecord. The number of dropped records can be
* read using cuptiActivityGetNumDroppedRecords. After this call CUPTI
* relinquished ownership of the buffer and will not use it
* anymore. The client may return the buffer to CUPTI using the
* CUpti_BuffersCallbackRequestFunc callback.
* Note: CUDA 6.0 onwards, all buffers returned by this callback are
* global buffers i.e. there is no context/stream specific buffer.
* User needs to parse the global buffer to extract the context/stream
* specific activity records.
*
* \param context The context this buffer is associated with. If NULL, the
* buffer is associated with the global activities. This field is deprecated
* as of CUDA 6.0 and will always be NULL.
* \param streamId The stream id this buffer is associated with.
* This field is deprecated as of CUDA 6.0 and will always be NULL.
* \param buffer The activity record buffer.
* \param size The total size of the buffer in bytes as set in
* CUpti_BuffersCallbackRequestFunc.
* \param validSize The number of valid bytes in the buffer.
*/
typedef void (CUPTIAPI *CUpti_BuffersCallbackCompleteFunc)(
CUcontext context,
uint32_t streamId,
uint8_t *buffer,
size_t size,
size_t validSize);
/**
* \brief Registers callback functions with CUPTI for activity buffer
* handling.
*
* This function registers two callback functions to be used in asynchronous
* buffer handling. If registered, activity record buffers are handled using
* asynchronous requested/completed callbacks from CUPTI.
*
* Registering these callbacks prevents the client from using CUPTI's
* blocking enqueue/dequeue functions.
*
* \param funcBufferRequested callback which is invoked when an empty
* buffer is requested by CUPTI
* \param funcBufferCompleted callback which is invoked when a buffer
* containing activity records is available from CUPTI
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_INVALID_PARAMETER if either \p
* funcBufferRequested or \p funcBufferCompleted is NULL
*/
CUptiResult CUPTIAPI cuptiActivityRegisterCallbacks(CUpti_BuffersCallbackRequestFunc funcBufferRequested,
CUpti_BuffersCallbackCompleteFunc funcBufferCompleted);
/**
* \brief Wait for all activity records to be delivered via the
* completion callback.
*
* This function does not return until all activity records associated
* with the specified context/stream are returned to the CUPTI client
* using the callback registered in cuptiActivityRegisterCallbacks. To
* ensure that all activity records are complete, the requested
* stream(s), if any, are synchronized.
*
* If \p context is NULL, the global activity records (i.e. those not
* associated with a particular stream) are flushed (in this case no
* streams are synchonized). If \p context is a valid CUcontext and
* \p streamId is 0, the buffers of all streams of this context are
* flushed. Otherwise, the buffers of the specified stream in this
* context is flushed.
*
* Before calling this function, the buffer handling callback api
* must be activated by calling cuptiActivityRegisterCallbacks.
*
* \param context A valid CUcontext or NULL.
* \param streamId The stream ID.
* \param flag The flag can be set to indicate a forced flush. See CUpti_ActivityFlag
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
* \retval CUPTI_ERROR_CUPTI_ERROR_INVALID_OPERATION if not preceeded
* by a successful call to cuptiActivityRegisterCallbacks
* \retval CUPTI_ERROR_UNKNOWN an internal error occurred
*
* **DEPRECATED** This method is deprecated
* CONTEXT and STREAMID will be ignored. Use cuptiActivityFlushAll
* to flush all data.
*/
CUptiResult CUPTIAPI cuptiActivityFlush(CUcontext context, uint32_t streamId, uint32_t flag);
/**
* \brief Request to deliver activity records via the buffer completion callback.
*
* This function returns the activity records associated with all contexts/streams
* (and the global buffers not associated with any stream) to the CUPTI client
* using the callback registered in cuptiActivityRegisterCallbacks.
*
* This is a blocking call but it doesn't issue any CUDA synchronization calls
* implicitly thus it's not guaranteed that all activities are completed on the
* underlying devices. Activity record is considered as completed if it has all
* the information filled up including the timestamps if any. It is the client's
* responsibility to issue necessary CUDA synchronization calls before calling
* this function if all activity records with complete information are expected
* to be delivered.
*
* Behavior of the function based on the input flag:
* - ::For default flush i.e. when flag is set as 0, it returns all the
* activity buffers which have all the activity records completed, buffers need not
* to be full though. It doesn't return buffers which have one or more incomplete
* records. Default flush can be done at a regular interval in a separate thread.
* - ::For forced flush i.e. when flag CUPTI_ACTIVITY_FLAG_FLUSH_FORCED is passed
* to the function, it returns all the activity buffers including the ones which have
* one or more incomplete activity records. It's suggested for clients to do the
* force flush before the termination of the profiling session to allow remaining
* buffers to be delivered. In general, it can be done in the at-exit handler.
*
* Before calling this function, the buffer handling callback api must be activated
* by calling cuptiActivityRegisterCallbacks.
*
* \param flag The flag can be set to indicate a forced flush. See CUpti_ActivityFlag
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
* \retval CUPTI_ERROR_INVALID_OPERATION if not preceeded by a
* successful call to cuptiActivityRegisterCallbacks
* \retval CUPTI_ERROR_UNKNOWN an internal error occurred
*
* \see cuptiActivityFlushPeriod
*/
CUptiResult CUPTIAPI cuptiActivityFlushAll(uint32_t flag);
/**
* \brief Read an activity API attribute.
*
* Read an activity API attribute and return it in \p *value.
*
* \param attr The attribute to read
* \param valueSize Size of buffer pointed by the value, and
* returns the number of bytes written to \p value
* \param value Returns the value of the attribute
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value is NULL, or
* if \p attr is not an activity attribute
* \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT Indicates that
* the \p value buffer is too small to hold the attribute value.
*/
CUptiResult CUPTIAPI cuptiActivityGetAttribute(CUpti_ActivityAttribute attr,
size_t *valueSize, void* value);
/**
* \brief Write an activity API attribute.
*
* Write an activity API attribute.
*
* \param attr The attribute to write
* \param valueSize The size, in bytes, of the value
* \param value The attribute value to write
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value is NULL, or
* if \p attr is not an activity attribute
* \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT Indicates that
* the \p value buffer is too small to hold the attribute value.
*/
CUptiResult CUPTIAPI cuptiActivitySetAttribute(CUpti_ActivityAttribute attr,
size_t *valueSize, void* value);
/**
* \brief Set Unified Memory Counter configuration.
*
* \param config A pointer to \ref CUpti_ActivityUnifiedMemoryCounterConfig structures
* containing Unified Memory counter configuration.
* \param count Number of Unified Memory counter configuration structures
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p config is NULL or
* any parameter in the \p config structures is not a valid value
* \retval CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED One potential reason is that
* platform (OS/arch) does not support the unified memory counters
* \retval CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE Indicates that the device
* does not support the unified memory counters
* \retval CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES Indicates that
* multi-GPU configuration without P2P support between any pair of devices
* does not support the unified memory counters
*/
CUptiResult CUPTIAPI cuptiActivityConfigureUnifiedMemoryCounter(CUpti_ActivityUnifiedMemoryCounterConfig *config, uint32_t count);
/**
* \brief Get auto boost state
*
* The profiling results can be inconsistent in case auto boost is enabled.
* CUPTI tries to disable auto boost while profiling. It can fail to disable in
* cases where user does not have the permissions or CUDA_AUTO_BOOST env
* variable is set. The function can be used to query whether auto boost is
* enabled.
*
* \param context A valid CUcontext.
* \param state A pointer to \ref CUpti_ActivityAutoBoostState structure which
* contains the current state and the id of the process that has requested the
* current state
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p CUcontext or \p state is NULL
* \retval CUPTI_ERROR_NOT_SUPPORTED Indicates that the device does not support auto boost
* \retval CUPTI_ERROR_UNKNOWN an internal error occurred
*/
CUptiResult CUPTIAPI cuptiGetAutoBoostState(CUcontext context, CUpti_ActivityAutoBoostState *state);
/**
* \brief Set PC sampling configuration.
*
* For Pascal and older GPU architectures this API must be called before enabling
* activity kind CUPTI_ACTIVITY_KIND_PC_SAMPLING. There is no such requirement
* for Volta and newer GPU architectures.
*
* For Volta and newer GPU architectures if this API is called in the middle of
* execution, PC sampling configuration will be updated for subsequent kernel launches.
*
* \param ctx The context
* \param config A pointer to \ref CUpti_ActivityPCSamplingConfig structure
* containing PC sampling configuration.
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_INVALID_OPERATION if this api is called while
* some valid event collection method is set.
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p config is NULL or
* any parameter in the \p config structures is not a valid value
* \retval CUPTI_ERROR_NOT_SUPPORTED Indicates that the system/device
* does not support the unified memory counters
*/
CUptiResult CUPTIAPI cuptiActivityConfigurePCSampling(CUcontext ctx, CUpti_ActivityPCSamplingConfig *config);
/**
* \brief Returns the last error from a cupti call or callback
*
* Returns the last error that has been produced by any of the cupti api calls
* or the callback in the same host thread and resets it to CUPTI_SUCCESS.
*/
CUptiResult CUPTIAPI cuptiGetLastError(void);
/**
* \brief Set the thread-id type
*
* CUPTI uses the method corresponding to set type to generate the thread-id.
* See enum \ref CUpti_ActivityThreadIdType for the list of methods.
* Activity records having thread-id field contain the same value.
* Thread id type must not be changed during the profiling session to
* avoid thread-id value mismatch across activity records.
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_SUPPORTED if \p type is not supported on the platform
*/
CUptiResult CUPTIAPI cuptiSetThreadIdType(CUpti_ActivityThreadIdType type);
/**
* \brief Get the thread-id type
*
* Returns the thread-id type used in CUPTI
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p type is NULL
*/
CUptiResult CUPTIAPI cuptiGetThreadIdType(CUpti_ActivityThreadIdType *type);
/**
* \brief Check support for a compute capability
*
* This function is used to check the support for a device based on
* it's compute capability. It sets the \p support when the compute
* capability is supported by the current version of CUPTI, and clears
* it otherwise. This version of CUPTI might not support all GPUs sharing
* the same compute capability. It is suggested to use API \ref
* cuptiDeviceSupported which provides correct information.
*
* \param major The major revision number of the compute capability
* \param minor The minor revision number of the compute capability
* \param support Pointer to an integer to return the support status
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p support is NULL
*
* \sa ::cuptiDeviceSupported
*/
CUptiResult CUPTIAPI cuptiComputeCapabilitySupported(int major, int minor, int *support);
/**
* \brief Check support for a compute device
*
* This function is used to check the support for a compute device.
* It sets the \p support when the device is supported by the current
* version of CUPTI, and clears it otherwise.
*
* \param dev The device handle returned by CUDA Driver API cuDeviceGet
* \param support Pointer to an integer to return the support status
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p support is NULL
* \retval CUPTI_ERROR_INVALID_DEVICE if \p dev is not a valid device
*
* \sa ::cuptiComputeCapabilitySupported
*/
CUptiResult CUPTIAPI cuptiDeviceSupported(CUdevice dev, int *support);
/**
* This indicates the virtualization mode in which CUDA device is running
*/
typedef enum {
/**
* No virtualization mode isassociated with the device
* i.e. it's a baremetal GPU
*/
CUPTI_DEVICE_VIRTUALIZATION_MODE_NONE = 0,
/**
* The device is associated with the pass-through GPU.
* In this mode, an entire physical GPU is directly assigned
* to one virtual machine (VM).
*/
CUPTI_DEVICE_VIRTUALIZATION_MODE_PASS_THROUGH = 1,
/**
* The device is associated with the virtual GPU (vGPU).
* In this mode multiple virtual machines (VMs) have simultaneous,
* direct access to a single physical GPU.
*/
CUPTI_DEVICE_VIRTUALIZATION_MODE_VIRTUAL_GPU = 2,
CUPTI_DEVICE_VIRTUALIZATION_MODE_FORCE_INT = 0x7fffffff
} CUpti_DeviceVirtualizationMode;
/**
* \brief Query the virtualization mode of the device
*
* This function is used to query the virtualization mode of the CUDA device.
*
* \param dev The device handle returned by CUDA Driver API cuDeviceGet
* \param mode Pointer to an CUpti_DeviceVirtualizationMode to return the virtualization mode
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_INVALID_DEVICE if \p dev is not a valid device
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p mode is NULL
*
*/
CUptiResult CUPTIAPI cuptiDeviceVirtualizationMode(CUdevice dev, CUpti_DeviceVirtualizationMode *mode);
/**
* \brief Detach CUPTI from the running process
*
* This API detaches the CUPTI from the running process. It destroys and cleans up all the
* resources associated with CUPTI in the current process. After CUPTI detaches from the process,
* the process will keep on running with no CUPTI attached to it.
* For safe operation of the API, it is recommended this API is invoked from the exit callsite
* of any of the CUDA Driver or Runtime API. Otherwise CUPTI client needs to make sure that
* required CUDA synchronization and CUPTI activity buffer flush is done before calling the API.
* Sample code showing the usage of the API in the cupti callback handler code:
* \code
void CUPTIAPI
cuptiCallbackHandler(void *userdata, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid, void *cbdata)
{
const CUpti_CallbackData *cbInfo = (CUpti_CallbackData *)cbdata;
// Take this code path when CUPTI detach is requested
if (detachCupti) {
switch(domain)
{
case CUPTI_CB_DOMAIN_RUNTIME_API:
case CUPTI_CB_DOMAIN_DRIVER_API:
if (cbInfo->callbackSite == CUPTI_API_EXIT) {
// call the CUPTI detach API
cuptiFinalize();
}
break;
default:
break;
}
}
}
\endcode
*/
CUptiResult CUPTIAPI cuptiFinalize(void);
/**
* \brief Push an external correlation id for the calling thread
*
* This function notifies CUPTI that the calling thread is entering an external API region.
* When a CUPTI activity API record is created while within an external API region and
* CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION is enabled, the activity API record will
* be preceeded by a CUpti_ActivityExternalCorrelation record for each \ref CUpti_ExternalCorrelationKind.
*
* \param kind The kind of external API activities should be correlated with.
* \param id External correlation id.
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_INVALID_PARAMETER The external API kind is invalid
*/
CUptiResult CUPTIAPI cuptiActivityPushExternalCorrelationId(CUpti_ExternalCorrelationKind kind, uint64_t id);
/**
* \brief Pop an external correlation id for the calling thread
*
* This function notifies CUPTI that the calling thread is leaving an external API region.
*
* \param kind The kind of external API activities should be correlated with.
* \param lastId If the function returns successful, contains the last external correlation id for this \p kind, can be NULL.
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_INVALID_PARAMETER The external API kind is invalid.
* \retval CUPTI_ERROR_QUEUE_EMPTY No external id is currently associated with \p kind.
*/
CUptiResult CUPTIAPI cuptiActivityPopExternalCorrelationId(CUpti_ExternalCorrelationKind kind, uint64_t *lastId);
/**
* \brief Controls the collection of queued and submitted timestamps for kernels.
*
* This API is used to control the collection of queued and submitted timestamps
* for kernels whose records are provided through the struct \ref CUpti_ActivityKernel9.
* Default value is 0, i.e. these timestamps are not collected. This API needs
* to be called before initialization of CUDA and this setting should not be
* changed during the profiling session.
*
* \param enable is a boolean, denoting whether these timestamps should be
* collected
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
*/
CUptiResult CUPTIAPI cuptiActivityEnableLatencyTimestamps(uint8_t enable);
/**
* \brief Sets the flush period for the worker thread
*
* CUPTI creates a worker thread to minimize the perturbance for the application created
* threads. CUPTI offloads certain operations from the application threads to the worker
* thread, this includes synchronization of profiling resources between host and device,
* delivery of the activity buffers to the client using the callback registered in
* cuptiActivityRegisterCallbacks. For performance reasons, CUPTI wakes up the worker
* thread based on certain heuristics.
*
* This API is used to control the flush period of the worker thread. This setting will
* override the CUPTI heurtistics. Setting time to zero disables the periodic flush and
* restores the default behavior.
*
* Periodic flush can return only those activity buffers which are full and have all the
* activity records completed.
*
* It's allowed to use the API \ref cuptiActivityFlushAll to flush the data on-demand, even
* when client sets the periodic flush.
*
* \param time flush period in msec
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_NOT_INITIALIZED
*
* \see cuptiActivityFlushAll
*/
CUptiResult CUPTIAPI cuptiActivityFlushPeriod(uint32_t time);
/**
* \brief Controls the collection of launch attributes for kernels.
*
* This API is used to control the collection of launch attributes for kernels whose
* records are provided through the struct \ref CUpti_ActivityKernel9.
* Default value is 0, i.e. these attributes are not collected.
*
* \param enable is a boolean denoting whether these launch attributes should be collected
*/
CUptiResult CUPTIAPI cuptiActivityEnableLaunchAttributes(uint8_t enable);
/**
* \brief Function type for callback used by CUPTI to request a timestamp
* to be used in activity records.
*
* This callback function signals the CUPTI client that a timestamp needs
* to be returned. This timestamp would be treated as normalized timestamp
* to be used for various purposes in CUPTI. For example to store start and
* end timestamps reported in the CUPTI activity records.
* The returned timestamp must be in nanoseconds.
*
* \sa ::cuptiActivityRegisterTimestampCallback
*/
typedef uint64_t (CUPTIAPI *CUpti_TimestampCallbackFunc)(void);
/**
* \brief Registers callback function with CUPTI for providing timestamp.
*
* This function registers a callback function to obtain timestamp of user's
* choice instead of using CUPTI provided timestamp.
* By default CUPTI uses different methods, based on the underlying platform,
* to retrieve the timestamp
* Linux and Android use clock_gettime(CLOCK_REALTIME, ..)
* Windows uses QueryPerformanceCounter()
* Mac uses mach_absolute_time()
* QNX uses ClockCycles()
* Timestamps retrieved using these methods are converted to nanosecond if needed
* before usage.
*
* The registration of timestamp callback should be done before any of the CUPTI
* activity kinds are enabled to make sure that all the records report the timestamp using
* the callback function registered through cuptiActivityRegisterTimestampCallback API.
*
* Changing the timestamp callback function in CUPTI through
* cuptiActivityRegisterTimestampCallback API in the middle of the profiling
* session can cause records generated prior to the change to report
* timestamps through previous timestamp method.
*
* \param funcTimestamp callback which is invoked when a timestamp is
* needed by CUPTI
*
* \retval CUPTI_SUCCESS
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p funcTimestamp is NULL
* \retval CUPTI_ERROR_NOT_INITIALIZED
*/
CUptiResult CUPTIAPI cuptiActivityRegisterTimestampCallback(CUpti_TimestampCallbackFunc funcTimestamp);
/** @} */ /* END CUPTI_ACTIVITY_API */
#if defined(__GNUC__) && defined(CUPTI_LIB)
#pragma GCC visibility pop
#endif
#if defined(__cplusplus)
}
#endif
#endif /*_CUPTI_ACTIVITY_H_*/