/** 
 * @file    kCudaUtils.h
 * @brief   Utility functions.
 *
 * @internal
 * Copyright (C) 2019-2022 by LMI Technologies Inc.  All rights reserved.
 */
#ifndef K_FIRESYNC_CUDA_UTILS_H
#define K_FIRESYNC_CUDA_UTILS_H

#include <kApi/Data/kArrayProvider.h>
#include <kFireSync/Cuda/kCudaDef.h>
#include <kFireSync/Cuda/kCudaAlloc.h>
#include <kFireSync/Cuda/kCudaUtils.x.h>

/**
 * @class   kCudaUtils
 * @ingroup kFireSync-Cuda
 * @brief   Collection of Cuda utility functions. 
 */

/** @name Environment Support */
//@{

/**
 * Reports the overall status of the Cuda processing environment.
 *
 * This query takes no arguments; the status is provided directly as the return value.
 *
 * @public    @memberof kCudaUtils
 * @return    Current Cuda environment status.
 */
kFsFx(kCudaEnvironmentStatus) kCudaUtils_EnvironmentStatus();

//@}

/** @name Value Support */
//@{

/**
 * Allocates storage for a value-type using the given allocator.
 *
 * If this method allocates serial managed memory, that memory will be automatically attached to the stream.
 *
 * @public              @memberof kCudaUtils
 * @param   stream      Cuda stream.
 * @param   allocator   Allocator from which the new buffer is obtained.
 * @param   pointer     Receives the allocated memory.
 * @param   sync        Synchronization mode; if unspecified, mode is determined from default stream sync mode.
 * @return              Operation status.
 */
template<class T>
kInlineFx(kStatus) kCudaUtils_AllocValue(kCudaStream stream, kAlloc allocator, T** pointer, kCudaSync sync = kCUDA_SYNC_DEFAULT)
{
    // Forward to the x-prefixed helper; T is deduced from the pointer argument.
    const kStatus allocStatus = xkCudaUtils_AllocValue(stream, allocator, pointer, sync);

    return allocStatus;
}

/**
 * Copies a single item of type T from a source location to a destination location.
 *
 * The transfer is performed by kCudaUtils_Copy, using sizeof(T) as the byte count.
 *
 * Serial managed memory is not automatically attached by this method; use kCudaUtils_AttachToStream if necessary.
 *
 * @public              @memberof kCudaUtils
 * @param   stream      Cuda stream.
 * @param   dest        Destination memory.
 * @param   source      Source memory.
 * @param   sync        Synchronization mode; if unspecified, mode is determined from default stream sync mode.
 * @return              Operation status.
 */
template<class T>
kInlineFx(kStatus) kCudaUtils_AssignValue(kCudaStream stream, T* dest, const T* source, kCudaSync sync = kCUDA_SYNC_DEFAULT)
{
    // Exactly one item is transferred, so the byte count is the size of T.
    const kSize itemSize = sizeof(T);

    return kCudaUtils_Copy(stream, dest, source, itemSize, sync);
}

/**
 * Clones an item from its existing location into a new buffer created with the specified allocator.
 *
 * Serial managed source memory is not automatically attached by this method; use kCudaUtils_AttachToStream if necessary.
 *
 * If this method allocates serial managed memory, that memory will be automatically attached to the stream.
 *
 * @public              @memberof kCudaUtils
 * @param   stream      Cuda stream.
 * @param   destAlloc   Allocator for new buffer.
 * @param   dest        Receives the newly-prepared memory.
 * @param   source      Source memory.
 * @param   sync        Synchronization mode; if unspecified, mode is determined from default stream sync mode.
 * @return              Operation status.
 */
template<class T>
kInlineFx(kStatus) kCudaUtils_CloneValue(kCudaStream stream, kAlloc destAlloc, T** dest, const T* source, kCudaSync sync = kCUDA_SYNC_DEFAULT)
{
    // Forward to the x-prefixed helper; T is deduced from the dest/source arguments.
    const kStatus cloneStatus = xkCudaUtils_CloneValue(stream, destAlloc, dest, source, sync);

    return cloneStatus;
}

/**
 * Releases a value buffer and resets the caller's buffer pointer to null.
 *
 * Equivalent to calling kAlloc_FreeRef; the typed T** parameter provides slightly more convenient/safe syntax.
 *
 * @public              @memberof kCudaUtils
 * @param   allocator   Memory allocator.
 * @param   pointer     Pointer to item buffer pointer.
 * @return              Operation status.
 */
template<class T>
kInlineFx(kStatus) kCudaUtils_FreeValueRef(kAlloc allocator, T** pointer)
{
    const kStatus freeStatus = kAlloc_FreeRef(allocator, pointer);

    return freeStatus;
}

//@}
/** @name Raw Memory Support */
//@{

/**
 * Allocates a block of memory and, where applicable, attaches it to the specified stream.
 * 
 * For serial managed memory, the buffer will be attached to the stream; for all other memory 
 * types, this method is equivalent to kAlloc_Get. 
 *
 * @public              @memberof kCudaUtils
 * @param   stream      Stream to which memory should be initially attached.
 * @param   alloc       Allocator object.
 * @param   size        Size of memory to allocate, in bytes.
 * @param   mem         Receives pointer to allocated memory. Although declared as void*, this argument 
 *                      is a pointer to a pointer (i.e., the address of the caller's pointer variable).
 * @param   sync        Synchronization mode; if unspecified, mode is determined from default stream sync mode.
 * @return              Operation status.
 */
kFsFx(kStatus) kCudaUtils_Allocate(kCudaStream stream, kAlloc alloc, kSize size, void* mem, kCudaSync sync = kCUDA_SYNC_DEFAULT);

/**
 * Attaches serial managed memory to a stream.
 *
 * Refer to @ref kCudaManagedAlloc for details on serial managed memory attachment.
 *
 * This method has no effect on other types of memory.
 *
 * @public            @memberof kCudaUtils
 * @param   stream    Cuda stream.
 * @param   pointer   Memory pointer.
 * @param   sync      Synchronization mode; if unspecified, mode is determined from default stream sync mode.
 * @return            Operation status.
 * @see               kCudaManagedAlloc, kCudaUtils_DetachFromStream
 */
kInlineFx(kStatus) kCudaUtils_AttachToStream(kCudaStream stream, void* pointer, kCudaSync sync = kCUDA_SYNC_DEFAULT)
{
    // The stream argument is first passed through xkCudaStream_Fallback; the result is the stream actually used.
    const kStatus attachStatus = kCudaStream_AttachMemory(xkCudaStream_Fallback(stream), pointer, sync);

    return attachStatus;
}
   
/**
 * Detaches serial managed memory from a stream.
 *
 * Refer to @ref kCudaManagedAlloc for details on serial managed memory attachment.
 *
 * This method has no effect on other types of memory.
 *
 * @public           @memberof kCudaUtils
 * @param   stream   Cuda stream.
 * @param   pointer  Memory pointer.
 * @param   sync     Synchronization mode; if unspecified, mode is determined from default stream sync mode.
 * @return           Operation status.
 * @see              kCudaManagedAlloc, kCudaUtils_AttachToStream
 */
kInlineFx(kStatus) kCudaUtils_DetachFromStream(kCudaStream stream, void* pointer, kCudaSync sync = kCUDA_SYNC_DEFAULT)
{
    // The stream argument is first passed through xkCudaStream_Fallback; the result is the stream actually used.
    const kStatus detachStatus = kCudaStream_DetachMemory(xkCudaStream_Fallback(stream), pointer, sync);

    return detachStatus;
}

/** 
 * Sets a block of memory to the given byte value. 
 * 
 * For non-device memory, this operation will always be synchronous (equivalent to kMemSet).
 *
 * @public              @memberof kCudaUtils
 * @param   stream      Cuda stream. 
 * @param   dest        Destination for the memory set operation.
 * @param   fill        Byte value with which each byte of the block is filled.
 * @param   size        Size of memory block to be set, in bytes.
 * @param   sync        Synchronization mode; if unspecified, mode is determined from default stream sync mode.
 * @return              Operation status.
 */
kFsFx(kStatus) kCudaUtils_MemSet(kCudaStream stream, void* dest, kByte fill, kSize size, kCudaSync sync = kCUDA_SYNC_DEFAULT);

/** 
 * Copies data from one memory address to another.
 * 
 * Supports arbitrary source/destination memory pointers; all memory types supported.
 *
 * kCudaUtils_AssignValue is a typed convenience wrapper around this function, for copying a single item.
 *
 * @public              @memberof kCudaUtils
 * @param   stream      Cuda stream. 
 * @param   dest        Destination for the memory copy.
 * @param   src         Source for the memory copy.
 * @param   size        Size of memory block to be copied, in bytes.
 * @param   sync        Synchronization mode; if unspecified, mode is determined from default stream sync mode.
 * @return              Operation status. 
 * @see                 kCudaUtils_AssignValue
 */
kFsFx(kStatus) kCudaUtils_Copy(kCudaStream stream, void* dest, const void* src, kSize size, kCudaSync sync = kCUDA_SYNC_DEFAULT);

//@}
/** @name Object/Array Support */
//@{

/**
 * Attaches the internal data buffer of an array to a stream.
 *
 * The array's data pointer is passed to kCudaUtils_AttachToStream. Refer to @ref kCudaManagedAlloc for
 * details on serial managed memory attachment.
 *
 * This method has no effect on other types of memory.
 *
 * @public              @memberof kCudaUtils
 * @param   stream      Cuda stream.
 * @param   array       Array to be attached to stream.
 * @param   sync        Synchronization mode; if unspecified, mode is determined from default stream sync mode.
 * @return              Operation status.
 * @see                 kCudaUtils_AttachToStream, kCudaUtils_DetachArrayFromStream
 */
kInlineFx(kStatus) kCudaUtils_AttachArrayToStream(kCudaStream stream, kArrayProvider array, kCudaSync sync = kCUDA_SYNC_DEFAULT)
{
    void* arrayData = kArrayProvider_Data(array);

    return kCudaUtils_AttachToStream(stream, arrayData, sync);
}

/**
 * Detaches the internal data buffer of an array from a stream.
 *
 * The array's data pointer is passed to kCudaUtils_DetachFromStream. Refer to @ref kCudaManagedAlloc for
 * details on serial managed memory attachment.
 *
 * This method has no effect on other types of memory.
 *
 * @public              @memberof kCudaUtils
 * @param   stream      Cuda stream.
 * @param   array       Array to be detached from stream.
 * @param   sync        Synchronization mode; if unspecified, mode is determined from default stream sync mode.
 * @return              Operation status.
 * @see                 kCudaUtils_DetachFromStream, kCudaUtils_AttachArrayToStream
 */
kInlineFx(kStatus) kCudaUtils_DetachArrayFromStream(kCudaStream stream, kArrayProvider array, kCudaSync sync = kCUDA_SYNC_DEFAULT)
{
    void* arrayData = kArrayProvider_Data(array);

    return kCudaUtils_DetachFromStream(stream, arrayData, sync);
}

/** 
 * Constructs a new object by copying an existing object, including any aggregated child elements. 
 * 
 * This method provides the same functionality as kObject_Clone, but allows the synchronization mode to be specified 
 * as a call argument, rather than automatically derived from the stream's default synchronization mode.
 *
 * @public                  @memberof kCudaUtils
 * @param   stream          Cuda stream. 
 * @param   object          Receives constructed object.
 * @param   source          Source object.
 * @param   objectAllocator Object memory allocator (or kNULL for default). 
 * @param   valueAllocator  Value memory allocator (or kNULL for default). 
 * @param   sync            Synchronization mode; if unspecified, mode is determined from default stream sync mode.
 * @return                  Operation status. 
 */
kFsFx(kStatus) kCudaUtils_CloneObject(kCudaStream stream, kObject* object, kObject source, kAlloc objectAllocator, kAlloc valueAllocator, kCudaSync sync = kCUDA_SYNC_DEFAULT);

/** 
 * Assigns the content of a source array to a destination array (array-based data types).
 *
 * This method provides the same functionality as kArrayProvider_Assign, but allows the synchronization mode to be specified 
 * as a call argument, rather than automatically derived from the stream's default synchronization mode. 
 * 
 * @public              @memberof kCudaUtils
 * @param   stream      Cuda stream. 
 * @param   destination Destination array (must be of same type as source array).
 * @param   source      Source array.
 * @param   sync        Synchronization mode; if unspecified, mode is determined from default stream sync mode.
 * @return              Operation status. 
 */
kFsFx(kStatus) kCudaUtils_AssignArray(kCudaStream stream, kArrayProvider destination, kArrayProvider source, kCudaSync sync = kCUDA_SYNC_DEFAULT);

/**
 * Reports whether the buffer of an array can potentially be accessed by a Cuda device.
 *
 * The decision is based on the traits of the array's value allocator:
 * - Cuda device memory and Cuda pinned memory are always reported as accessible.
 * - Cuda managed memory is reported as accessible if it is not serial, or if it is serial and isExclusive is true.
 * - All other memory is reported as inaccessible.
 *
 * For serial managed memory, exclusive ownership matters because memory that might be in use by other threads
 * cannot be safely attached to a stream for device use. If this method indicates that a serial managed memory
 * buffer can potentially be used by a device, remember to attach the buffer to a stream before attempting to
 * access the buffer from a device.
 *
 * @public                  @memberof kCudaUtils
 * @param   array           Array to be checked.
 * @param   isExclusive     Stipulates whether the array is owned exclusively (i.e., not simultaneously in use elsewhere).
 * @return                  kTRUE if memory is potentially device-accessible.
 */
kInlineFx(kBool) kCudaUtils_IsArrayDeviceAccessible(kArrayProvider array, kBool isExclusive = kTRUE)
{
    kAllocTrait valueTraits = kAlloc_Traits(kArrayProvider_ValueAlloc(array));

    // Device and pinned memory qualify unconditionally.
    if (kAllocTrait_IsCudaDevice(valueTraits) || kAllocTrait_IsCudaPinned(valueTraits))
    {
        return kTRUE;
    }

    // Managed memory qualifies unless it is serial and potentially shared.
    return kAllocTrait_IsCudaManaged(valueTraits) && (!kAllocTrait_IsSerial(valueTraits) || isExclusive);
}

/**
 * Constructs an array object, using a template object to determine properties.
 * 
 * This method is typically used to allocate an uninitialized array that is suitable to hold the result 
 * of a computation in device memory. However, any type of value allocator may be used. 
 * 
 * If serial managed memory is allocated by this method, it will be automatically attached to the stream. 
 *
 * @public                  @memberof kCudaUtils
 * @param   stream          Cuda stream. 
 * @param   outputArray     Receives output array.
 * @param   templateArray   Template array, used to determine the properties of the output array.
 * @param   objectAlloc     Object allocator to be used for the output array.
 * @param   valueAlloc      Data allocator to be used for the output array (if kNULL, the device allocator will be used). 
 * @param   sync            Synchronization mode; if unspecified, mode is determined from default stream sync mode.
 * @return                  Operation status.
 */
kFsFx(kStatus) kCudaUtils_ConstructArrayFromTemplate(kCudaStream stream, kArrayProvider* outputArray, kArrayProvider templateArray, kAlloc objectAlloc = kNULL, kAlloc valueAlloc = kNULL, kCudaSync sync = kCUDA_SYNC_DEFAULT);

/**
 * Ensures the specified input array is device-accessible by either cloning or sharing it.
 * 
 * The purpose of this method is to move an input array into device-accessible memory. However, copying an array is  
 * expensive; if the array is already suitable for device use, then copying should ideally be avoided. 
 * 
 * As such, if the source array is already device-accessible, then this method will usually just increase the source array 
 * reference count and provide the same array as its output. However, if the input array exists in serial managed memory, 
 * then ownership of the array must also be considered: the array must be attached to a stream before it can be safely 
 * accessed, but if the array is potentially in use by multiple CPU threads, then its stream affinity cannot be altered. 
 * Accordingly, for serial managed memory, the inputExclusive argument is also taken into account. If true, then the managed 
 * memory can be attached to the stream, enabling it to be used by a Cuda device. 
 * 
 * If the conditions described above for sharing the input array are not met, a device-accessible array is cloned from 
 * the input array and provided as the output. 
 * 
 * Regardless of how the output array is derived (shared or cloned), it is the caller's responsibility to destroy the 
 * output array when no longer needed.
 *
 * @public                  @memberof kCudaUtils
 * @param   stream          Cuda stream. 
 * @param   outputArray     Receives device-accessible output array.
 * @param   inputArray      Input array, which may or may not already be device accessible.
 * @param   copyData        If input array is not device accessible, should its data content be copied to the newly-constructed output array?
 * @param   objectAlloc     Object allocator to be used, if cloning is required.
 * @param   valueAlloc      Data allocator to be used, if cloning is required (must allocate device-accessible memory; if kNULL, the device allocator will be used). 
 * @param   inputExclusive  Is the input array exclusively owned (not shared)? If true, allows serial managed memory to be used as device memory.
 * @param   sync            Synchronization mode; if unspecified, mode is determined from default stream sync mode.
 * @return                  Operation status.
 * @see                     kCudaUtils_IsArrayDeviceAccessible, kCudaUtils_ConstructArrayFromTemplate
 */
kFsFx(kStatus) kCudaUtils_PrepareDeviceArray(kCudaStream stream, kArrayProvider* outputArray, kArrayProvider inputArray, kBool copyData = kTRUE, kAlloc objectAlloc = kNULL, kAlloc valueAlloc = kNULL, kBool inputExclusive = kTRUE, kCudaSync sync = kCUDA_SYNC_DEFAULT);

//@}

#endif
