diff --git a/src/XINHE_Runtime/XINHE_Runtime.md b/src/XINHE_Runtime/XINHE_Runtime.md
new file mode 100644
index 0000000..3a9ebd0
--- /dev/null
+++ b/src/XINHE_Runtime/XINHE_Runtime.md
@@ -0,0 +1,5 @@
+# CMCC -- XINHE Runtime
+
+XINHE Runtime is a cross-arch runtime system for multi-vendor and multi-type architectures. XINHE Runtime discovers available functionality, manages multiple diverse programming
+systems (e.g., CUDA, HIP, Level Zero, DTK, Vasti) in the same application, represents data dependencies, orchestrates data movement proactively, and allows configurable work schedulers for diverse multi-vendor devices.
+
diff --git a/src/XINHE_Runtime/include/HYPERCROSS/hycr_runtime_api.h b/src/XINHE_Runtime/include/HYPERCROSS/hycr_runtime_api.h
new file mode 100644
index 0000000..be66051
--- /dev/null
+++ b/src/XINHE_Runtime/include/HYPERCROSS/hycr_runtime_api.h
@@ -0,0 +1,160 @@
+#ifndef HYCR_INCLUDE_HYCR_HYCR_RUNTIME_H
+#define HYCR_INCLUDE_HYCR_HYCR_RUNTIME_H
+
+/*
+ * Public C API of the XINHE runtime (hycr_* entry points).
+ * Under C++ every declaration below gets C linkage.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#else
+typedef int8_t bool;
+#endif
+
+#define HYCR_MAX_NPLATFORMS 64
+/* Fully parenthesized so the value survives use inside larger expressions. */
+#define HYCR_MAX_NDEVS ((1 << 8) - 1)
+
+/* Device type flags: one bit each, so several can be OR-ed (see hycr_ogpu). */
+#define hycr_default (1 << 5)
+#define hycr_cpu (1 << 6)
+#define hycr_gpu_nvidia (1 << 7)
+#define hycr_gpu_amd (1 << 8)
+#define hycr_gpu_intel (1 << 9)
+#define hycr_gpu_hygon (1 << 10)
+#define hycr_gpu_iltar (1 << 11)
+#define hycr_ogpu (hycr_gpu_nvidia | hycr_gpu_amd | hycr_gpu_intel | hycr_gpu_hygon | hycr_gpu_iltar)
+#define hycr_npu_ascend (1 << 13)
+#define hycr_dla_vasti (1 << 14)
+#define hycr_dla_enflame (1 << 15)
+
+/* Programming model identifiers. */
+#define hycr_cuda 1
+#define hycr_hip 3
+#define hycr_levelzero 4
+#define hycr_dtk 5
+#define hycr_ixc 6
+#define hycr_cann 7
+#define hycr_vasr 8
+
+/* Negative mode constants, parenthesized for macro hygiene. */
+/* NOTE(review): presumably the access modes taken by hycr_kernel_setmem*() -- confirm. */
+#define hycr_r (-1)
+#define hycr_w (-2)
+#define hycr_rw (-3)
+#define hycr_xr (-4)
+#define hycr_xw (-5)
+#define hycr_xrw (-6)
+
+/* NOTE(review): presumably element types for hycr_mem_reduce() -- confirm. */
+#define hycr_int (1 << 0)
+#define hycr_long (1 << 1)
+#define hycr_float (1 << 2)
+#define hycr_double (1 << 3)
+
+/* hycr_sum/hycr_max/hycr_min all carry the hycr_reduction bit. */
+#define hycr_normal (1 << 10)
+#define hycr_reduction (1 << 11)
+#define hycr_sum ((1 << 12) | hycr_reduction)
+#define hycr_max ((1 << 13) | hycr_reduction)
+#define hycr_min ((1 << 14) | hycr_reduction)
+
+/* NOTE(review): presumably the "param" keys of hycr_platform_info()/hycr_device_info() -- confirm. */
+#define hycr_platform 0x3401
+#define hycr_vendor 0x3402
+#define hycr_name 0x3403
+#define hycr_type 0x3404
+
+#define hycr_ncmds 1
+#define hycr_ncmds_kernel 2
+#define hycr_ncmds_memcpy 3
+#define hycr_cmds 4
+
+/* Opaque handles. */
+typedef struct _hycr_task* hycr_task;
+typedef struct _hycr_mem* hycr_mem;
+typedef struct _hycr_kernel* hycr_kernel;
+typedef struct _hycr_graph* hycr_graph;
+
+/* Callback signatures. */
+typedef int (*hycr_host_task)(void* params, const int* device);
+typedef int (*command_handler)(void* params, void* device);
+typedef int (*hook_task)(void* task);
+typedef int (*hook_command)(void* command);
+
+typedef int (*hycr_selector_kernel)(hycr_task task, void* params, char* kernel_name);
+
+extern int hycr_init(int* argc, char*** argv, int sync);
+extern int hycr_finalize(void);
+
+extern int hycr_env_set(const char* key, const char* value);
+extern int hycr_env_get(const char* key, char** value, size_t* vallen);
+
+extern int hycr_platform_count(int* nplatforms);
+extern int hycr_platform_info(int platform, int param, void* value, size_t* size);
+
+extern int hycr_device_count(int* ndevs);
+extern int hycr_device_info(int device, int param, void* value, size_t* size);
+extern int hycr_device_set_default(int device);
+extern int hycr_device_get_default(int* device);
+extern int hycr_device_synchronize(int ndevs, int* devices);
+
+extern int hycr_kernel_create(const char* name, hycr_kernel* kernel);
+extern int hycr_kernel_get(const char* name, hycr_kernel* kernel);
+extern int hycr_kernel_setarg(hycr_kernel kernel, int idx, size_t size, void* value);
+extern int hycr_kernel_setmem(hycr_kernel kernel, int idx, hycr_mem mem, size_t mode);
+extern int hycr_kernel_setmem_off(hycr_kernel kernel, int idx, hycr_mem mem, size_t off, size_t mode);
+extern int hycr_kernel_setmap(hycr_kernel kernel, int idx, void* host, size_t mode);
+extern int hycr_kernel_release(hycr_kernel kernel);
+
+extern int hycr_create(hycr_task* task);
+extern int hycr_create_name(const char* name, hycr_task* task);
+extern int hycr_depend(hycr_task task, int ntasks, hycr_task* tasks);
+extern int hycr_malloc(hycr_task task, hycr_mem mem);
+extern int hycr_cmd_reset_mem(hycr_task task, hycr_mem mem, uint8_t reset);
+extern int hycr_h2d(hycr_task task, hycr_mem mem, size_t off, size_t size, void* host);
+extern int hycr_h2d_offsets(hycr_task task, hycr_mem mem, size_t *off, size_t *host_sizes, size_t *dev_sizes, size_t elem_size, int dim, void* host);
+extern int hycr_d2h(hycr_task task, hycr_mem mem, size_t off, size_t size, void* host);
+extern int hycr_d2h_offsets(hycr_task task, hycr_mem mem, size_t *off, size_t *host_sizes, size_t *dev_sizes, size_t elem_size, int dim, void* host);
+extern int hycr_dmem_flush_out(hycr_task task, hycr_mem mem);
+extern int hycr_h2d_full(hycr_task task, hycr_mem mem, void* host);
+extern int hycr_d2h_full(hycr_task task, hycr_mem mem, void* host);
+
+extern int hycr_arch_kernel_object(hycr_task task, hycr_kernel kernel, int dim, size_t* off, size_t* gws, size_t* lws);
+extern int hycr_arch_kernel_selector(hycr_task task, hycr_selector_kernel func, void* params, size_t params_size);
+extern int hycr_arch_submit(hycr_task task, int device, const char* opt, int sync);
+extern int hycr_arch_release(hycr_task task);
+
+/*
+ * NOTE(review): hycr_mem_create is declared twice below with different
+ * parameter lists. Functions with C linkage cannot be overloaded, so one of
+ * the two needs a distinct name that matches the implementation.
+ */
+extern int hycr_mem_create(size_t size, hycr_mem* mem);
+extern int hycr_mem_init_reset(hycr_mem mem, int reset);
+extern int hycr_mem_create(hycr_mem* mem, void *host, size_t size);
+extern int hycr_mem_update(hycr_mem mem, void *host);
+extern int hycr_mem_create_region(hycr_mem* mem, hycr_mem root_mem, int region);
+extern int hycr_mem_enable_outer_dim_regions(hycr_mem mem);
+extern int hycr_mem_create_tile(hycr_mem* mem, void *host, size_t *off, size_t *host_size, size_t *dev_size, size_t elem_size, int dim);
+extern int hycr_mem_arch(hycr_mem mem, int device, void** arch);
+extern int hycr_mem_reduce(hycr_mem mem, int mode, int type);
+extern int hycr_mem_release(hycr_mem mem);
+
+extern int hycr_record_start(void);
+extern int hycr_record_stop(void);
+
+extern int hycr_timer_now(double* time);
+extern void hycr_disable_consistency_check(void);
+extern void hycr_enable_consistency_check(void);
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif /* HYCR_INCLUDE_HYCR_HYCR_RUNTIME_H */
+
diff --git a/src/XINHE_Runtime/src/runtime/Device.h b/src/XINHE_Runtime/src/runtime/Device.h
new file mode 100644
index 0000000..feb582d
--- /dev/null
+++ b/src/XINHE_Runtime/src/runtime/Device.h
@@ -0,0 +1,132 @@
+#ifndef HYCR_SRC_RT_DEVICE_H
+#define HYCR_SRC_RT_DEVICE_H
+
+#include "Debug.h"
+#include "Config.h"
+#include "Timer.h"
+#include <map>
+
+#ifndef ASYNC_STREAMING
+  #define SYNC_EXECUTION
+#endif
+
+namespace hycr {
+namespace runtime {
+
+/*
+ * Abstract base class for a single device managed by the runtime.
+ * Back ends such as DeviceCUDA implement the pure virtual hooks.
+ *
+ * NOTE(review): Task, Command, Kernel, BaseMem, DataMem, DataMemRegion,
+ * DMemType and Worker are not declared in this header; they have to come
+ * from the headers included above.
+ */
+class Device {
+public:
+  Device(int devs, int platform);
+  virtual ~Device();
+
+  virtual void TaskPre(Task* task) { return; }
+  virtual void TaskPost(Task* task) { return; }
+
+  void Execute(Task* task);
+
+  void ExecuteInit(Command* cmd);
+  virtual void ExecuteKernel(Command* cmd);
+  void ExecuteMalloc(Command* cmd);
+
+  void InvokeDMemInDataTransfer(Task *task, Command *cmd, DMemType *mem);
+  void ExecuteMemResetInput(Task *task, Command* cmd);
+  void ExecuteMemIn(Task *task, Command* cmd);
+  void ExecuteMemInDMemIn(Task *task, Command* cmd, DataMem *mem);
+  void ExecuteMemInDMemRegionIn(Task *task, Command* cmd, DataMemRegion *mem);
+  void ExecuteMemOut(Task *task, Command* cmd);
+  void ExecuteMemFlushOut(Command* cmd);
+
+  void ExecuteH2D(Command* cmd);
+  void ExecuteH2DNP(Command* cmd);
+  void ExecuteD2H(Command* cmd);
+  void ExecuteMap(Command* cmd);
+  void ExecuteReleaseMem(Command* cmd);
+  void ExecuteHost(Command* cmd);
+
+  /* Back-end hooks: "= 0" members are mandatory, the others have defaults. */
+  virtual int ResetMemory(BaseMem *mem, uint8_t reset_value)=0;
+  virtual void ResetContext() { }
+  virtual bool IsContextChangeRequired() { return false; }
+  virtual int Compile(char* src) { return HYCR_SUCCESS; }
+  virtual int Init() = 0;
+  virtual int BuildProgram(char* path) { return HYCR_SUCCESS; }
+  virtual int MemAlloc(void** mem, size_t size, bool reset=false) = 0;
+  virtual int MemFree(void* mem) = 0;
+  virtual int MemD2D(Task *task, BaseMem *mem, void *dst, void *src, size_t size) { _error("Device:%d:%s doesn't support MemD2D", devs_, name()); return HYCR_ERROR; }
+  virtual int MemH2D(Task *task, BaseMem* mem, size_t *off, size_t *host_sizes, size_t *dev_sizes, size_t elem_size, int dim, size_t size, void* host, const char *tag="") = 0;
+  virtual int MemD2H(Task *task, BaseMem* mem, size_t *off, size_t *host_sizes, size_t *dev_sizes, size_t elem_size, int dim, size_t size, void* host, const char *tag="") = 0;
+  virtual int KernelGet(Kernel *kernel, void** kernel_bin, const char* name) = 0;
+  virtual int KernelLaunchInit(Kernel* kernel) { return HYCR_SUCCESS; }
+  virtual int KernelSetArg(Kernel* kernel, int idx, int kindex, size_t size, void* value) = 0;
+  virtual int KernelSetMem(Kernel* kernel, int idx, int kindex, BaseMem* mem, size_t off) = 0;
+  virtual int KernelLaunch(Kernel* kernel, int dim, size_t* off, size_t* gws, size_t* lws) = 0;
+  virtual int Synchronize() = 0;
+  virtual int AddCallback(Task* task) = 0;
+  virtual int Custom(int tag, char* params) { return HYCR_SUCCESS; }
+  virtual int RecreateContext() { return HYCR_ERROR; }
+  virtual void SetPeerDevices(int *peers, int count) { }
+  virtual const char* kernel_src() { return " "; }
+  virtual const char* kernel_bin() { return " "; }
+
+  void set_shared_memory_buffers(bool flag=true) { shared_memory_buffers_ = flag; }
+  bool is_shared_memory_buffers() { return shared_memory_buffers_ && can_share_host_memory_; }
+  int platform() { return platform_; }
+  int devs() { return devs_; }
+  int type() { return type_; }
+  int model() { return model_; }
+  char* vendor() { return vendor_; }
+  char* name() { return name_; }
+  bool busy() { return busy_; }
+  bool idle() { return !busy_; }
+  bool enable() { return enable_; }
+  void enableD2D() { is_d2d_possible_ = true; }
+  bool isD2DEnabled() { return is_d2d_possible_; }
+  int ok() { return errid_; }
+  void set_worker(Worker* worker) { worker_ = worker; }
+  Worker* worker() { return worker_; }
+  double Now() { return timer_->Now(); }
+
+protected:
+  int devs_;
+  int platform_;
+  int type_;
+  int model_;
+  char vendor_[128];
+  char name_[256];
+  char version_[64];
+  int driver_version_;
+  size_t max_compute_units_;
+  size_t max_work_group_size_;
+  size_t max_work_item_sizes_[3];
+  int max_block_dims_[3];
+  int nqueues_;
+  int q_;
+  int errid_;
+
+  char kernel_path_[256];
+
+  bool busy_;
+  bool enable_;
+  bool shared_memory_buffers_;
+  bool can_share_host_memory_;
+  bool is_d2d_possible_;
+
+  /* Referenced by set_worker()/worker()/Now() above; the Device constructor is expected to initialize them. */
+  Worker* worker_;
+  Timer* timer_;
+
+  /* NOTE(review): the template arguments of this map are missing in the submitted text and must be restored. */
+  std::map cmd_handlers_;
+};
+
+} /* namespace runtime */
+} /* namespace hycr */
+
+#endif /* HYCR_SRC_RT_DEVICE_H */
diff --git a/src/XINHE_Runtime/src/runtime/DeviceCUDA.cpp b/src/XINHE_Runtime/src/runtime/DeviceCUDA.cpp
new file mode 100644
index 0000000..ad78e5b
--- /dev/null
+++ b/src/XINHE_Runtime/src/runtime/DeviceCUDA.cpp
@@ -0,0 +1,210 @@
+#include "DeviceCUDA.h"
+
+namespace hycr {
+namespace runtime {
+
+/*
+ * Stores the loader handles and queries the driver for the static properties
+ * (name, driver version, launch limits) of `cudev`.
+ */
+DeviceCUDA::DeviceCUDA(CUDA* ld, Host2CUDA *host2cuda_ld, CUdevice cudev, int devs, int platform) : Device(devs, platform) {
+  ld_ = ld;
+  host2cuda_ld_ = host2cuda_ld;
+  peers_count_ = 0;
+  max_arg_idx_ = 0;
+  ngarbage_ = 0;
+  shared_mem_bytes_ = 0;
+  dev_ = cudev;
+
+  strcpy(vendor_, "NVIDIA");
+  enableD2D();
+  err_ = ld_->cuDeviceGetName(name_, sizeof(name_), dev_);
+  _cuerror(err_);
+  /* The public API header names this flag hycr_gpu_nvidia; hycr_nvidia is not defined there. */
+  type_ = hycr_gpu_nvidia;
+  model_ = hycr_cuda;
+  err_ = ld_->cuDriverGetVersion(&driver_version_);
+  _cuerror(err_);
+  snprintf(version_, sizeof(version_), "NVIDIA CUDA %d", driver_version_);
+
+  /*
+   * cuDeviceGetAttribute(int* pi, attrib, dev) returns the value through its
+   * first argument, so each query gets its own destination variable.
+   */
+  int tb = 0, mc = 0, bx = 0, by = 0, bz = 0, dx = 0, dy = 0, dz = 0, ck = 0, ae = 0;
+  struct { int* out; CUdevice_attribute attr; } queries[] = {
+    { &tb, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK },
+    { &mc, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT },
+    { &bx, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X },
+    { &by, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y },
+    { &bz, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z },
+    { &dx, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X },
+    { &dy, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y },
+    { &dz, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z },
+    { &ck, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS },
+    { &ae, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT },
+  };
+  for (size_t i = 0; i < sizeof(queries) / sizeof(queries[0]); i++) {
+    err_ = ld_->cuDeviceGetAttribute(queries[i].out, queries[i].attr, dev_);
+    _cuerror(err_);
+  }
+
+  max_work_group_size_ = tb;
+  max_compute_units_ = mc;
+  max_block_dims_[0] = bx;
+  max_block_dims_[1] = by;
+  max_block_dims_[2] = bz;
+  /* NOTE(review): assumes the per-axis work-item limit is block dim x grid dim -- confirm. */
+  max_work_item_sizes_[0] = (size_t) bx * (size_t) dx;
+  max_work_item_sizes_[1] = (size_t) by * (size_t) dy;
+  max_work_item_sizes_[2] = (size_t) bz * (size_t) dz;
+  /* Concurrent-kernel and async-engine counts are queried but not stored anywhere yet. */
+  (void) ck;
+  (void) ae;
+}
+
+/*
+ * Runs the optional host2cuda finalizers when the loader provides them.
+ * NOTE(review): these symbols are looked up on ld_ (the CUDA loader) while
+ * host2cuda_ld_ is stored but never used in this file -- confirm which
+ * loader actually owns them.
+ */
+DeviceCUDA::~DeviceCUDA() {
+  if (ld_->hycr_host2cuda_finalize){
+    ld_->hycr_host2cuda_finalize();
+  }
+  if (ld_->hycr_host2cuda_finalize_handles){
+    ld_->hycr_host2cuda_finalize_handles(dev_);
+  }
+}
+
+/*
+ * Compiles `src` to PTX at kernel_path_ by invoking nvcc through system().
+ * Returns HYCR_SUCCESS, or HYCR_ERROR after bumping the platform error count.
+ * NOTE(review): both paths reach the shell unquoted; do not pass untrusted
+ * file names here.
+ */
+int DeviceCUDA::Compile(char* src) {
+  char cmd[1024];
+  /* snprintf cannot overrun cmd; a truncated command is rejected instead of being run. */
+  const int len = snprintf(cmd, sizeof(cmd), "nvcc -ptx %s -o %s", src, kernel_path_);
+  if (len < 0 || (size_t) len >= sizeof(cmd)) {
+    _error("nvcc command too long for src[%s]", src);
+    worker_->platform()->IncrementErrorCount();
+    return HYCR_ERROR;
+  }
+  if (system(cmd) != EXIT_SUCCESS) {
+    _error("cmd[%s]", cmd);
+    worker_->platform()->IncrementErrorCount();
+    return HYCR_ERROR;
+  }
+  return HYCR_SUCCESS;
+}
+
+/*
+ * Creates the context and nqueues_ streams, then loads the module at
+ * kernel_path_ when that file can be read. An unreadable kernel file is not an error.
+ */
+int DeviceCUDA::Init() {
+  err_ = ld_->cudaSetDevice(dev_);
+  _cuerror(err_);
+  err_ = ld_->cuCtxCreate(&ctx_, CU_CTX_SCHED_AUTO, dev_);
+  /* Report the context result here, before EnablePeerAccess() runs. */
+  _cuerror(err_);
+  if (err_ != CUDA_SUCCESS) {
+    worker_->platform()->IncrementErrorCount();
+    return HYCR_ERROR;
+  }
+  EnablePeerAccess();
+
+  for (int i = 0; i < nqueues_; i++) {
+    err_ = ld_->cuStreamCreate(streams_ + i, CU_STREAM_DEFAULT);
+    _cuerror(err_);
+  }
+
+  char* path = kernel_path_;
+  char* src = NULL;
+  size_t srclen = 0;
+  if (Utils::ReadFile(path, &src, &srclen) == HYCR_ERROR) {
+    return HYCR_SUCCESS;
+  }
+  /* The contents are unused; the read only proves the file is readable. */
+  if (src) free(src);
+
+  err_ = ld_->cuModuleLoad(&module_, path);
+  if (err_ != CUDA_SUCCESS) {
+    _cuerror(err_);
+    /* Device::platform() returns an int id; the error counter is reached through the worker. */
+    worker_->platform()->IncrementErrorCount();
+    return HYCR_ERROR;
+  }
+  return HYCR_SUCCESS;
+}
+
+/* Sets mem->size() bytes at mem->arch(this) to reset_value. */
+int DeviceCUDA::ResetMemory(BaseMem *mem, uint8_t reset_value) {
+  err_ = ld_->cudaMemset(mem->arch(this), reset_value, mem->size());
+  _cuerror(err_);
+  if (err_ != CUDA_SUCCESS){
+    worker_->platform()->IncrementErrorCount();
+    return HYCR_ERROR;
+  }
+  return HYCR_SUCCESS;
+}
+
+/*
+ * Allocates `size` bytes of device memory into *mem and, if `reset` is set,
+ * zero-fills it. Returns HYCR_ERROR when either step fails; after a failed
+ * zero-fill the allocation is left in *mem.
+ */
+int DeviceCUDA::MemAlloc(void** mem, size_t size, bool reset) {
+  CUdeviceptr* cumem = (CUdeviceptr*) mem;
+  err_ = ld_->cuMemAlloc(cumem, size);
+  _cuerror(err_);
+  /* Bail out first: *mem is not a valid target for memset if the allocation failed. */
+  if (err_ != CUDA_SUCCESS){
+    worker_->platform()->IncrementErrorCount();
+    return HYCR_ERROR;
+  }
+  if (!reset) return HYCR_SUCCESS;
+
+  err_ = ld_->cudaMemset(*mem, 0, size);
+  _cuerror(err_);
+  if (err_ != CUDA_SUCCESS){
+    worker_->platform()->IncrementErrorCount();
+    return HYCR_ERROR;
+  }
+  return HYCR_SUCCESS;
+}
+
+/*
+ * Appends `mem` to the garbage list instead of freeing it here; the list is
+ * presumably drained by ClearGarbage() (declared in DeviceCUDA.h) -- confirm.
+ * NOTE(review): when the list is full the pointer is only logged, i.e. dropped.
+ */
+int DeviceCUDA::MemFree(void* mem) {
+  CUdeviceptr cumem = (CUdeviceptr) mem;
+  if (ngarbage_ >= HYCR_MAX_GABAGES) _error("ngarbage[%d]", ngarbage_);
+  else garbage_[ngarbage_++] = cumem;
+  /*
+  _trace("dptr[%p]", cumem);
+  err_ = ld_->cuMemFree(cumem);
+  _cuerror(err_);
+  */
+  return HYCR_SUCCESS;
+}
+
+/* Calls cuCtxSynchronize() and maps a failure to HYCR_ERROR. */
+int DeviceCUDA::Synchronize() {
+  err_ = ld_->cuCtxSynchronize();
+  _cuerror(err_);
+  if (err_ != CUDA_SUCCESS){
+    worker_->platform()->IncrementErrorCount();
+    return HYCR_ERROR;
+  }
+  return HYCR_SUCCESS;
+}
+
+} /* namespace runtime */
+} /* namespace hycr */
+
diff --git a/src/XINHE_Runtime/src/runtime/DeviceCUDA.h b/src/XINHE_Runtime/src/runtime/DeviceCUDA.h
new file mode 100644
index 0000000..aacda5a
--- /dev/null
+++ b/src/XINHE_Runtime/src/runtime/DeviceCUDA.h
@@ -0,0 +1,81 @@
+#ifndef HYCR_SRC_RT_DEVICE_CUDA_H
+#define HYCR_SRC_RT_DEVICE_CUDA_H
+
+#include "Device.h"
+#include <map>
+
+#define HYCR_MAX_GABAGES 256
+
+namespace hycr {
+namespace runtime {
+
+/*
+ * CUDA back end of Device; driver calls are issued through ld_.
+ *
+ * NOTE(review): the constructor takes CUDA* / Host2CUDA* while the members it
+ * fills are LoaderCUDA* / LoaderHost2CUDA* -- confirm these name the same types.
+ */
+class DeviceCUDA : public Device {
+public:
+  DeviceCUDA(CUDA* ld, Host2CUDA *host2cuda_ld, CUdevice cudev, int devs, int platform);
+  ~DeviceCUDA();
+
+  int Compile(char* src);
+  int Init();
+  int ResetMemory(BaseMem *mem, uint8_t reset_value);
+  int MemAlloc(void** mem, size_t size, bool reset=false);
+  int MemFree(void* mem);
+  void EnablePeerAccess();
+  void SetPeerDevices(int *peers, int count);
+  void MemCpy3D(CUdeviceptr dev, uint8_t *host, size_t *off,
+                size_t *dev_sizes, size_t *host_sizes,
+                size_t elem_size, bool host_2_dev);
+  int MemD2D(Task *task, BaseMem *mem, void *dst, void *src, size_t size);
+  int MemH2D(Task *task, BaseMem* mem, size_t *off, size_t *host_sizes, size_t *dev_sizes, size_t elem_size, int dim, size_t size, void* host, const char *tag="");
+  int MemD2H(Task *task, BaseMem* mem, size_t *off, size_t *host_sizes, size_t *dev_sizes, size_t elem_size, int dim, size_t size, void* host, const char *tag="");
+  int KernelGet(Kernel *kernel, void** kernel_bin, const char* name);
+  int KernelLaunchInit(Kernel* kernel);
+  int KernelSetArg(Kernel* kernel, int idx, int kindex, size_t size, void* value);
+  int KernelSetMem(Kernel* kernel, int idx, int kindex, BaseMem* mem, size_t off);
+  int KernelLaunch(Kernel* kernel, int dim, size_t* off, size_t* gws, size_t* lws);
+  int Synchronize();
+  int AddCallback(Task* task);
+  int Custom(int tag, char* params);
+
+  const char* kernel_src() { return "KERNEL_SRC_CUDA"; }
+
+  virtual void TaskPre(Task* task);
+
+  int cudev() { return dev_; }
+  void ResetContext();
+  bool IsContextChangeRequired();
+
+private:
+  static void Callback(CUstream stream, CUresult status, void* data);
+  void ClearGarbage();
+
+private:
+  LoaderCUDA* ld_;
+  LoaderHost2CUDA* host2cuda_ld_;
+  CUdevice dev_;
+  CUdevice peers_[HYCR_MAX_NDEVS];
+  int peers_count_;
+  CUcontext ctx_;
+  CUstream streams_[HYCR_MAX_DEVICE_NQUEUES];
+  CUmodule module_;
+  CUresult err_;
+  unsigned int shared_mem_bytes_;
+  unsigned int shared_mem_offs_[HYCR_MAX_KERNEL_NARGS];
+  void* params_[HYCR_MAX_KERNEL_NARGS];
+  int max_arg_idx_;
+  CUdeviceptr garbage_[HYCR_MAX_GABAGES];
+  int ngarbage_;
+  /* NOTE(review): the template arguments of this map are missing in the submitted text and must be restored. */
+  std::map kernels_offs_;
+};
+
+} /* namespace runtime */
+} /* namespace hycr */
+
+#endif /* HYCR_SRC_RT_DEVICE_CUDA_H */
+