Custom Allocators
CUDA provides a variety of memory allocation/deallocation APIs, depending on the purpose:
- Pinned host memory: cudaMallocHost / cudaFreeHost
- Managed memory, shared between host and device: cudaMallocManaged / cudaFree
- Device memory: cudaMalloc / cudaFree
Leaving the device side aside for the moment, host code ends up juggling three kinds of allocation/deallocation: the first two pairs above plus the usual new[] / delete[].
When a C++ programmer handles (variable-length) arrays, std::vector is the everyday tool, and std::vector<T> (by default) relies on new[] / delete[] for its internal memory management.
If that memory management could be swapped out for cudaMallocHost / cudaFreeHost or cudaMallocManaged / cudaFree, we would get a pinned vector and a vector shared by host and device. Convenient.
So I implemented it. While I was at it, I also added unique_device_ptr and an exception type for CUDA runtime errors.
```cpp
/* cuda_except.h */
#ifndef CUDA_EXCEPT_H_
#define CUDA_EXCEPT_H_

#include <cuda_runtime.h>
#include <stdexcept>

namespace cu {

class cuda_error : public std::runtime_error {
  cudaError_t err_;
public:
  cuda_error(cudaError_t error)
    : std::runtime_error(cudaGetErrorString(error)), err_(error) {}
  cudaError_t code() const { return err_; }
  const char* name() const { return cudaGetErrorName(err_); }
};

}

#endif
```
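A quick usage sketch (not part of the header itself): wrap any cudaError_t-returning call and throw on failure. what() carries the message from cudaGetErrorString, name() the enum name. The full sample at the end does the same thing via a cuda_check helper.

```cpp
#include "cuda_except.h"
#include <cstdio>

int main() {
  try {
    cudaError_t st = cudaSetDevice(0);          // any cudaError_t-returning call
    if (st != cudaSuccess) throw cu::cuda_error(st);
  } catch (const cu::cuda_error& e) {
    std::fprintf(stderr, "%s : %s\n", e.name(), e.what());
    return 1;
  }
}
```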
```cpp
/* cuda_allocator.h */
#ifndef CUDA_ALLOCATOR_H_
#define CUDA_ALLOCATOR_H_

#include "cuda_except.h"
#include <cuda_runtime.h>
#include <new>  // std::bad_array_new_length

namespace cu {

// Pinned (page-locked) host memory: cudaMallocHost / cudaFreeHost
template <class T>
struct host_allocator {
  typedef T value_type;
  host_allocator() noexcept {}  // default ctor not required by the STL
  template<class U> host_allocator(const host_allocator<U>&) noexcept {}
  template<class U> bool operator==(const host_allocator<U>&) const noexcept { return true;  }
  template<class U> bool operator!=(const host_allocator<U>&) const noexcept { return false; }
  T* allocate(const size_t n) const;
  void deallocate(T* const p) const noexcept;
  void deallocate(T* const p, size_t) const noexcept { deallocate(p); }
};

// Managed (unified) memory: cudaMallocManaged / cudaFree
template <class T>
struct managed_allocator {
  typedef T value_type;
  managed_allocator() noexcept {}  // default ctor not required by the STL
  template<class U> managed_allocator(const managed_allocator<U>&) noexcept {}
  template<class U> bool operator==(const managed_allocator<U>&) const noexcept { return true;  }
  template<class U> bool operator!=(const managed_allocator<U>&) const noexcept { return false; }
  T* allocate(const size_t n) const;
  void deallocate(T* const p) const noexcept;
  void deallocate(T* const p, size_t) const noexcept { deallocate(p); }
};

// Device memory: cudaMalloc / cudaFree
template <class T>
struct device_allocator {
  typedef T value_type;
  device_allocator() noexcept {}  // default ctor not required by the STL
  template<class U> device_allocator(const device_allocator<U>&) noexcept {}
  template<class U> bool operator==(const device_allocator<U>&) const noexcept { return true;  }
  template<class U> bool operator!=(const device_allocator<U>&) const noexcept { return false; }
  T* allocate(const size_t n) const;
  void deallocate(T* const p) const noexcept;
  void deallocate(T* const p, size_t) const noexcept { deallocate(p); }
};

template <class T>
T* host_allocator<T>::allocate(const size_t n) const {
  if ( n == 0 ) return nullptr;
  if ( n > static_cast<size_t>(-1) / sizeof(T) ) throw std::bad_array_new_length();
  void* pv = nullptr;
  cudaError_t err = cudaMallocHost(&pv, n*sizeof(T));
  if ( err != cudaSuccess ) throw cuda_error(err);
  return static_cast<T*>(pv);
}

template<class T>
void host_allocator<T>::deallocate(T* const p) const noexcept {
  cudaFreeHost(p);  // deallocate is noexcept: the error code is deliberately ignored
}

template <class T>
T* managed_allocator<T>::allocate(const size_t n) const {
  if ( n == 0 ) return nullptr;
  if ( n > static_cast<size_t>(-1) / sizeof(T) ) throw std::bad_array_new_length();
  void* pv = nullptr;
  cudaError_t err = cudaMallocManaged(&pv, n*sizeof(T));
  if ( err != cudaSuccess ) throw cuda_error(err);
  return static_cast<T*>(pv);
}

template<class T>
void managed_allocator<T>::deallocate(T* const p) const noexcept {
  cudaFree(p);  // deallocate is noexcept: the error code is deliberately ignored
}

template <class T>
T* device_allocator<T>::allocate(const size_t n) const {
  if ( n == 0 ) return nullptr;
  if ( n > static_cast<size_t>(-1) / sizeof(T) ) throw std::bad_array_new_length();
  void* pv = nullptr;
  cudaError_t err = cudaMalloc(&pv, n*sizeof(T));
  if ( err != cudaSuccess ) throw cuda_error(err);
  return static_cast<T*>(pv);
}

template<class T>
void device_allocator<T>::deallocate(T* const p) const noexcept {
  cudaFree(p);  // deallocate is noexcept: the error code is deliberately ignored
}

}

#endif
```
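With these in place, swapping std::vector's memory source is a one-line alias: the allocator is simply vector's second template parameter. A minimal sketch (the alias names host_vector / managed_vector and the scale kernel are my own, not part of the header):

```cpp
#include "cuda_allocator.h"
#include <vector>

// Alias names are mine; any name works.
template<typename T> using host_vector    = std::vector<T, cu::host_allocator<T>>;    // pinned
template<typename T> using managed_vector = std::vector<T, cu::managed_allocator<T>>; // unified

__global__ void scale(float* p, float s) { p[threadIdx.x] *= s; }

int main() {
  managed_vector<float> v(32, 1.0f);
  scale<<<1, 32>>>(v.data(), 2.0f);  // the kernel writes straight into the vector's storage
  cudaDeviceSynchronize();           // required before the host reads the results
  // v[0] == 2.0f from here on
}
```

Note that device_allocator, by contrast, is not meant for std::vector: the vector would try to construct its elements through the device pointer on the host side. It exists for raw device buffers, paired with the unique_device_ptr below.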
```cpp
/* unique_device_ptr.h */
#ifndef UNIQUE_DEVICE_PTR_H_
#define UNIQUE_DEVICE_PTR_H_

#include <cuda_runtime.h>
#include <memory>

namespace cu {

template<typename T>
struct device_delete {
  device_delete() noexcept = default;
  void operator()(T* ptr) const { cudaFree(ptr); }
};

template<typename T>
struct device_delete<T[]> {
  device_delete() noexcept = default;
  void operator()(T* ptr) const { cudaFree(ptr); }
};

template<typename T>
using device_unique_ptr = std::unique_ptr<T, device_delete<T>>;

}

#endif
```
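A sketch of the intended pairing, assuming the two headers above: allocate with device_allocator, hand ownership to device_unique_ptr, and the buffer is released with cudaFree automatically.

```cpp
#include "cuda_allocator.h"
#include "unique_device_ptr.h"

void demo(size_t n) {
  cu::device_allocator<int> alloc;
  cu::device_unique_ptr<int[]> dev(alloc.allocate(n));  // cudaMalloc under the hood
  // ... launch kernels with dev.get() ...
}  // device_delete<int[]> calls cudaFree here, even if an exception unwinds the stack
```

Since cudaFree does not run destructors, this is only appropriate for trivially destructible element types.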
Using these pieces, I rewrote the familiar array-addition sample from the Windows CUDA Toolkit. Memory management and error handling get noticeably simpler.
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h> #include "cuda_allocator.h" #include "unique_device_ptr.h" #include <vector> template<typename T> using host_vector = std::vector<T, cu::host_allocator<T>>; void addWithCuda(int *c, const int *a, const int *b, unsigned int size); __global__ void addKernel(int *c, const int *a, const int *b) { int i = threadIdx.x; c[i] = a[i] + b[i]; } inline void cuda_check(cudaError_t status) { if (status != cudaSuccess) throw cu::cuda_error(status); } int main() { try { const int arraySize = 5; host_vector<int> a = { 1, 2, 3, 4, 5 }; host_vector<int> b = { 10, 20, 30, 40, 50 }; host_vector<int> c(arraySize, 0); // Add vectors in parallel. addWithCuda(c.data(), a.data(), b.data(), arraySize); printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n", c[0], c[1], c[2], c[3], c[4]); } catch ( const cu::cuda_error& er ) { fprintf(stderr, "%s : %s\n", er.name(), er.what()); } // cudaDeviceReset must be called before exiting in order for profiling and // tracing tools such as Nsight and Visual Profiler to show complete traces. cudaError_t cudaStatus = cudaDeviceReset(); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaDeviceReset failed!"); return 1; } } // Helper function for using CUDA to add vectors in parallel. void addWithCuda(int *c, const int *a, const int *b, unsigned int size) { cudaError_t cudaStatus; cu::device_allocator<int> alloc; // Choose which GPU to run on, change this on a multi-GPU system. cudaStatus = cudaSetDevice(0); cuda_check(cudaStatus); cu::device_unique_ptr<int[]> dev_a(alloc.allocate(size)); cu::device_unique_ptr<int[]> dev_b(alloc.allocate(size)); cu::device_unique_ptr<int[]> dev_c(alloc.allocate(size)); cudaStatus = cudaMemcpyAsync(dev_a.get(), a, size*sizeof(int), cudaMemcpyHostToDevice); cuda_check(cudaStatus); cudaStatus = cudaMemcpyAsync(dev_b.get(), b, size*sizeof(int), cudaMemcpyHostToDevice); cuda_check(cudaStatus); // Launch a kernel on the GPU with one thread for each element. addKernel<<<1, size>>>(dev_c.get(), dev_a.get(), dev_b.get()); // Check for any errors launching the kernel cudaStatus = cudaGetLastError(); cuda_check(cudaStatus); cudaStatus = cudaMemcpyAsync(c, dev_c.get(), size*sizeof(int), cudaMemcpyDeviceToHost); cuda_check(cudaStatus); // cudaDeviceSynchronize waits for the kernel to finish, and returns // any errors encountered during the launch. cudaStatus = cudaDeviceSynchronize(); cuda_check(cudaStatus); }
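For comparison, here is a sketch of the same addition using managed_allocator instead. Since host and device share the storage, the explicit cudaMemcpyAsync calls disappear entirely (managed_vector is again an alias of my own, not part of the headers above):

```cpp
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

#include "cuda_allocator.h"
#include <vector>

template<typename T>
using managed_vector = std::vector<T, cu::managed_allocator<T>>;

__global__ void addKernel(int *c, const int *a, const int *b) {
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

int main() {
    const int arraySize = 5;
    managed_vector<int> a = { 1, 2, 3, 4, 5 };
    managed_vector<int> b = { 10, 20, 30, 40, 50 };
    managed_vector<int> c(arraySize, 0);

    // No cudaMemcpy: the kernel reads and writes the vectors' storage directly.
    addKernel<<<1, arraySize>>>(c.data(), a.data(), b.data());
    cudaDeviceSynchronize();  // kernel launches are asynchronous; sync before reading c

    printf("{%d,%d,%d,%d,%d}\n", c[0], c[1], c[2], c[3], c[4]);
}
```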