東方算程譚

επιστημηがヨタをこく、弾幕とは無縁のCUDAなタワゴト

カスタム・アロケータ

CUDAでは目的/用途に応じて様々なメモリの確保/解放APIが用意されています。

  1. ピン留めされたHost-memory : cudaMallocHost / cudaFreeHost
  2. Host/Device双方で共用できるManaged-memory : cudaMallocManaged / cudaFree
  3. Device-memory : cudaMalloc : cudaFree

Device側はさておき、Host側は上記1.2および通常のnew[] / delete[] の3種のメモリ確保/解放を使い分けることになります。

C++屋が(可変長)配列を扱う際。日常的にstd::vectorのお世話になるのですが、std::vector<T>は(デフォルトで)new[]/delete[]が内部的なメモリ管理に使われます。

このメモリ管理をcudaMallocHost/cudaFreeHostあるいはcudaMallocManaged/cudaFreeに差し替えることができれば ピン留めされたvectorHost/Deviceの双方でで共用できるvector が使えて便利。

ってわけで実装しました。ついでに unnique_device_ptr と cuda runtime 例外も。

/* cuda_except.h */
#ifndef CUDA_EXCEPT_H_
#define CUDA_EXCEPT_H_

#include <cuda_runtime.h>
#include <stdexcept>

namespace cu {

class cuda_error : public std::runtime_error {
  cudaError_t err_;
public:
  cuda_error(cudaError_t error) : std::runtime_error(cudaGetErrorString(error)), err_(error) {}
  cudaError_t code() const { return err_; }
  const char* name() const { return cudaGetErrorName(err_); }
};

}
#endif
/* cuda_allocator.h */
#ifndef CUDA_ALLOCATOR_H_
#define CUDA_ALLOCATOR_H_

#include "cuda_except.h"

namespace cu {

template <class T>
struct host_allocator {
  typedef T value_type;
  host_allocator() noexcept {} //default ctor not required by STL
  template<class U> host_allocator(const host_allocator<U>&) noexcept {}
  template<class U> bool operator==(const host_allocator<U>&) const noexcept { return true; }
  template<class U> bool operator!=(const host_allocator<U>&) const noexcept { return false; }
  T* allocate(const size_t n) const;
  void deallocate(T* const p) const noexcept;
  void deallocate(T* const p, size_t) const noexcept { deallocate(p); }
};

template <class T>
struct managed_allocator {
  typedef T value_type;
  managed_allocator() noexcept {} //default ctor not required by STL
  template<class U> managed_allocator(const managed_allocator<U>&) noexcept {}
  template<class U> bool operator==(const managed_allocator<U>&) const noexcept { return true; }
  template<class U> bool operator!=(const managed_allocator<U>&) const noexcept { return false; }
  T* allocate(const size_t n) const;
  void deallocate(T* const p) const noexcept;
  void deallocate(T* const p, size_t) const noexcept { deallocate(p); }
};

template <class T>
struct device_allocator {
  typedef T value_type;
  device_allocator() noexcept {} //default ctor not required by STL
  template<class U> device_allocator(const device_allocator<U>&) noexcept {}
  template<class U> bool operator==(const device_allocator<U>&) const noexcept { return true; }
  template<class U> bool operator!=(const device_allocator<U>&) const noexcept { return false; }
  T* allocate(const size_t n) const;
  void deallocate(T* const p) const noexcept;
  void deallocate(T* const p, size_t) const noexcept { deallocate(p); }
};

#include <cuda_runtime.h>

template <class T>
T* host_allocator<T>::allocate(const size_t n) const {
  if ( n == 0 ) return nullptr;
  if ( n > static_cast<size_t>(-1) / sizeof(T) ) throw std::bad_array_new_length();
  void* pv = nullptr;
  cudaError_t err = cudaMallocHost(&pv, n*sizeof(T));
  if ( err != cudaSuccess ) throw cuda_error(err);
  return static_cast<T*>(pv);
}

template<class T> 
void host_allocator<T>::deallocate(T * const p) const noexcept{
  cudaError_t err = cudaFreeHost(p);
//if ( err != cudaSuccess ) throw cuda_error(err);
}

template <class T>
T* managed_allocator<T>::allocate(const size_t n) const {
  if ( n == 0 ) return nullptr;
  if ( n > static_cast<size_t>(-1) / sizeof(T) ) throw std::bad_array_new_length();
  void* pv = nullptr;
  cudaError_t err = cudaMallocManaged(&pv, n*sizeof(T));
  if ( err != cudaSuccess ) throw cuda_error(err);
  return static_cast<T*>(pv);
}

template<class T> 
void managed_allocator<T>::deallocate(T * const p) const noexcept {
  cudaError_t err = cudaFree(p);
//if ( err != cudaSuccess ) throw cuda_error(err);
}

template <class T>
T* device_allocator<T>::allocate(const size_t n) const {
  if ( n == 0 ) return nullptr;
  if ( n > static_cast<size_t>(-1) / sizeof(T) ) throw std::bad_array_new_length();
  void* pv = nullptr;
  cudaError_t err = cudaMalloc(&pv, n*sizeof(T));
  if ( err != cudaSuccess ) throw cuda_error(err);
  return static_cast<T*>(pv);
}

template<class T> 
void device_allocator<T>::deallocate(T * const p) const noexcept {
  cudaError_t err = cudaFree(p);
//if ( err != cudaSuccess ) throw cuda_error(err);
}

}
#endif
/* unique_device_ptr.h */
#ifndef UNIQUE_DEVICE_PTR_H_
#define UNIQUE_DEVICE_PTR_H_

#include <cuda_runtime.h>
#include <memory>

namespace cu {

template<typename T> struct device_delete {
  device_delete() noexcept = default;
  void operator()(T* ptr) const { cudaFree(ptr); }
};

template<typename T> struct device_delete<T[]> {
  device_delete() noexcept = default;
  void operator()(T* ptr) const { cudaFree(ptr); }
};

template<typename T>
using device_unique_ptr = std::unique_ptr<T,device_delete<T>>;

}

#endif

Windows版CUDA Toolkitではおなじみの配列の足し算をre-writeしてみました。メモリ管理とエラー処理がぐっと楽になります。

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include "cuda_allocator.h"
#include "unique_device_ptr.h"

#include <vector>

template<typename T> using host_vector = std::vector<T, cu::host_allocator<T>>;

void addWithCuda(int *c, const int *a, const int *b, unsigned int size);

__global__ void addKernel(int *c, const int *a, const int *b) {
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

inline void cuda_check(cudaError_t status) {
  if (status != cudaSuccess) throw cu::cuda_error(status);
}

int main() {
  try {
    const int arraySize = 5;
    host_vector<int> a = {  1,  2,  3,  4,  5 };
    host_vector<int> b = { 10, 20, 30, 40, 50 };
    host_vector<int> c(arraySize, 0);

    // Add vectors in parallel.
    addWithCuda(c.data(), a.data(), b.data(), arraySize);

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);
  } catch ( const cu::cuda_error& er ) {
    fprintf(stderr, "%s : %s\n", er.name(), er.what());
  }

   // cudaDeviceReset must be called before exiting in order for profiling and
  // tracing tools such as Nsight and Visual Profiler to show complete traces.
  cudaError_t cudaStatus = cudaDeviceReset();
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaDeviceReset failed!");
    return 1;
  }

}

// Helper function for using CUDA to add vectors in parallel.
void addWithCuda(int *c, const int *a, const int *b, unsigned int size) {
  cudaError_t cudaStatus;

  cu::device_allocator<int> alloc;

  // Choose which GPU to run on, change this on a multi-GPU system.
  cudaStatus = cudaSetDevice(0);
  cuda_check(cudaStatus);

  cu::device_unique_ptr<int[]> dev_a(alloc.allocate(size));
  cu::device_unique_ptr<int[]> dev_b(alloc.allocate(size));
  cu::device_unique_ptr<int[]> dev_c(alloc.allocate(size));

  cudaStatus = cudaMemcpyAsync(dev_a.get(), a, size*sizeof(int), cudaMemcpyHostToDevice);
  cuda_check(cudaStatus);
  cudaStatus = cudaMemcpyAsync(dev_b.get(), b, size*sizeof(int), cudaMemcpyHostToDevice);
  cuda_check(cudaStatus);

  // Launch a kernel on the GPU with one thread for each element.
  addKernel<<<1, size>>>(dev_c.get(), dev_a.get(), dev_b.get());

  // Check for any errors launching the kernel
  cudaStatus = cudaGetLastError();
  cuda_check(cudaStatus);
    
  cudaStatus = cudaMemcpyAsync(c, dev_c.get(), size*sizeof(int), cudaMemcpyDeviceToHost);
  cuda_check(cudaStatus);

  // cudaDeviceSynchronize waits for the kernel to finish, and returns
  // any errors encountered during the launch.
  cudaStatus = cudaDeviceSynchronize();
  cuda_check(cudaStatus);

}