東方算程譚

επιστημηがヨタをこく、弾幕とは無縁のCUDAなタワゴト

on-memory database の検索 : SELECT JOIN ~

SELECT WHERE の応用(?)として SELECT JOIN の試作。

SELECT * 
  FROM A, B
  JOIN A.id = B.id
  WHERE A.weight < 0.1;

ふたつの配列 A[Na], B[Nb] があって、A と B の直積すなわち A[x]. B[y] (x = 0..Na-1, y = 0..Nb-1) のすべてのペア(Na*Nb通り)に対し、A[x].id == B[y].id かつ A[x].weight < 0.1 を満たす x,y の組を列挙する、と。

直積ですから x,y に対する 二重のfor-loop で全組み合わせが作れます。CUDAではスレッド数を決定する grid, block が三次元(dim3)なので三重のloopまでならするっと書けます。

/*
   SELECT * FROM arecord, brecord
     JOIN arecord.id = brecord.id
     WHERE arecord.weight < 0.1
*/

// nvcc fake_DB.cpp --expt-extended-lambda 
// CUDA 8.0

#include <cuda_runtime.h>
#include <device_launch_parameters.h>

struct Arecord {
  int   id;
  float weight;
};

struct Brecord {
  int   id;
  float height;
};

template<typename Join, typename Where>
__global__ void kernel_select(
    const Arecord* arecords, unsigned int asize,
    const Brecord* brecords, unsigned int bsize, 
    Join jpredicate, Where wpredicate,
    int* count, uint2* indices) {
  unsigned int sx = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int sy = blockDim.y * blockIdx.y + threadIdx.y;
  if ( sx ==0 && sy == 0 ) *count = 0;
  __syncthreads();

  for ( unsigned int x = sx; x < asize; x += gridDim.x * blockDim.x ) {
    for ( unsigned int y = sy; y < bsize; y += gridDim.y * blockDim.y ) {
      if ( jpredicate(arecords[x],brecords[y]) &&
           wpredicate(arecords[x],brecords[y]) ) {
        indices[atomicAdd(count, 1)] = uint2{x,y};
      }
    }
    
  }
}

#include <random>
#include <iostream>
#include <iomanip>
#include <algorithm>
using namespace std;

int main() {
  const int Na = 100;
  const int Nb = 100;
  Arecord arecords[Na];
  Brecord brecords[Nb];

  // fill records[] with random numbers
  mt19937 gen;
  uniform_int_distribution<int> idist(1,10);
  uniform_real_distribution<float> fdist;
  generate_n(arecords, Na, [&]() { return Arecord{ idist(gen),fdist(gen) }; });
  generate_n(brecords, Nb, [&]() { return Brecord{ idist(gen),fdist(gen) }; });

  // result
  uint2 indices[Na*Nb];
  int   count;

  // allocate device-mem.
  Arecord* dev_arecords;
  cudaMalloc(&dev_arecords, Na*sizeof(Arecord));
  Brecord* dev_brecords;
  cudaMalloc(&dev_brecords, Nb*sizeof(Brecord));
  uint2* dev_indices;
  cudaMalloc(&dev_indices, Na*Nb * sizeof(uint2));
  int* dev_count;
  cudaMalloc(&dev_count, sizeof(int));

  // copy records from HOST to DEVICE
  cudaMemcpy(dev_arecords, arecords, Na*sizeof(Arecord), cudaMemcpyHostToDevice);
  cudaMemcpy(dev_brecords, brecords, Nb*sizeof(Brecord), cudaMemcpyHostToDevice);

  auto join  = [] __device__(const Arecord& arec, const Brecord& brec) { return arec.id == brec.id; };
  auto where = [] __device__(const Arecord& arec, const Brecord& brec) { return arec.weight < 0.1f; };

  dim3 grid { 2, 2 };
  dim3 block { 38, 8 };
  kernel_select<<<grid, block>>>(
    dev_arecords, Na, // FROM arecords
    dev_brecords, Nb, //     ,brecords
    join,             // JOIN arecords.id = brecords.id
    where,            // WHERE arecords.weight < 0.1
    dev_count, dev_indices); // SELECTed int-pair

  // copy result from DEVICE to HOST.
  cudaMemcpy(&count, dev_count, sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(indices, dev_indices, count*sizeof(uint2), cudaMemcpyDeviceToHost);
//*/
  for (int i = 0; i < count; ++i) {
    cout << left 
         << setw(3) << indices[i].x << ":"
         << setw(10) << brecords[indices[i].y].height << " , "
         << setw(3) << indices[i].y << ":"
         << setw(10) << arecords[indices[i].x].weight << endl;
  }
//*/
  cout << count << " records found." << endl;

  cudaFree(dev_arecords);
  cudaFree(dev_brecords);
  cudaFree(dev_count);
  cudaFree(dev_indices);
  cudaDeviceReset();

}

f:id:Episteme:20170201191910p:plain

Device-memoryの制限がよりいっそうキツくなります。Device-memoryに載せる配列がふたつになりますし、検索結果(intのペア)の格納域がNa*Nb に比例して増えますもん。