on-memory database の検索 : SELECT JOIN ~
SELECT WHERE の応用(?)として SELECT JOIN の試作。
SELECT * FROM A, B JOIN A.id = B.id WHERE A.weight < 0.1;
ふたつの配列 A[Na], B[Nb] があって、A と B の直積すなわち A[x]. B[y] (x = 0..Na-1, y = 0..Nb-1) のすべてのペア(Na*Nb通り)に対し、A[x].id == B[y].id かつ A[x].weight < 0.1 を満たす x,y の組を列挙する、と。
直積ですから x,y に対する 二重のfor-loop で全組み合わせが作れます。CUDAではスレッド数を決定する grid, block が三次元(dim3
)なので三重のloopまでならするっと書けます。
/* SELECT * FROM arecord, brecord JOIN arecord.id = brecord.id WHERE arecord.weight < 0.1 */ // nvcc fake_DB.cpp --expt-extended-lambda // CUDA 8.0 #include <cuda_runtime.h> #include <device_launch_parameters.h> struct Arecord { int id; float weight; }; struct Brecord { int id; float height; }; template<typename Join, typename Where> __global__ void kernel_select( const Arecord* arecords, unsigned int asize, const Brecord* brecords, unsigned int bsize, Join jpredicate, Where wpredicate, int* count, uint2* indices) { unsigned int sx = blockDim.x * blockIdx.x + threadIdx.x; unsigned int sy = blockDim.y * blockIdx.y + threadIdx.y; if ( sx ==0 && sy == 0 ) *count = 0; __syncthreads(); for ( unsigned int x = sx; x < asize; x += gridDim.x * blockDim.x ) { for ( unsigned int y = sy; y < bsize; y += gridDim.y * blockDim.y ) { if ( jpredicate(arecords[x],brecords[y]) && wpredicate(arecords[x],brecords[y]) ) { indices[atomicAdd(count, 1)] = uint2{x,y}; } } } } #include <random> #include <iostream> #include <iomanip> #include <algorithm> using namespace std; int main() { const int Na = 100; const int Nb = 100; Arecord arecords[Na]; Brecord brecords[Nb]; // fill records[] with random numbers mt19937 gen; uniform_int_distribution<int> idist(1,10); uniform_real_distribution<float> fdist; generate_n(arecords, Na, [&]() { return Arecord{ idist(gen),fdist(gen) }; }); generate_n(brecords, Nb, [&]() { return Brecord{ idist(gen),fdist(gen) }; }); // result uint2 indices[Na*Nb]; int count; // allocate device-mem. Arecord* dev_arecords; cudaMalloc(&dev_arecords, Na*sizeof(Arecord)); Brecord* dev_brecords; cudaMalloc(&dev_brecords, Nb*sizeof(Brecord)); uint2* dev_indices; cudaMalloc(&dev_indices, Na*Nb * sizeof(uint2)); int* dev_count; cudaMalloc(&dev_count, sizeof(int)); // copy records from HOST to DEVICE cudaMemcpy(dev_arecords, arecords, Na*sizeof(Arecord), cudaMemcpyHostToDevice); cudaMemcpy(dev_brecords, brecords, Nb*sizeof(Brecord), cudaMemcpyHostToDevice); auto join = [] __device__(const Arecord& arec, const Brecord& brec) { return arec.id == brec.id; }; auto where = [] __device__(const Arecord& arec, const Brecord& brec) { return arec.weight < 0.1f; }; dim3 grid { 2, 2 }; dim3 block { 38, 8 }; kernel_select<<<grid, block>>>( dev_arecords, Na, // FROM arecords dev_brecords, Nb, // ,brecords join, // JOIN arecords.id = brecords.id where, // WHERE arecords.weight < 0.1 dev_count, dev_indices); // SELECTed int-pair // copy result from DEVICE to HOST. cudaMemcpy(&count, dev_count, sizeof(int), cudaMemcpyDeviceToHost); cudaMemcpy(indices, dev_indices, count*sizeof(uint2), cudaMemcpyDeviceToHost); //*/ for (int i = 0; i < count; ++i) { cout << left << setw(3) << indices[i].x << ":" << setw(10) << brecords[indices[i].y].height << " , " << setw(3) << indices[i].y << ":" << setw(10) << arecords[indices[i].x].weight << endl; } //*/ cout << count << " records found." << endl; cudaFree(dev_arecords); cudaFree(dev_brecords); cudaFree(dev_count); cudaFree(dev_indices); cudaDeviceReset(); }
Device-memoryの制限がよりいっそうキツくなります。Device-memoryに載せる配列がふたつになりますし、検索結果(int
のペア)の格納域がNa*Nb
に比例して増えますもん。