#include #include #include "timestamp.h" #include "cuda_runtime.h" #include "device_launch_parameters.h" typedef long arrayelt; arrayelt *A, *B, *C; long arraydim = 12345678, arraysize; int gputhreads = 1, gpublocks = 0, gpustreams = 1; enum { mem_host, mem_managed, mem_sep } gpumem; void getparms(int argc, char* argv[]) { if (argc > 1) arraydim = atol(argv[1]); if (argc > 2) gputhreads = atoi(argv[2]); if (argc > 3) gpublocks = atoi(argv[3]); if (argc > 4) switch (argv[4][0]) { default: case 'h': gpumem = mem_host; break; case 'm': gpumem = mem_managed; break; case 's': gpumem = mem_sep; break; } if (argc > 5) gpustreams = atoi(argv[5]); arraysize = arraydim * sizeof(arrayelt); if (gpublocks == 0) gpublocks = arraydim / gpustreams / gputhreads; printf("arraysize = %.0fMB, GPU grid = %d x %d, %s memory, %d streams\n", (float)arraysize / 1024 / 1024, gpublocks, gputhreads, (gpumem == mem_host ? "host" : (gpumem == mem_managed ? "managed" : "separate")), gpustreams); } void gethostarray(arrayelt*& a) { if (gpumem == mem_host) cudaMallocHost(&a, arraysize); else if (gpumem == mem_managed) cudaMallocManaged(&a, arraysize); else a = (arrayelt*)malloc(arraysize); } void freehostarray(arrayelt* a) { if (gpumem == mem_host) cudaFreeHost(&a); else if (gpumem == mem_managed) cudaFree(&a); else free(a); } void getdevarray(arrayelt*& a) { cudaMalloc(&a, arraysize); } void freedevarray(arrayelt* a) { cudaFree(a); } void copyhosttodev(arrayelt* a, arrayelt* da) { cudaMemcpy(da, a, arraysize, cudaMemcpyHostToDevice); } void copydevtohost(arrayelt* da, arrayelt* a) { cudaMemcpy(a, da, arraysize, cudaMemcpyDeviceToHost); } void initarrays(void) { for (long ai = 0; ai < arraydim; ai++) { A[ai] = ai; B[ai] = ai + 1; } } void setupsourcearrays(arrayelt *&dA, arrayelt *&dB, arrayelt *&dC) { if (gpumem == mem_sep) { getdevarray(dA); getdevarray(dB); getdevarray(dC); timestamp("... device arrays allocated"); copyhosttodev(A, dA); copyhosttodev(B, dB); timestamp("... source arrays copied to device"); freehostarray(A); freehostarray(B); timestamp("... source arrays deallocated"); } else { // mem_host or mem_managed gethostarray(C); timestamp("... destination array allocated"); dA = A; dB = B; dC = C; } } void setupdestarrays(arrayelt* dA, arrayelt* dB, arrayelt* dC) { if (gpumem == mem_sep) { gethostarray(C); timestamp("... destination array allocated"); copydevtohost(dC, C); timestamp("... destination array copied to host"); freedevarray(dA); freedevarray(dB); freedevarray(dC); timestamp("... device arrays deallocated"); } else { // mem_host or mem_managed freehostarray(A); freehostarray(B); timestamp("... source arrays deallocated"); } } __device__ inline arrayelt dmathalgo(const arrayelt a, const arrayelt b) { return (a * a + b); } __global__ void domath(long arraydim, arrayelt* A, arrayelt* B, arrayelt* C, int streamindex, int streamdim) { long index = streamindex * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x; long stride = streamdim * gridDim.x * blockDim.x; for (long ai = index; ai < arraydim; ai += stride) C[ai] = dmathalgo(A[ai], B[ai]); } int main(int argc, char* argv[]) { arrayelt *dA, * dB, * dC; cudaError_t cerr; cudaDeviceProp prop; printf("hello! - using GPU, grid of threads, host|managed|separate memory, multiple streams\n"); if ((cerr = cudaGetDeviceProperties(&prop, 0))) return (printf("!!! get GPU properties failed: %s\n", cudaGetErrorString(cerr)), -1); printf("GPU %s - %.0fMB, %d processors, %d threads per processor\n", prop.name, (float)prop.totalGlobalMem / 1024 / 1024, prop.multiProcessorCount, prop.maxThreadsPerMultiProcessor); getparms(argc, argv); timestamp(NULL); gethostarray(A); gethostarray(B); timestamp("... source arrays allocated"); initarrays(); timestamp("... source arrays initialized"); setupsourcearrays(dA, dB, dC); cudaStream_t cs[gpustreams]; for (int si = 0; si < gpustreams; si++) { cudaStreamCreate(&cs[si]); domath << < gpublocks, gputhreads, 0, cs[si] >> > (arraydim, dA, dB, dC, si, gpustreams); if ((cerr = cudaPeekAtLastError())) printf("GPU error : %s\n", cudaGetErrorString(cerr)); } cudaDeviceSynchronize(); for (int si = 0; si < gpustreams; si++) cudaStreamDestroy(cs[si]); timestamp("... computation done"); setupdestarrays(dA, dB, dC); freehostarray(C); timestamp("... destination array deallocated"); printf("goodbye\n"); return (0); }