#include #include #include "timestamp.h" #include "cuda_runtime.h" #include "device_launch_parameters.h" typedef long arrayelt; arrayelt *A, *B, *C; long arraydim = 12345678, arraysize; int gputhreads = 1, gpublocks = 0; enum { mem_host, mem_managed, mem_sep } gpumem; void getparms(int argc, char* argv[]) { if (argc > 1) arraydim = atol(argv[1]); if (argc > 2) gputhreads = atoi(argv[2]); if (argc > 3) gpublocks = atoi(argv[3]); if (argc > 4) switch (argv[4][0]) { default: case 'h': gpumem = mem_host; break; case 'm': gpumem = mem_managed; break; case 's': gpumem = mem_sep; break; } arraysize = arraydim * sizeof(arrayelt); if (gpublocks == 0) gpublocks = arraydim / gputhreads; printf("arraysize = %.0fMB, GPU grid = %d x %d, %s memory\n", (float)arraysize / 1024 / 1024, gpublocks, gputhreads, (gpumem == mem_host ? "host" : (gpumem == mem_managed ? "managed" : "separate"))); } void gethostarray(arrayelt*& a) { if (gpumem == mem_host) cudaMallocHost(&a, arraysize); else if (gpumem == mem_managed) cudaMallocManaged(&a, arraysize); else a = (arrayelt*)malloc(arraysize); } void freehostarray(arrayelt* a) { if (gpumem == mem_host) cudaFreeHost(&a); else if (gpumem == mem_managed) cudaFree(&a); else free(a); } void getdevarray(arrayelt*& a) { cudaMalloc(&a, arraysize); } void freedevarray(arrayelt* a) { cudaFree(a); } void copyhosttodev(arrayelt* a, arrayelt* da) { cudaMemcpy(da, a, arraysize, cudaMemcpyHostToDevice); } void copydevtohost(arrayelt* da, arrayelt* a) { cudaMemcpy(a, da, arraysize, cudaMemcpyDeviceToHost); } void initarrays(void) { for (long ai = 0; ai < arraydim; ai++) { A[ai] = ai; B[ai] = ai + 1; } } void setupsourcearrays(arrayelt*& dA, arrayelt*& dB, arrayelt*& dC) { if (gpumem == mem_sep) { getdevarray(dA); getdevarray(dB); getdevarray(dC); timestamp("... device arrays allocated"); copyhosttodev(A, dA); copyhosttodev(B, dB); timestamp("... source arrays copied to device"); freehostarray(A); freehostarray(B); timestamp("... source arrays deallocated"); } else { // mem_host or mem_managed gethostarray(C); timestamp("... destination array allocated"); dA = A; dB = B; dC = C; } } void setupdestarrays(arrayelt* dA, arrayelt* dB, arrayelt* dC) { if (gpumem == mem_sep) { gethostarray(C); timestamp("... destination array allocated"); copydevtohost(dC, C); timestamp("... destination array copied to host"); freedevarray(dA); freedevarray(dB); freedevarray(dC); timestamp("... device arrays deallocated"); } else { // mem_host or mem_managed freehostarray(A); freehostarray(B); timestamp("... source arrays deallocated"); } } __device__ inline arrayelt mathalgo(const arrayelt a, const arrayelt b) { return (a * a + b); } __global__ void domath(long arraydim, arrayelt* A, arrayelt* B, arrayelt* C) { long index = blockIdx.x * blockDim.x + threadIdx.x; long stride = gridDim.x * blockDim.x; for (long ai = index; ai < arraydim; ai += stride) C[ai] = mathalgo(A[ai], B[ai]); } int main(int argc, char* argv[]) { arrayelt *dA, * dB, * dC; printf("hello! - using GPU, grid of threads, host|managed|separate memory\n"); getparms(argc, argv); timestamp(NULL); gethostarray(A); gethostarray(B); timestamp("... source arrays allocated"); initarrays(); timestamp("... source arrays initialized"); setupsourcearrays(dA, dB, dC); domath << < gpublocks, gputhreads >> > (arraydim, dA, dB, dC); cudaDeviceSynchronize(); timestamp("... computation done"); setupdestarrays(dA, dB, dC); freehostarray(C); timestamp("... destination array deallocated"); printf("goodbye\n"); return (0); }