// Element-wise C[i] = A[i]*A[i] + B[i] computed on the GPU by a single thread
// block, with timestamps printed after each phase.
//
// Usage: prog [arraydim [gputhreads]]
#include <stdio.h>
#include <stdlib.h>

#include "timestamp.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Evaluate a CUDA runtime call; on failure print where it happened together
// with the runtime's error string, then terminate the program.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err_ = (call);                              \
        if (cuda_check_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(cuda_check_err_));        \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

typedef long arrayelt;

arrayelt *A, *B, *C;               // input arrays A, B and output array C
long arraydim = 12345678,          // number of elements per array
     arraysize;                    // size of one array in bytes
int gputhreads = 1;                // threads in the single launched block

// Read the optional command-line overrides (argv[1] = element count,
// argv[2] = threads per block), derive the byte size of one array, and
// print the resulting configuration.
// Exits with an error message when a supplied value cannot describe a valid
// run (negative element count, or fewer than one thread).
void getparms(int argc, char* argv[]) {
    if (argc > 1)
        arraydim = atol(argv[1]);
    if (argc > 2)
        gputhreads = atoi(argv[2]);

    if (arraydim < 0) {
        fprintf(stderr, "error: array dimension must not be negative (got %ld)\n",
                arraydim);
        exit(EXIT_FAILURE);
    }
    if (gputhreads < 1) {
        fprintf(stderr, "error: GPU thread count must be at least 1 (got %d)\n",
                gputhreads);
        exit(EXIT_FAILURE);
    }

    arraysize = arraydim * sizeof(arrayelt);
    printf("arraysize = %.0fMB, GPU threads = %d\n",
           (float)arraysize / 1024 / 1024, gputhreads);
}

// Allocate `arraysize` bytes with cudaMallocHost and store the pointer in `a`.
// Terminates the program if the allocation fails.
void getarray(arrayelt*& a) {
    CUDA_CHECK(cudaMallocHost(&a, arraysize));
}

// Release an array previously obtained from getarray().
void freearray(arrayelt* a) {
    CUDA_CHECK(cudaFreeHost(a));
}

// Fill the inputs on the host: A[i] = i and B[i] = i + 1.
void initarrays(void) {
    for (long ai = 0; ai < arraydim; ai++) {
        A[ai] = ai;
        B[ai] = ai + 1;
    }
}

// Per-element computation: a*a + b.
__device__ inline arrayelt dmathalgo(const arrayelt a, const arrayelt b) {
    return (a * a + b);
}

// Kernel: each thread starts at its threadIdx.x and advances by blockDim.x,
// so the threads of ONE block together cover [0, arraydim). blockIdx is not
// used, which means launching more than one block would only repeat the same
// work in every block.
__global__ void domath(long arraydim, const arrayelt* A, const arrayelt* B,
                       arrayelt* C) {
    long index = threadIdx.x;
    long stride = blockDim.x;
    for (long ai = index; ai < arraydim; ai += stride)
        C[ai] = dmathalgo(A[ai], B[ai]);
}

int main(int argc, char* argv[]) {
    printf("hello! - using GPU, multiple threads\n");
    getparms(argc, argv);
    timestamp(NULL);

    getarray(A);
    getarray(B);
    getarray(C);
    timestamp("... arrays allocated");

    initarrays();
    timestamp("... arrays initialized");

    // The kernel receives the cudaMallocHost pointers directly.
    // NOTE(review): this assumes those allocations are addressable from the
    // device on the target platform (unified virtual addressing) - confirm.
    domath<<<1, gputhreads>>>(arraydim, A, B, C);
    // A launch does not return a status itself: an invalid configuration
    // (e.g. too many threads for one block) only shows up here.
    CUDA_CHECK(cudaGetLastError());
    // Wait for completion; faults raised while the kernel ran surface here.
    CUDA_CHECK(cudaDeviceSynchronize());
    timestamp("... computation done");

    freearray(A);
    freearray(B);
    freearray(C);
    timestamp("... arrays deallocated");

    printf("goodbye\n");
    return (0);
}