#include <stdio.h>
#include <stdlib.h>
#include "timestamp.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

typedef long arrayelt;

arrayelt *A, *B, *C;
long arraydim = 12345678, arraysize;
int gputhreads = 1, gpublocks = 0, gpustreams = 1;
enum { mem_host, mem_managed, mem_sep } gpumem;

void getparms(int argc, char* argv[])
{
	if (argc > 1)
		arraydim = atol(argv[1]);
	if (argc > 2)
		gputhreads = atoi(argv[2]);
	if (argc > 3)
		gpublocks = atoi(argv[3]);
	if (argc > 4)
		switch (argv[4][0]) {
		default:
		case 'h':
			gpumem = mem_host; break;
		case 'm':
			gpumem = mem_managed; break;
		case 's':
			gpumem = mem_sep; break;
		}
	if (argc > 5)
		gpustreams = atoi(argv[5]);
	arraysize = arraydim * sizeof(arrayelt);
	if (gpublocks == 0)
		gpublocks = arraydim / gpustreams / gputhreads;
	printf("arraysize = %.0fMB, GPU grid = %d x %d, %s memory, %d streams\n", (float)arraysize / 1024 / 1024, gpublocks, 
		gputhreads, (gpumem == mem_host ? "host" : (gpumem == mem_managed ? "managed" : "separate")), gpustreams);
}

void gethostarray(arrayelt*& a)
{
	if (gpumem == mem_host)
		cudaMallocHost(&a, arraysize);
	else if (gpumem == mem_managed)
		cudaMallocManaged(&a, arraysize);
	else
		a = (arrayelt*)malloc(arraysize);
}

void freehostarray(arrayelt* a)
{
	if (gpumem == mem_host)
		cudaFreeHost(&a);
	else if (gpumem == mem_managed)
		cudaFree(&a);
	else
		free(a);
}

void getdevarray(arrayelt*& a)
{
	cudaMalloc(&a, arraysize);
}

void freedevarray(arrayelt* a)
{
	cudaFree(a);
}

void copyhosttodev(arrayelt* a, arrayelt* da)
{
	cudaMemcpy(da, a, arraysize, cudaMemcpyHostToDevice);
}

void copydevtohost(arrayelt* da, arrayelt* a)
{
	cudaMemcpy(a, da, arraysize, cudaMemcpyDeviceToHost);
}

void initarrays(void)
{
	for (long ai = 0; ai < arraydim; ai++) {
		A[ai] = ai;
		B[ai] = ai + 1;
	}
}

void setupsourcearrays(arrayelt *&dA, arrayelt *&dB, arrayelt *&dC)
{
	if (gpumem == mem_sep) {
		getdevarray(dA); getdevarray(dB); getdevarray(dC);
		timestamp("... device arrays allocated");

		copyhosttodev(A, dA); copyhosttodev(B, dB);
		timestamp("... source arrays copied to device");

		freehostarray(A); freehostarray(B);
		timestamp("... source arrays deallocated");
	}
	else {	// mem_host or mem_managed
		gethostarray(C);
		timestamp("... destination array allocated");

		dA = A; dB = B; dC = C;
	}
}

void setupdestarrays(arrayelt* dA, arrayelt* dB, arrayelt* dC)
{
	if (gpumem == mem_sep) {
		gethostarray(C);
		timestamp("... destination array allocated");
		
		copydevtohost(dC, C);
		timestamp("... destination array copied to host");
		
		freedevarray(dA); freedevarray(dB); freedevarray(dC);
		timestamp("... device arrays deallocated");
	}
	else {	// mem_host or mem_managed
		freehostarray(A); freehostarray(B);
		timestamp("... source arrays deallocated");
	}
}

__device__
inline arrayelt dmathalgo(const arrayelt a, const arrayelt b)
{
	return (a * a + b);
}

__global__
void domath(long arraydim, arrayelt* A, arrayelt* B, arrayelt* C, int streamindex, int streamdim)
{
	long index = streamindex * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x; 
	long stride = streamdim * gridDim.x * blockDim.x;
	for (long ai = index; ai < arraydim; ai += stride)
		C[ai] = dmathalgo(A[ai], B[ai]);
}

int main(int argc, char* argv[])
{
	arrayelt *dA, * dB, * dC;
	cudaError_t cerr;
	cudaDeviceProp prop;

	printf("hello! - using GPU, grid of threads, host|managed|separate memory, multiple streams\n");

	if ((cerr = cudaGetDeviceProperties(&prop, 0)))
		return (printf("!!! get GPU properties failed: %s\n", cudaGetErrorString(cerr)), -1);
	printf("GPU %s - %.0fMB, %d processors, %d threads per processor\n", prop.name, 
		(float)prop.totalGlobalMem / 1024 / 1024, prop.multiProcessorCount, prop.maxThreadsPerMultiProcessor);

	getparms(argc, argv);
	timestamp(NULL);

	gethostarray(A); gethostarray(B);
	timestamp("... source arrays allocated");

	initarrays();
	timestamp("... source arrays initialized");

	setupsourcearrays(dA, dB, dC);

	cudaStream_t cs[gpustreams];
		
	for (int si = 0; si < gpustreams; si++) {
		cudaStreamCreate(&cs[si]);
		domath << < gpublocks, gputhreads, 0, cs[si] >> > (arraydim, dA, dB, dC, si, gpustreams);
		if ((cerr = cudaPeekAtLastError()))
			printf("GPU error : %s\n", cudaGetErrorString(cerr));
	}

	cudaDeviceSynchronize();
	for (int si = 0; si < gpustreams; si++)
		cudaStreamDestroy(cs[si]);
	timestamp("... computation done");
	
	setupdestarrays(dA, dB, dC);

	freehostarray(C);
	timestamp("... destination array deallocated");

	printf("goodbye\n");
	return (0);
}