#include <stdio.h>
#include <stdlib.h>
#include "timestamp.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

typedef long arrayelt;

arrayelt *A, *B, *C;
long arraydim = 12345678, arraysize;
int gputhreads = 1, gpublocks = 0;
enum { mem_host, mem_managed, mem_sep } gpumem;

void getparms(int argc, char* argv[])
{
	if (argc > 1)
		arraydim = atol(argv[1]);
	if (argc > 2)
		gputhreads = atoi(argv[2]);
	if (argc > 3)
		gpublocks = atoi(argv[3]);
	if (argc > 4)
		switch (argv[4][0]) {
		default:
		case 'h':
			gpumem = mem_host; break;
		case 'm':
			gpumem = mem_managed; break;
		case 's':
			gpumem = mem_sep; break;
		}
	arraysize = arraydim * sizeof(arrayelt);
	if (gpublocks == 0)
		gpublocks = arraydim / gputhreads;
	printf("arraysize = %.0fMB, GPU grid = %d x %d, %s memory\n", (float)arraysize / 1024 / 1024, gpublocks, gputhreads,
		(gpumem == mem_host ? "host" : (gpumem == mem_managed ? "managed" : "separate")));
}

void gethostarray(arrayelt*& a)
{
	if (gpumem == mem_host)
		cudaMallocHost(&a, arraysize);
	else if (gpumem == mem_managed)
		cudaMallocManaged(&a, arraysize);
	else
		a = (arrayelt*)malloc(arraysize);
}

void freehostarray(arrayelt* a)
{
	if (gpumem == mem_host)
		cudaFreeHost(&a);
	else if (gpumem == mem_managed)
		cudaFree(&a);
	else
		free(a);
}

void getdevarray(arrayelt*& a)
{
	cudaMalloc(&a, arraysize);
}

void freedevarray(arrayelt* a)
{
	cudaFree(a);
}

void copyhosttodev(arrayelt* a, arrayelt* da)
{
	cudaMemcpy(da, a, arraysize, cudaMemcpyHostToDevice);
}

void copydevtohost(arrayelt* da, arrayelt* a)
{
	cudaMemcpy(a, da, arraysize, cudaMemcpyDeviceToHost);
}

void initarrays(void)
{
	for (long ai = 0; ai < arraydim; ai++) {
		A[ai] = ai;
		B[ai] = ai + 1;
	}
}

void setupsourcearrays(arrayelt*& dA, arrayelt*& dB, arrayelt*& dC)
{
	if (gpumem == mem_sep) {
		getdevarray(dA); getdevarray(dB); getdevarray(dC);
		timestamp("... device arrays allocated");

		copyhosttodev(A, dA); copyhosttodev(B, dB);
		timestamp("... source arrays copied to device");

		freehostarray(A); freehostarray(B);
		timestamp("... source arrays deallocated");
	}
	else {	// mem_host or mem_managed
		gethostarray(C);
		timestamp("... destination array allocated");

		dA = A; dB = B; dC = C;
	}
}

void setupdestarrays(arrayelt* dA, arrayelt* dB, arrayelt* dC)
{
	if (gpumem == mem_sep) {
		gethostarray(C);
		timestamp("... destination array allocated");

		copydevtohost(dC, C);
		timestamp("... destination array copied to host");

		freedevarray(dA); freedevarray(dB); freedevarray(dC);
		timestamp("... device arrays deallocated");
	}
	else {	// mem_host or mem_managed
		freehostarray(A); freehostarray(B);
		timestamp("... source arrays deallocated");
	}
}

__device__
inline arrayelt mathalgo(const arrayelt a, const arrayelt b)
{
	return (a * a + b);
}

__global__
void domath(long arraydim, arrayelt* A, arrayelt* B, arrayelt* C)
{
	long index = blockIdx.x * blockDim.x + threadIdx.x;
	long stride = gridDim.x * blockDim.x;
	for (long ai = index; ai < arraydim; ai += stride)
		C[ai] = mathalgo(A[ai], B[ai]);
}

int main(int argc, char* argv[])
{
	arrayelt *dA, * dB, * dC;

	printf("hello! - using GPU, grid of threads, host|managed|separate memory\n");

	getparms(argc, argv);
	timestamp(NULL);

	gethostarray(A); gethostarray(B);
	timestamp("... source arrays allocated");

	initarrays();
	timestamp("... source arrays initialized");

	setupsourcearrays(dA, dB, dC);

	domath << < gpublocks, gputhreads >> > (arraydim, dA, dB, dC);
	cudaDeviceSynchronize();
	timestamp("... computation done");

	setupdestarrays(dA, dB, dC);

	freehostarray(C);
	timestamp("... destination array deallocated");

	printf("goodbye\n");
	return (0);
}