# Sample source code from the Tutorial Introduction in the documentation.

import hashlib
import numpy as np
import math
import pycuda.driver as cuda
from pycuda.driver import Stream
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
import random

'''
a = numpy.random.randn(4,4)

a = a.astype(numpy.float32)

a_gpu = cuda.mem_alloc(a.size * a.dtype.itemsize)

cuda.memcpy_htod(a_gpu, a)

mod = SourceModule("""
    __global__ void doublify(float *a)
    {
      int idx = threadIdx.x + threadIdx.y*4;
      a[idx] *= 2;
    }
    """)

func = mod.get_function("doublify")
func(a_gpu, block=(4,4,1))

a_doubled = numpy.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)
print("original array:")
print(a)
print("doubled with kernel:")
print(a_doubled)

# alternate kernel invocation -------------------------------------------------

func(cuda.InOut(a), block=(4, 4, 1))
print("doubled with InOut:")
print(a)

# part 2 ----------------------------------------------------------------------

a_gpu = gpuarray.to_gpu(numpy.random.randn(4,4).astype(numpy.float32))
a_doubled = (2*a_gpu).get()

print("original array:")
print(a_gpu)
print("doubled with gpuarray:")
print(a_doubled)
'''

N = 8
M = 2
sample_size = 64

def encode(v, offset):
    byte_values = []
    for i in range(0, math.ceil(N / 8)):
        x = 0
        for j in range(0, 8):
            index = i * 8 + j
            if offset + index >= len(v):
                break
            x <<= 1
            x |= int(v[offset + index])
        byte_values.append(x)
    return bytearray(x)

def sha(v, offset):
    global M
    x = encode(v, offset)
    m = hashlib.sha256()
    m.update(x)
    result = m.digest()
    return result[0] % M

def create_program_r(model, output_var):
    global N, M
    (constant, scalars, child) = model
    program = 'int ' + output_var + ' = ' + str(constant) + ';\n'
    scalars_part = ' + '.join([str(scalars[i]) + ' * x[gid * ' + str(N) + ' + ' + str(i) + ']' for i in range(0, len(scalars)) if scalars[i] > 0])
    if len(scalars_part) > 0:
        program += output_var + ' += ' + scalars_part + ';\n'
    if not child is None:
        left_output = output_var + '0'
        right_output = output_var + '1'
        (left, right) = child
        program += create_program_r(left, left_output)
        program += create_program_r(right, right_output)
        program += output_var + ' += ' + left_output + ' * ' + right_output + ';\n'
    program += output_var + ' %= ' + str(M) + ';\n'
    return program

def create_program(model, name, offset):
    output_var = 'output'
    program = '__global__ void ' + name + '(const int *x, int *out) {\n'
    program += 'int gid = threadIdx.x + blockIdx.x * blockDim.x;\n'
    program += create_program_r(model, output_var)
    program += 'out[' + str(offset) + ' + gid] = ' + output_var + ';\n'
    program += '}\n'
    return program

def distances_program():
    global N, sample_size
    program = "__global__ void p(const int *x, double *distances) {\n"
    program += "  int gid = threadIdx.x + blockIdx.x * blockDim.x;\n"
    program += "  int i = gid / " + str(sample_size) + ";\n"
    program += "  int j = gid % " + str(sample_size) + ";\n"
    program += "  if (i == j) {\n"
    program += "    distances[gid] = 0;\n"
    program += "    return;\n"
    program += "  }\n"
    program += "  int distance = 0;\n"
    program += "  for (int k = 0; k < " + str(N) + "; k++) {\n"
    program += "    distance += x[i * " + str(N) + " + k] ^ x[j * " + str(N) + " + k];\n"
    program += "  }\n"
    program += "  distances[gid] = pow((double)2.0, (double)-distance);\n"
    program += "}\n"
    return program

def coherence_program():
    global sample_size
    program = "__global__ void p(const int *y, const int *z, const double *distances, double *coherences) {\n"
    program += "  int gid = threadIdx.x + blockIdx.x * blockDim.x;\n"
    program += "  double numerator = 0;\n"
    program += "  double denominator = 0;\n"
    program += "  for (int i = 0; i < " + str(sample_size) + "; i++) {\n"
    program += "    int p = z[i] ^ y[gid * " + str(sample_size) + " + i];\n"
    program += "    for (int j = 0; j < " + str(sample_size) + "; j++) {\n"
    program += "      int q = z[j] ^ y[gid * " + str(sample_size) + " + j];\n"
    program += "      double distance = distances[i * " + str(sample_size) + " + j];\n"
    program += "      denominator += distance;\n"
    program += "      if (p == q) {\n"
    program += "        numerator += distance;\n"
    program += "      }\n"
    program += "    }\n"
    program += "  }\n"
    program += "  coherences[gid] = numerator / denominator;\n"
    program += "}\n"
    return program

def random_sample():
    global N, sample_size
    x = np.zeros((N * sample_size,)).astype(np.int32)
    for i in range(0, len(x)):
        x[i] = random.randint(0, 1)
    return x

def clone_model(model, p_mutation):
    global N, M

    p_constant = p_mutation * random.random()
    p_flip = p_mutation * random.random()
    p_add_child = p_mutation * random.random()
    p_drop_child = p_mutation * random.random()

    (constant, xors, child) = model
    if random.random() < p_constant:
        constant += random.randint(0, M - 1)
        constant %= M
    clone_xors = np.zeros((N,))
    np.copyto(clone_xors, xors)
    for i in range(0, N):
        if random.random() < p_flip:
            offset = 1 if M == 2 else random.randint(1, M - 1)
            clone_xors[i] += offset
            clone_xors[i] %= M
    if child is None:
        if random.random() < p_add_child:
            left = random_child(p_mutation)
            right = random_child(p_mutation)
            return (constant, clone_xors, (left, right))
        return (constant, clone_xors, None)
    if random.random() < p_drop_child:
        return (constant, clone_xors, None)
    (left, right) = child
    clone_left = clone_model(left, p_mutation)
    clone_right = clone_model(right, p_mutation)
    return (constant, clone_xors, (clone_left, clone_right))

def random_child(p_mutation):
    global N, M
    constant = random.randint(0, M - 1)
    xors = np.zeros((N,))

    p_flip = p_mutation * random.random()
    p_child = p_mutation * random.random()

    index = random.randint(0, N - 1)
    xors[index] = 1 if M == 2 else random.randint(1, M - 1)
    for i in range(0, N):
        if i != index and random.random() < p_flip:
            xors[i] = 1 if M == 2 else random.randint(1, M - 1)
    if random.random() < p_child:
        left = random_child(p_mutation * random.random())
        right = random_child(p_mutation * random.random())
        return (constant, xors, (left, right))
    return (constant, xors, None)

def null_candidate():
    global N
    return (0, np.zeros((N,)), None)

def main():
    global N, M, sample_size
    epochs = 1000
    num_survivors = 100
    num_offspring = 10
    num_candidates = num_survivors + num_survivors * num_offspring
    block_size = 1

    x = random_sample()
    z = np.zeros((sample_size,)).astype(np.int32)
    coherences = np.zeros((num_candidates,)).astype(np.float64)
    candidates = [null_candidate() for _ in range(0, num_candidates)]

    for i in range(0, sample_size):
        z[i] = sha(x, N * i)
    # print(z)

    x_gpu = cuda.mem_alloc(4 * N * sample_size)
    cuda.memcpy_htod(x_gpu, x)
    z_gpu = cuda.mem_alloc(4 * sample_size)
    cuda.memcpy_htod(z_gpu, z)
    distances_gpu = cuda.mem_alloc(8 * sample_size * sample_size)
    coherences_gpu = cuda.mem_alloc(8 * num_candidates)
    outputs_gpu = cuda.mem_alloc(4 * sample_size * num_candidates)

    distances_kernel = SourceModule(distances_program()).get_function('p')
    coherence_kernel = SourceModule(coherence_program()).get_function('p')

    distances_kernel(x_gpu, distances_gpu, block=(block_size, 1, 1), grid=(int(sample_size * sample_size / block_size), 1, 1))
    # distances = np.zeros((sample_size,sample_size)).astype(np.double)
    # cuda.memcpy_dtoh(distances, distances_gpu)
    # print(distances)

    for epoch in range(0, epochs):
        mod = SourceModule('\n'.join([create_program(candidates[i], 'k' + str(i), i * sample_size) for i in range(0, num_candidates)]))
        stream = Stream()
        for i in range(0, num_candidates):
            f = mod.get_function('k' + str(i))
            f(x_gpu, outputs_gpu, stream=stream, block=(block_size, 1, 1), grid=(int(sample_size / block_size), 1, 1))
        stream.synchronize()

        # outputs = np.zeros((sample_size * num_candidates,)).astype(np.int32)
        # cuda.memcpy_dtoh(outputs, outputs_gpu)
        # print(outputs)

        coherence_kernel(outputs_gpu, z_gpu, distances_gpu, coherences_gpu, block=(block_size, 1, 1), grid=(int(num_candidates / block_size), 1, 1))
        cuda.memcpy_dtoh(coherences, coherences_gpu)

        top_n = sorted(range(len(coherences)), key=lambda i: coherences[i])[-num_survivors:]
        survivors = [candidates[index] for index in top_n]
        print(epoch, coherences[top_n[-1]])

        for i in range(0, num_survivors):
            candidate = survivors[i]
            candidates[i] = candidate
            for j in range(0, num_offspring):
                index = num_survivors + j * num_survivors + i
                candidates[index] = clone_model(candidate, random.random())

if __name__ == "__main__":
    main()