# Sample source code from the Tutorial Introduction in the documentation. import hashlib import numpy as np import math import pycuda.driver as cuda from pycuda.driver import Stream import pycuda.autoinit from pycuda.compiler import SourceModule import pycuda.gpuarray as gpuarray import random ''' a = numpy.random.randn(4,4) a = a.astype(numpy.float32) a_gpu = cuda.mem_alloc(a.size * a.dtype.itemsize) cuda.memcpy_htod(a_gpu, a) mod = SourceModule(""" __global__ void doublify(float *a) { int idx = threadIdx.x + threadIdx.y*4; a[idx] *= 2; } """) func = mod.get_function("doublify") func(a_gpu, block=(4,4,1)) a_doubled = numpy.empty_like(a) cuda.memcpy_dtoh(a_doubled, a_gpu) print("original array:") print(a) print("doubled with kernel:") print(a_doubled) # alternate kernel invocation ------------------------------------------------- func(cuda.InOut(a), block=(4, 4, 1)) print("doubled with InOut:") print(a) # part 2 ---------------------------------------------------------------------- a_gpu = gpuarray.to_gpu(numpy.random.randn(4,4).astype(numpy.float32)) a_doubled = (2*a_gpu).get() print("original array:") print(a_gpu) print("doubled with gpuarray:") print(a_doubled) ''' N = 8 M = 2 sample_size = 64 def encode(v, offset): byte_values = [] for i in range(0, math.ceil(N / 8)): x = 0 for j in range(0, 8): index = i * 8 + j if offset + index >= len(v): break x <<= 1 x |= int(v[offset + index]) byte_values.append(x) return bytearray(x) def sha(v, offset): global M x = encode(v, offset) m = hashlib.sha256() m.update(x) result = m.digest() return result[0] % M def create_program_r(model, output_var): global N, M (constant, scalars, child) = model program = 'int ' + output_var + ' = ' + str(constant) + ';\n' scalars_part = ' + '.join([str(scalars[i]) + ' * x[gid * ' + str(N) + ' + ' + str(i) + ']' for i in range(0, len(scalars)) if scalars[i] > 0]) if len(scalars_part) > 0: program += output_var + ' += ' + scalars_part + ';\n' if not child is None: left_output = output_var + '0' right_output = output_var + '1' (left, right) = child program += create_program_r(left, left_output) program += create_program_r(right, right_output) program += output_var + ' += ' + left_output + ' * ' + right_output + ';\n' program += output_var + ' %= ' + str(M) + ';\n' return program def create_program(model, name, offset): output_var = 'output' program = '__global__ void ' + name + '(const int *x, int *out) {\n' program += 'int gid = threadIdx.x + blockIdx.x * blockDim.x;\n' program += create_program_r(model, output_var) program += 'out[' + str(offset) + ' + gid] = ' + output_var + ';\n' program += '}\n' return program def distances_program(): global N, sample_size program = "__global__ void p(const int *x, double *distances) {\n" program += " int gid = threadIdx.x + blockIdx.x * blockDim.x;\n" program += " int i = gid / " + str(sample_size) + ";\n" program += " int j = gid % " + str(sample_size) + ";\n" program += " if (i == j) {\n" program += " distances[gid] = 0;\n" program += " return;\n" program += " }\n" program += " int distance = 0;\n" program += " for (int k = 0; k < " + str(N) + "; k++) {\n" program += " distance += x[i * " + str(N) + " + k] ^ x[j * " + str(N) + " + k];\n" program += " }\n" program += " distances[gid] = pow((double)2.0, (double)-distance);\n" program += "}\n" return program def coherence_program(): global sample_size program = "__global__ void p(const int *y, const int *z, const double *distances, double *coherences) {\n" program += " int gid = threadIdx.x + blockIdx.x * blockDim.x;\n" program += " double numerator = 0;\n" program += " double denominator = 0;\n" program += " for (int i = 0; i < " + str(sample_size) + "; i++) {\n" program += " int p = z[i] ^ y[gid * " + str(sample_size) + " + i];\n" program += " for (int j = 0; j < " + str(sample_size) + "; j++) {\n" program += " int q = z[j] ^ y[gid * " + str(sample_size) + " + j];\n" program += " double distance = distances[i * " + str(sample_size) + " + j];\n" program += " denominator += distance;\n" program += " if (p == q) {\n" program += " numerator += distance;\n" program += " }\n" program += " }\n" program += " }\n" program += " coherences[gid] = numerator / denominator;\n" program += "}\n" return program def random_sample(): global N, sample_size x = np.zeros((N * sample_size,)).astype(np.int32) for i in range(0, len(x)): x[i] = random.randint(0, 1) return x def clone_model(model, p_mutation): global N, M p_constant = p_mutation * random.random() p_flip = p_mutation * random.random() p_add_child = p_mutation * random.random() p_drop_child = p_mutation * random.random() (constant, xors, child) = model if random.random() < p_constant: constant += random.randint(0, M - 1) constant %= M clone_xors = np.zeros((N,)) np.copyto(clone_xors, xors) for i in range(0, N): if random.random() < p_flip: offset = 1 if M == 2 else random.randint(1, M - 1) clone_xors[i] += offset clone_xors[i] %= M if child is None: if random.random() < p_add_child: left = random_child(p_mutation) right = random_child(p_mutation) return (constant, clone_xors, (left, right)) return (constant, clone_xors, None) if random.random() < p_drop_child: return (constant, clone_xors, None) (left, right) = child clone_left = clone_model(left, p_mutation) clone_right = clone_model(right, p_mutation) return (constant, clone_xors, (clone_left, clone_right)) def random_child(p_mutation): global N, M constant = random.randint(0, M - 1) xors = np.zeros((N,)) p_flip = p_mutation * random.random() p_child = p_mutation * random.random() index = random.randint(0, N - 1) xors[index] = 1 if M == 2 else random.randint(1, M - 1) for i in range(0, N): if i != index and random.random() < p_flip: xors[i] = 1 if M == 2 else random.randint(1, M - 1) if random.random() < p_child: left = random_child(p_mutation * random.random()) right = random_child(p_mutation * random.random()) return (constant, xors, (left, right)) return (constant, xors, None) def null_candidate(): global N return (0, np.zeros((N,)), None) def main(): global N, M, sample_size epochs = 1000 num_survivors = 100 num_offspring = 10 num_candidates = num_survivors + num_survivors * num_offspring block_size = 1 x = random_sample() z = np.zeros((sample_size,)).astype(np.int32) coherences = np.zeros((num_candidates,)).astype(np.float64) candidates = [null_candidate() for _ in range(0, num_candidates)] for i in range(0, sample_size): z[i] = sha(x, N * i) # print(z) x_gpu = cuda.mem_alloc(4 * N * sample_size) cuda.memcpy_htod(x_gpu, x) z_gpu = cuda.mem_alloc(4 * sample_size) cuda.memcpy_htod(z_gpu, z) distances_gpu = cuda.mem_alloc(8 * sample_size * sample_size) coherences_gpu = cuda.mem_alloc(8 * num_candidates) outputs_gpu = cuda.mem_alloc(4 * sample_size * num_candidates) distances_kernel = SourceModule(distances_program()).get_function('p') coherence_kernel = SourceModule(coherence_program()).get_function('p') distances_kernel(x_gpu, distances_gpu, block=(block_size, 1, 1), grid=(int(sample_size * sample_size / block_size), 1, 1)) # distances = np.zeros((sample_size,sample_size)).astype(np.double) # cuda.memcpy_dtoh(distances, distances_gpu) # print(distances) for epoch in range(0, epochs): mod = SourceModule('\n'.join([create_program(candidates[i], 'k' + str(i), i * sample_size) for i in range(0, num_candidates)])) stream = Stream() for i in range(0, num_candidates): f = mod.get_function('k' + str(i)) f(x_gpu, outputs_gpu, stream=stream, block=(block_size, 1, 1), grid=(int(sample_size / block_size), 1, 1)) stream.synchronize() # outputs = np.zeros((sample_size * num_candidates,)).astype(np.int32) # cuda.memcpy_dtoh(outputs, outputs_gpu) # print(outputs) coherence_kernel(outputs_gpu, z_gpu, distances_gpu, coherences_gpu, block=(block_size, 1, 1), grid=(int(num_candidates / block_size), 1, 1)) cuda.memcpy_dtoh(coherences, coherences_gpu) top_n = sorted(range(len(coherences)), key=lambda i: coherences[i])[-num_survivors:] survivors = [candidates[index] for index in top_n] print(epoch, coherences[top_n[-1]]) for i in range(0, num_survivors): candidate = survivors[i] candidates[i] = candidate for j in range(0, num_offspring): index = num_survivors + j * num_survivors + i candidates[index] = clone_model(candidate, random.random()) if __name__ == "__main__": main()