270 lines
9.2 KiB
Python
270 lines
9.2 KiB
Python
|
# Sample source code from the Tutorial Introduction in the documentation.
|
||
|
|
||
|
import hashlib
|
||
|
import numpy as np
|
||
|
import math
|
||
|
import pycuda.driver as cuda
|
||
|
from pycuda.driver import Stream
|
||
|
import pycuda.autoinit
|
||
|
from pycuda.compiler import SourceModule
|
||
|
import pycuda.gpuarray as gpuarray
|
||
|
import random
|
||
|
|
||
|
'''
|
||
|
a = numpy.random.randn(4,4)
|
||
|
|
||
|
a = a.astype(numpy.float32)
|
||
|
|
||
|
a_gpu = cuda.mem_alloc(a.size * a.dtype.itemsize)
|
||
|
|
||
|
cuda.memcpy_htod(a_gpu, a)
|
||
|
|
||
|
mod = SourceModule("""
|
||
|
__global__ void doublify(float *a)
|
||
|
{
|
||
|
int idx = threadIdx.x + threadIdx.y*4;
|
||
|
a[idx] *= 2;
|
||
|
}
|
||
|
""")
|
||
|
|
||
|
func = mod.get_function("doublify")
|
||
|
func(a_gpu, block=(4,4,1))
|
||
|
|
||
|
a_doubled = numpy.empty_like(a)
|
||
|
cuda.memcpy_dtoh(a_doubled, a_gpu)
|
||
|
print("original array:")
|
||
|
print(a)
|
||
|
print("doubled with kernel:")
|
||
|
print(a_doubled)
|
||
|
|
||
|
# alternate kernel invocation -------------------------------------------------
|
||
|
|
||
|
func(cuda.InOut(a), block=(4, 4, 1))
|
||
|
print("doubled with InOut:")
|
||
|
print(a)
|
||
|
|
||
|
# part 2 ----------------------------------------------------------------------
|
||
|
|
||
|
a_gpu = gpuarray.to_gpu(numpy.random.randn(4,4).astype(numpy.float32))
|
||
|
a_doubled = (2*a_gpu).get()
|
||
|
|
||
|
print("original array:")
|
||
|
print(a_gpu)
|
||
|
print("doubled with gpuarray:")
|
||
|
print(a_doubled)
|
||
|
'''
|
||
|
|
||
|
N = 8
|
||
|
M = 2
|
||
|
sample_size = 64
|
||
|
|
||
|
def encode(v, offset):
|
||
|
byte_values = []
|
||
|
for i in range(0, math.ceil(N / 8)):
|
||
|
x = 0
|
||
|
for j in range(0, 8):
|
||
|
index = i * 8 + j
|
||
|
if offset + index >= len(v):
|
||
|
break
|
||
|
x <<= 1
|
||
|
x |= int(v[offset + index])
|
||
|
byte_values.append(x)
|
||
|
return bytearray(x)
|
||
|
|
||
|
def sha(v, offset):
|
||
|
global M
|
||
|
x = encode(v, offset)
|
||
|
m = hashlib.sha256()
|
||
|
m.update(x)
|
||
|
result = m.digest()
|
||
|
return result[0] % M
|
||
|
|
||
|
def create_program_r(model, output_var):
|
||
|
global N, M
|
||
|
(constant, scalars, child) = model
|
||
|
program = 'int ' + output_var + ' = ' + str(constant) + ';\n'
|
||
|
scalars_part = ' + '.join([str(scalars[i]) + ' * x[gid * ' + str(N) + ' + ' + str(i) + ']' for i in range(0, len(scalars)) if scalars[i] > 0])
|
||
|
if len(scalars_part) > 0:
|
||
|
program += output_var + ' += ' + scalars_part + ';\n'
|
||
|
if not child is None:
|
||
|
left_output = output_var + '0'
|
||
|
right_output = output_var + '1'
|
||
|
(left, right) = child
|
||
|
program += create_program_r(left, left_output)
|
||
|
program += create_program_r(right, right_output)
|
||
|
program += output_var + ' += ' + left_output + ' * ' + right_output + ';\n'
|
||
|
program += output_var + ' %= ' + str(M) + ';\n'
|
||
|
return program
|
||
|
|
||
|
def create_program(model, name, offset):
|
||
|
output_var = 'output'
|
||
|
program = '__global__ void ' + name + '(const int *x, int *out) {\n'
|
||
|
program += 'int gid = threadIdx.x + blockIdx.x * blockDim.x;\n'
|
||
|
program += create_program_r(model, output_var)
|
||
|
program += 'out[' + str(offset) + ' + gid] = ' + output_var + ';\n'
|
||
|
program += '}\n'
|
||
|
return program
|
||
|
|
||
|
def distances_program():
|
||
|
global N, sample_size
|
||
|
program = "__global__ void p(const int *x, double *distances) {\n"
|
||
|
program += " int gid = threadIdx.x + blockIdx.x * blockDim.x;\n"
|
||
|
program += " int i = gid / " + str(sample_size) + ";\n"
|
||
|
program += " int j = gid % " + str(sample_size) + ";\n"
|
||
|
program += " if (i == j) {\n"
|
||
|
program += " distances[gid] = 0;\n"
|
||
|
program += " return;\n"
|
||
|
program += " }\n"
|
||
|
program += " int distance = 0;\n"
|
||
|
program += " for (int k = 0; k < " + str(N) + "; k++) {\n"
|
||
|
program += " distance += x[i * " + str(N) + " + k] ^ x[j * " + str(N) + " + k];\n"
|
||
|
program += " }\n"
|
||
|
program += " distances[gid] = pow((double)2.0, (double)-distance);\n"
|
||
|
program += "}\n"
|
||
|
return program
|
||
|
|
||
|
def coherence_program():
|
||
|
global sample_size
|
||
|
program = "__global__ void p(const int *y, const int *z, const double *distances, double *coherences) {\n"
|
||
|
program += " int gid = threadIdx.x + blockIdx.x * blockDim.x;\n"
|
||
|
program += " double numerator = 0;\n"
|
||
|
program += " double denominator = 0;\n"
|
||
|
program += " for (int i = 0; i < " + str(sample_size) + "; i++) {\n"
|
||
|
program += " int p = z[i] ^ y[gid * " + str(sample_size) + " + i];\n"
|
||
|
program += " for (int j = 0; j < " + str(sample_size) + "; j++) {\n"
|
||
|
program += " int q = z[j] ^ y[gid * " + str(sample_size) + " + j];\n"
|
||
|
program += " double distance = distances[i * " + str(sample_size) + " + j];\n"
|
||
|
program += " denominator += distance;\n"
|
||
|
program += " if (p == q) {\n"
|
||
|
program += " numerator += distance;\n"
|
||
|
program += " }\n"
|
||
|
program += " }\n"
|
||
|
program += " }\n"
|
||
|
program += " coherences[gid] = numerator / denominator;\n"
|
||
|
program += "}\n"
|
||
|
return program
|
||
|
|
||
|
def random_sample():
|
||
|
global N, sample_size
|
||
|
x = np.zeros((N * sample_size,)).astype(np.int32)
|
||
|
for i in range(0, len(x)):
|
||
|
x[i] = random.randint(0, 1)
|
||
|
return x
|
||
|
|
||
|
def clone_model(model, p_mutation):
|
||
|
global N, M
|
||
|
|
||
|
p_constant = p_mutation * random.random()
|
||
|
p_flip = p_mutation * random.random()
|
||
|
p_add_child = p_mutation * random.random()
|
||
|
p_drop_child = p_mutation * random.random()
|
||
|
|
||
|
(constant, xors, child) = model
|
||
|
if random.random() < p_constant:
|
||
|
constant += random.randint(0, M - 1)
|
||
|
constant %= M
|
||
|
clone_xors = np.zeros((N,))
|
||
|
np.copyto(clone_xors, xors)
|
||
|
for i in range(0, N):
|
||
|
if random.random() < p_flip:
|
||
|
offset = 1 if M == 2 else random.randint(1, M - 1)
|
||
|
clone_xors[i] += offset
|
||
|
clone_xors[i] %= M
|
||
|
if child is None:
|
||
|
if random.random() < p_add_child:
|
||
|
left = random_child(p_mutation)
|
||
|
right = random_child(p_mutation)
|
||
|
return (constant, clone_xors, (left, right))
|
||
|
return (constant, clone_xors, None)
|
||
|
if random.random() < p_drop_child:
|
||
|
return (constant, clone_xors, None)
|
||
|
(left, right) = child
|
||
|
clone_left = clone_model(left, p_mutation)
|
||
|
clone_right = clone_model(right, p_mutation)
|
||
|
return (constant, clone_xors, (clone_left, clone_right))
|
||
|
|
||
|
def random_child(p_mutation):
|
||
|
global N, M
|
||
|
constant = random.randint(0, M - 1)
|
||
|
xors = np.zeros((N,))
|
||
|
|
||
|
p_flip = p_mutation * random.random()
|
||
|
p_child = p_mutation * random.random()
|
||
|
|
||
|
index = random.randint(0, N - 1)
|
||
|
xors[index] = 1 if M == 2 else random.randint(1, M - 1)
|
||
|
for i in range(0, N):
|
||
|
if i != index and random.random() < p_flip:
|
||
|
xors[i] = 1 if M == 2 else random.randint(1, M - 1)
|
||
|
if random.random() < p_child:
|
||
|
left = random_child(p_mutation * random.random())
|
||
|
right = random_child(p_mutation * random.random())
|
||
|
return (constant, xors, (left, right))
|
||
|
return (constant, xors, None)
|
||
|
|
||
|
def null_candidate():
|
||
|
global N
|
||
|
return (0, np.zeros((N,)), None)
|
||
|
|
||
|
def main():
|
||
|
global N, M, sample_size
|
||
|
epochs = 1000
|
||
|
num_survivors = 100
|
||
|
num_offspring = 10
|
||
|
num_candidates = num_survivors + num_survivors * num_offspring
|
||
|
block_size = 1
|
||
|
|
||
|
x = random_sample()
|
||
|
z = np.zeros((sample_size,)).astype(np.int32)
|
||
|
coherences = np.zeros((num_candidates,)).astype(np.float64)
|
||
|
candidates = [null_candidate() for _ in range(0, num_candidates)]
|
||
|
|
||
|
for i in range(0, sample_size):
|
||
|
z[i] = sha(x, N * i)
|
||
|
# print(z)
|
||
|
|
||
|
x_gpu = cuda.mem_alloc(4 * N * sample_size)
|
||
|
cuda.memcpy_htod(x_gpu, x)
|
||
|
z_gpu = cuda.mem_alloc(4 * sample_size)
|
||
|
cuda.memcpy_htod(z_gpu, z)
|
||
|
distances_gpu = cuda.mem_alloc(8 * sample_size * sample_size)
|
||
|
coherences_gpu = cuda.mem_alloc(8 * num_candidates)
|
||
|
outputs_gpu = cuda.mem_alloc(4 * sample_size * num_candidates)
|
||
|
|
||
|
distances_kernel = SourceModule(distances_program()).get_function('p')
|
||
|
coherence_kernel = SourceModule(coherence_program()).get_function('p')
|
||
|
|
||
|
distances_kernel(x_gpu, distances_gpu, block=(block_size, 1, 1), grid=(int(sample_size * sample_size / block_size), 1, 1))
|
||
|
# distances = np.zeros((sample_size,sample_size)).astype(np.double)
|
||
|
# cuda.memcpy_dtoh(distances, distances_gpu)
|
||
|
# print(distances)
|
||
|
|
||
|
for epoch in range(0, epochs):
|
||
|
mod = SourceModule('\n'.join([create_program(candidates[i], 'k' + str(i), i * sample_size) for i in range(0, num_candidates)]))
|
||
|
stream = Stream()
|
||
|
for i in range(0, num_candidates):
|
||
|
f = mod.get_function('k' + str(i))
|
||
|
f(x_gpu, outputs_gpu, stream=stream, block=(block_size, 1, 1), grid=(int(sample_size / block_size), 1, 1))
|
||
|
stream.synchronize()
|
||
|
|
||
|
# outputs = np.zeros((sample_size * num_candidates,)).astype(np.int32)
|
||
|
# cuda.memcpy_dtoh(outputs, outputs_gpu)
|
||
|
# print(outputs)
|
||
|
|
||
|
coherence_kernel(outputs_gpu, z_gpu, distances_gpu, coherences_gpu, block=(block_size, 1, 1), grid=(int(num_candidates / block_size), 1, 1))
|
||
|
cuda.memcpy_dtoh(coherences, coherences_gpu)
|
||
|
|
||
|
top_n = sorted(range(len(coherences)), key=lambda i: coherences[i])[-num_survivors:]
|
||
|
survivors = [candidates[index] for index in top_n]
|
||
|
print(epoch, coherences[top_n[-1]])
|
||
|
|
||
|
for i in range(0, num_survivors):
|
||
|
candidate = survivors[i]
|
||
|
candidates[i] = candidate
|
||
|
for j in range(0, num_offspring):
|
||
|
index = num_survivors + j * num_survivors + i
|
||
|
candidates[index] = clone_model(candidate, random.random())
|
||
|
|
||
|
if __name__ == "__main__":
|
||
|
main()
|