#!/usr/bin/env python
# cudaRelax.py
from sys import argv
from pylab import *
from numba import *
from numbapro import cuda
from timeit import default_timer as time
import os
"""
A function to pause the program
"""
def pause():
    wait = raw_input("Press enter to continue")
"""
A function to clear the terminal screen.
"""
def clear():
    os.system('cls' if os.name=='nt' else 'clear')
"""
Instructions for CUDA:
Go to https://store.continuum.io/cshop/anaconda/
and download Anaconda (it is free).
Then, through the same website, install Anaconda Accelerate
(not free, but a free academic license is available) here:
https://store.continuum.io/cshop/academicanaconda
Follow the directions (easy install).
Enjoy! :)
"""
# tpb is the thread count per block dimension; each block operated on
# by the gpu is tpb x tpb threads
tpb = 16
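# Each block therefore runs tpb * tpb = 256 threads, comfortably under
# the per-block thread limit (1024 on most CUDA devices).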
"""
The function get_max is a basic function for determining the
larger of two values. While this is unnecessary in plain Python,
Python's built-in functions and descriptors do not work inside
CUDA code, so they must be rewritten.
We see that this is a CUDA function from the cuda.jit
"just in time" compiler decorator at the beginning of the
definition.
This function returns a double-precision (8-byte) float and takes
two double-precision floats as arguments; that is what f8 means.
device=True indicates that this is a function to be computed
on the gpu device as opposed to the cpu host.
inline=True forces the function to be inlined in the CUDA kernel.
"""
@cuda.jit(f8(f8, f8), device=True, inline=True)
def get_max(a, b):
    if a > b: return a
    else: return b
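# Because get_max is compiled as a device function it can only be called
# from inside a kernel or another device function, never from host code.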
"""
This is the main CUDA kernel.
CUDA operation must be initialized by calling a CUDA kernel,
the kernel is a function definition that operates only on the
CUDA device.
A CUDA kernel cannot return anything.
This is declared to the system as a CUDA kernel through the
jit argument given prior to the function definition.
You will notice that all passed in variables begin with 'd_'
this is a naming convention in order to keep track of variables
on the 'd'evice as compared to variables on the 'h'ost.
CUDA kernels are to be written is a serial context.
When invoked the kernel launches on N threads in the gpu
each thread uses the code serially and only when combined
with all other threads does it become parallel.
"""
@cuda.jit(void(f8[:,:], f8[:,:], f8[:,:], f8[:,:]))
def c_relaxer(d_present, d_next, d_fixed, d_error):
    # Declare a shared array to store the error values (the changes)
    err_sm = cuda.shared.array((tpb,tpb), dtype=f8)
"""
CUDA threads inherently are able to keep track of their
positions in the thread block, essentially a self aware
array of CUDA threads.
CUDA threads come standard with 3-dimensional ID calls,
we can use one, two or all three at will.
These calls assign variables to the thread locations,
in this case we have x and y directions for our 2-dimensional
arrays.
"""
    ty = cuda.threadIdx.x
    tx = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
"""
CUDA blocks also have their own tracking calls, one 3-dim
vector for the block ID, and one 3-dim vector for the block
dimensions.
We establish the total size of the grid with these equations
(for a 2 dimensional array)
x = threadIdx.x + blockIdx.x * blockDim.x
y = threadIdx.y + blockIdx.y * blockDim.y
These formulas are used so frequently that a shortcut was made:
cuda.grid('number of dimensions')
"""
    i,j = cuda.grid(2)
    # Find the shape of the array to be calculated.
    n = d_present.shape[0]
    m = d_present.shape[1]
    err_sm[ty,tx] = 0
    # Stay within the edges
    if j >= 1 and j < n-1 and i >= 1 and i < m-1:
        # If the corresponding fixed value is zero then calculate the new value
        if d_fixed[j,i] == 0:
            d_next[j,i] = 0.25 * (d_present[j,i+1]+d_present[j,i-1]+d_present[j+1,i]+d_present[j-1,i])
            # Record the absolute change so the max-reduction below cannot
            # miss a large negative change
            err_sm[ty,tx] = abs(d_next[j,i] - d_present[j,i])
        else:
            d_next[j,i] = d_present[j,i]
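    # The update above is one Jacobi relaxation step for Laplace's equation:
    # each free interior point becomes the average of its four neighbors.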
"""
CUDA threads are not required to run an any specific order. In fact
CUDA guarantees that they will be executed out of order, this poses
a problem when trying to use data from the same locations.
syncthreads acts as a barrier for the threads, the GPU will not proceed
to the next commands until all previous commands on all threads have been
completed.
"""
cuda.syncthreads()
"""
This chunk of code comes directly from an example I found online,
I am not 100% sure what they are specifically for but I believe
that they are pulling out the largest error/change in each thread
block and assigning it to the error grid.
"""
    # max-reduce err_sm vertically
    t = tpb//2
    while t > 0:
        if ty < t:
            err_sm[ty,tx] = get_max(err_sm[ty,tx], err_sm[ty+t,tx])
        t //= 2
        cuda.syncthreads()
    # max-reduce err_sm horizontally
    t = tpb//2
    while t > 0:
        if tx < t and ty == 0:
            err_sm[ty,tx] = get_max(err_sm[ty,tx], err_sm[ty,tx+t])
        t //= 2
        cuda.syncthreads()
    # Thread (0,0) writes the block's maximum change to the error grid
    if tx == 0 and ty == 0:
        d_error[by,bx] = err_sm[0,0]
def main():
    # Clear the console screen
    clear()
    # Get the filename from the command line, or prompt for it
    try:
        filename = argv[1]
    except IndexError:
        print "Please enter the name of the file with the initial array."
        filename = raw_input(">>>")
    # Open the file
    fin = open(filename, 'r')
    # Set up the initial array
    initialArray = loadtxt(fin, unpack=True)
    #print initialArray
    #print "DEBUG: ", len(initialArray)
    #print "DEBUG: ", len(initialArray[0])
    #print "DEBUG: ", initialArray[0][0]
    # Set up the h_fixed array; cells that start nonzero are held fixed
    lenX = len(initialArray[0])
    lenY = len(initialArray)
    h_fixed = zeros([lenY, lenX])
    for i in range(lenY):
        for j in range(lenX):
            h_fixed[i,j] = float(initialArray[i][j])
    # Create the h_present array
    h_present = copy(initialArray)
    # and the next array
    h_next = zeros([lenY, lenX])
    # Establish the block dimensions, a 2-D block of tpb by tpb threads
    blockdim = (tpb, tpb)
    # Establish the overall CUDA grid dimensions.
    # Declaring the dimensions relative to the array size makes the code
    # instantly scalable: move to a machine with more CUDA cores available
    # and it automatically scales up. The division rounds up so arrays
    # whose sides are not multiples of tpb are still fully covered.
    griddim = ((lenX + tpb - 1) // tpb, (lenY + tpb - 1) // tpb)
    error_grid = zeros(griddim)
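    # For example, a 256 by 256 input with tpb = 16 gives griddim = (16, 16):
    # 256 blocks in total, and error_grid holds one entry per block.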
"""
The CUDA stream is a trail of commands that get sent to the GPU.
While not necessary the stream helps to ensure that commands
get processed in the correct order.
If multiple GPUs are available you can establish multiple streams
and send commands to each GPU.
"""
    stream = cuda.stream()
    """
    The GPU can only perform functions on information that is in its
    memory, so we have to allocate that memory for what we need.
    We create usable device variables by sending our host variables
    'to_device'.
    """
    d_present = cuda.to_device(h_present, stream=stream)
    d_next = cuda.to_device(h_next, stream=stream)
    d_fixed = cuda.to_device(h_fixed, stream=stream)
    d_error_grid = cuda.to_device(error_grid, stream=stream)
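    # The copies above are queued on the stream asynchronously; the
    # commented-out synchronize below would block until all four
    # transfers have finished.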
    #stream.synchronize()
    # Set up the stopping point for the calculations
    tolerance = 1e-6
    error = 1.0
    # Using a built-in timer we keep track of how long the process
    # takes by subtracting the start time from the end time.
    start = time()
    # We also keep track of how many times the function iterates
    iter = 0
    print "Beginning GPU calculations"
    # This while loop drives the iterative GPU calculation
    while error > tolerance:
        #print iter
        """
        Here we have the kernel call for the GPU function.
        The kernel launch takes the grid dimensions, the block
        dimensions, and the stream in square brackets.
        We then pass the desired variables, device variables only,
        to the kernel.
        """
        c_relaxer[griddim, blockdim, stream](d_present, d_next, d_fixed, d_error_grid)
        # Like before, in order to do any operations on the CPU we have
        # to copy the variable back to the host; the while loop needs it
        # to test for convergence.
        d_error_grid.to_host(stream)
        stream.synchronize()
        # The kernel stores absolute changes, so the grid maximum is the
        # largest change anywhere in the array.
        error = error_grid.max()
        # This print statement confirms that the program is running;
        # removing it increases the speed further.
        #print error
        iter += 1
        # Swap the present and next arrays for the next set of calculations.
        tmp = d_present
        d_present = d_next
        d_next = tmp
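        # Only the device references are swapped here; no array data moves
        # on the GPU, so the swap is essentially free.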
    end = time()
    cudaTime = end - start
    print "%d iterations in %0.5f seconds" %(iter, cudaTime)
    d_present.copy_to_host(h_present, stream)
    stream.synchronize()
    #print h_present
    figure()
    CP = contourf(h_present)
    title("CUDA: %d by %d array, %0.5f seconds" %(lenX, lenY, cudaTime))
    # Strip the extension from the input filename to name the output image
    name = filename.split('.')[0]
    #show()
    savefig(name + '_%0.2f.png' %(cudaTime))
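    # For example, a hypothetical input file 'plate.txt' that converged in
    # 1.23 seconds produces the image 'plate_1.23.png'.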
if __name__ == '__main__':
    main()