Hw3 #425

Open · wants to merge 4 commits into base: HW3
93 changes: 93 additions & 0 deletions HW3/P3/P3.txt
@@ -0,0 +1,93 @@
Problem 3
3.1 GHz Intel Core i7
16 GB 1867 MHz DDR3
Intel Iris Graphics 6100 1536 MB

#################### Coalesced reads output ########################
coalesced reads, workgroups: 8, num_workers: 4, 0.1509724 seconds
coalesced reads, workgroups: 8, num_workers: 8, 0.0738424 seconds
coalesced reads, workgroups: 8, num_workers: 16, 0.04797616 seconds
coalesced reads, workgroups: 8, num_workers: 32, 0.02097256 seconds
coalesced reads, workgroups: 8, num_workers: 64, 0.01119976 seconds
coalesced reads, workgroups: 8, num_workers: 128, 0.00631752 seconds
coalesced reads, workgroups: 16, num_workers: 4, 0.07417096 seconds
coalesced reads, workgroups: 16, num_workers: 8, 0.0370732 seconds
coalesced reads, workgroups: 16, num_workers: 16, 0.0206708 seconds
coalesced reads, workgroups: 16, num_workers: 32, 0.01128632 seconds
coalesced reads, workgroups: 16, num_workers: 64, 0.00626792 seconds
coalesced reads, workgroups: 16, num_workers: 128, 0.0048784 seconds
coalesced reads, workgroups: 32, num_workers: 4, 0.0385776 seconds
coalesced reads, workgroups: 32, num_workers: 8, 0.01847856 seconds
coalesced reads, workgroups: 32, num_workers: 16, 0.01112848 seconds
coalesced reads, workgroups: 32, num_workers: 32, 0.00626464 seconds
coalesced reads, workgroups: 32, num_workers: 64, 0.00322904 seconds
coalesced reads, workgroups: 32, num_workers: 128, 0.00254848 seconds
coalesced reads, workgroups: 64, num_workers: 4, 0.01852248 seconds
coalesced reads, workgroups: 64, num_workers: 8, 0.01010552 seconds
coalesced reads, workgroups: 64, num_workers: 16, 0.00922304 seconds
coalesced reads, workgroups: 64, num_workers: 32, 0.00324184 seconds
coalesced reads, workgroups: 64, num_workers: 64, 0.00268032 seconds
coalesced reads, workgroups: 64, num_workers: 128, 0.00276096 seconds
coalesced reads, workgroups: 128, num_workers: 4, 0.02534048 seconds
coalesced reads, workgroups: 128, num_workers: 8, 0.011612 seconds
coalesced reads, workgroups: 128, num_workers: 16, 0.00621888 seconds
coalesced reads, workgroups: 128, num_workers: 32, 0.00337832 seconds
coalesced reads, workgroups: 128, num_workers: 64, 0.00275672 seconds
coalesced reads, workgroups: 128, num_workers: 128, 0.00271792 seconds
coalesced reads, workgroups: 256, num_workers: 4, 0.01766056 seconds
coalesced reads, workgroups: 256, num_workers: 8, 0.00872592 seconds
coalesced reads, workgroups: 256, num_workers: 16, 0.0054876 seconds
coalesced reads, workgroups: 256, num_workers: 32, 0.00271448 seconds
coalesced reads, workgroups: 256, num_workers: 64, 0.00279576 seconds
coalesced reads, workgroups: 256, num_workers: 128, 0.00255944 seconds
coalesced reads, workgroups: 512, num_workers: 4, 0.01784168 seconds
coalesced reads, workgroups: 512, num_workers: 8, 0.00853328 seconds
coalesced reads, workgroups: 512, num_workers: 16, 0.00477104 seconds
coalesced reads, workgroups: 512, num_workers: 32, 0.00276224 seconds
coalesced reads, workgroups: 512, num_workers: 64, 0.00294664 seconds
coalesced reads, workgroups: 512, num_workers: 128, 0.00282192 seconds

######################### Blocks output ##############################

blocked reads, workgroups: 8, num_workers: 4, 0.14206752 seconds
blocked reads, workgroups: 8, num_workers: 8, 0.07778488 seconds
blocked reads, workgroups: 8, num_workers: 16, 0.05246424 seconds
blocked reads, workgroups: 8, num_workers: 32, 0.02621472 seconds
blocked reads, workgroups: 8, num_workers: 64, 0.01133264 seconds
blocked reads, workgroups: 8, num_workers: 128, 0.00752464 seconds
blocked reads, workgroups: 16, num_workers: 4, 0.07089848 seconds
blocked reads, workgroups: 16, num_workers: 8, 0.03954336 seconds
blocked reads, workgroups: 16, num_workers: 16, 0.03290536 seconds
blocked reads, workgroups: 16, num_workers: 32, 0.01196296 seconds
blocked reads, workgroups: 16, num_workers: 64, 0.00750312 seconds
blocked reads, workgroups: 16, num_workers: 128, 0.00618032 seconds
blocked reads, workgroups: 32, num_workers: 4, 0.03575768 seconds
blocked reads, workgroups: 32, num_workers: 8, 0.02162808 seconds
blocked reads, workgroups: 32, num_workers: 16, 0.01131608 seconds
blocked reads, workgroups: 32, num_workers: 32, 0.00879392 seconds
blocked reads, workgroups: 32, num_workers: 64, 0.00660272 seconds
blocked reads, workgroups: 32, num_workers: 128, 0.00650592 seconds
blocked reads, workgroups: 64, num_workers: 4, 0.01826832 seconds
blocked reads, workgroups: 64, num_workers: 8, 0.01026856 seconds
blocked reads, workgroups: 64, num_workers: 16, 0.00737376 seconds
blocked reads, workgroups: 64, num_workers: 32, 0.0061656 seconds
blocked reads, workgroups: 64, num_workers: 64, 0.0065076 seconds
blocked reads, workgroups: 64, num_workers: 128, 0.00643744 seconds
blocked reads, workgroups: 128, num_workers: 4, 0.01825976 seconds
blocked reads, workgroups: 128, num_workers: 8, 0.01133552 seconds
blocked reads, workgroups: 128, num_workers: 16, 0.00850736 seconds
blocked reads, workgroups: 128, num_workers: 32, 0.0065948 seconds
blocked reads, workgroups: 128, num_workers: 64, 0.00705792 seconds
blocked reads, workgroups: 128, num_workers: 128, 0.00624128 seconds
blocked reads, workgroups: 256, num_workers: 4, 0.0137916 seconds
blocked reads, workgroups: 256, num_workers: 8, 0.00877536 seconds
blocked reads, workgroups: 256, num_workers: 16, 0.00589888 seconds
blocked reads, workgroups: 256, num_workers: 32, 0.0057244 seconds
blocked reads, workgroups: 256, num_workers: 64, 0.00661848 seconds
blocked reads, workgroups: 256, num_workers: 128, 0.00618208 seconds
blocked reads, workgroups: 512, num_workers: 4, 0.01365016 seconds
blocked reads, workgroups: 512, num_workers: 8, 0.0086716 seconds
blocked reads, workgroups: 512, num_workers: 16, 0.00666688 seconds
blocked reads, workgroups: 512, num_workers: 32, 0.00613208 seconds
blocked reads, workgroups: 512, num_workers: 64, 0.0063936 seconds
blocked reads, workgroups: 512, num_workers: 128, 0.00588088 seconds
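
Reading the two tables together: coalesced reads win at essentially every configuration, with a best time of 0.00254848 s (32 workgroups x 128 workers) against 0.0057244 s for blocked reads (256 workgroups x 32 workers), roughly a 2.2x gap. A minimal Python sketch to pull those best configurations out of the logs (it assumes only the line format printed above):

import re

def best_times(log_text):
    # Lines look like:
    #   coalesced reads, workgroups: 8, num_workers: 4, 0.1509724 seconds
    pattern = re.compile(r"(coalesced|blocked) reads, workgroups: (\d+), "
                         r"num_workers: (\d+), ([0-9.]+) seconds")
    best = {}
    for kind, groups, workers, secs in pattern.findall(log_text):
        t = float(secs)
        if kind not in best or t < best[kind][0]:
            best[kind] = (t, int(groups), int(workers))
    return best

# best_times(open('P3.txt').read()) ->
# {'coalesced': (0.00254848, 32, 128), 'blocked': (0.0057244, 256, 32)}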
29 changes: 20 additions & 9 deletions HW3/P3/sum.cl
Original file line number Diff line number Diff line change
@@ -5,11 +5,12 @@ __kernel void sum_coalesced(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
size_t global_id = get_global_id(0);

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (size_t i = global_id; i < N; i += get_global_size(0)) {
    sum += x[i];
}

fast[local_id] = sum;
@@ -24,8 +25,12 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
size_t local_size = get_local_size(0);
for (size_t s = local_size / 2; s > 0; s >>= 1) {
    if (local_id < s) {
        fast[local_id] += fast[local_id + s];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,7 +43,8 @@ __kernel void sum_blocked(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
int k = ceil((float)N / get_global_size(0));
size_t global_id = get_global_id(0);

// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
@@ -48,8 +54,8 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (size_t i = global_id * k; i < N && i < global_id * k + k; ++i) {
    sum += x[i];
}

fast[local_id] = sum;
@@ -64,8 +70,13 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
size_t local_size = get_local_size(0);
for (size_t s = local_size / 2; s > 0; s >>= 1) {
    if (local_id < s) {
        fast[local_id] += fast[local_id + s];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
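For context, a hedged host-side sketch of how sum_coalesced might be driven from PyOpenCL. Buffer sizes follow tune.py; the kernel argument order (x, partial, fast, N) and N's scalar type are assumptions read off the excerpts above, so match np.uint32 to however N is actually declared in sum.cl (e.g. np.int64 for a long):

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

N = 1 << 20
host_x = np.random.rand(N).astype(np.float32)
num_workgroups, num_workers = 32, 128  # best config from P3.txt

x = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
              hostbuf=host_x)
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
local = cl.LocalMemory(num_workers * 4)  # one float32 slot per worker

program = cl.Program(ctx, open('sum.cl').read()).build()
program.sum_coalesced(queue, (num_workgroups * num_workers,), (num_workers,),
                      x, partial_sums, local, np.uint32(N))

# One partial sum per workgroup comes back; finish the reduction on the host.
host_partial = np.empty(num_workgroups, dtype=np.float32)
cl.enqueue_copy(queue, host_partial, partial_sums)
print host_partial.sum(), host_x.sum()  # should agree to float32 precision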
4 changes: 2 additions & 2 deletions HW3/P3/tune.py
@@ -23,7 +23,7 @@ def create_data(N):
times = {}

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +40,7 @@ def create_data(N):
format(num_workgroups, num_workers, seconds))

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
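The dropped "+ 4" is the point of this change: cl.Buffer takes a size in bytes, and the kernels write exactly one float32 partial sum per workgroup, so 4 bytes per workgroup is exactly enough and the extra float was never read. A tiny sketch of the arithmetic:

import numpy as np

num_workgroups = 512  # largest value swept above
nbytes = np.dtype(np.float32).itemsize * num_workgroups  # == 4 * num_workgroups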
63 changes: 62 additions & 1 deletion HW3/P4/median_filter.cl
@@ -1,5 +1,24 @@
#include "median9.h"

// Helper minimum and maximum functions.
int minn(int x, int y) {
return (x > y) ? y : x;
}

int maxx(int x, int y) {
return (x > y) ? x : y;
}

// Fetch a pixel, clamping out-of-bounds coordinates to the nearest image edge.
float convert(__global __read_only float *in_values,
int w, int h,
int x, int y)
{
int a = maxx(minn(x, w-1), 0);
int b = maxx(minn(y, h-1), 0);
return in_values[b*w + a];
}

// 3x3 median filter
__kernel void
median_3x3(__global __read_only float *in_values,
@@ -13,7 +32,6 @@ median_3x3(__global __read_only float *in_values,
// without using the local buffer, first, then adjust your code to
// use such a buffer after you have that working.


// Load into buffer (with 1-pixel halo).
//
// It may be helpful to consult HW3 Problem 5, and
@@ -31,4 +49,47 @@ median_3x3(__global __read_only float *in_values,

// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.
// Global position of output pixel
const int x = get_global_id(0);
const int y = get_global_id(1);

// Local position relative to (0, 0) in workgroup
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// coordinates of the upper left corner of the buffer in image
// space, including halo
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// coordinates of our pixel in the local buffer
const int buf_x = lx + halo;
const int buf_y = ly + halo;

// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

int row;

if (idx_1D < buf_w) {
    for (row = 0; row < buf_h; row++) {
        buffer[row * buf_w + idx_1D] =
            convert(in_values, w, h, buf_corner_x + idx_1D, buf_corner_y + row);
    }
}

barrier(CLK_LOCAL_MEM_FENCE);

// write output by calling the median9 function from median9.h.
if ((y < h) && (x < w)) // stay in bounds
out_values[y * w + x] = median9(buffer[(buf_y-1) * buf_w + buf_x-1],
buffer[(buf_y-1) * buf_w + buf_x],
buffer[(buf_y-1) * buf_w + buf_x+1],
buffer[(buf_y) * buf_w + buf_x-1],
buffer[(buf_y) * buf_w + buf_x],
buffer[(buf_y) * buf_w + buf_x+1],
buffer[(buf_y+1) * buf_w + buf_x-1],
buffer[(buf_y+1) * buf_w + buf_x],
buffer[(buf_y+1) * buf_w + buf_x+1]);

}
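
As an aside, OpenCL C (1.1 and later) has integer overloads of the built-in clamp, so the minn/maxx helpers could be dropped. A minimal equivalent of convert, as a sketch:

// Equivalent to convert() above, using the built-in integer clamp.
float convert_clamped(__global __read_only float *in_values,
                      int w, int h, int x, int y)
{
    int a = clamp(x, 0, w - 1);  // clamp(x, minval, maxval)
    int b = clamp(y, 0, h - 1);
    return in_values[b * w + a];
}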

5 changes: 3 additions & 2 deletions HW3/P4/median_filter.py
@@ -1,8 +1,8 @@
from __future__ import division
import pyopencl as cl
import numpy as np
import imread
import pylab
import os.path

def round_up(global_size, group_size):
r = global_size % group_size
@@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

curdir = os.path.dirname(os.path.realpath(__file__))
program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])

host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
host_image_filtered = np.zeros_like(host_image)
28 changes: 28 additions & 0 deletions HW3/P5/P5.txt
@@ -0,0 +1,28 @@

Part 1:
Maze 1: Finished after 915 iterations, 207.802 ms total, 0.227106010929 ms per iteration
Maze 2: Finished after 532 iterations, 120.78392 ms total, 0.227037443609 ms per iteration

Part 2:
Maze 1: Finished after 529 iterations, 116.80128 ms total, 0.22079637051 ms per iteration
Maze 2: Finished after 273 iterations, 61.12912 ms total, 0.223916190476 ms per iteration

Part 3:
Maze 1: Finished after 10 iterations, 3.0388 ms total, 0.30388 ms per iteration
Maze 2: Finished after 10 iterations, 2.88184 ms total, 0.288184 ms per iteration

Part 4:
Maze 1: Finished after 10 iterations, 7.55592 ms total, 0.755592 ms per iteration
Maze 2: Finished after 9 iterations, 6.84968 ms total, 0.761075555556 ms per iteration

Justification:
3.1 GHz Intel Core i7
Intel Iris Graphics 6100 1536 MB

On my machine, the single-threaded version (Part 4) is slower than the fully parallel one (Part 3). For this particular problem, the benefit of parallelizing the grandparent-label fetches outweighs the reduction in redundant memory reads and writes, given my hardware configuration. This may vary on other hardware.

Part 5:
Maze 1: Finished after 10 iterations, 7.94048 ms total, 0.794048 ms per iteration
Maze 2: Finished after 9 iterations, 7.23608 ms total, 0.804008888889 ms per iteration

Using min instead of atomic_min increases the total time while leaving the number of iterations unchanged, and therefore increases the time per iteration. This result may vary across machines.
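
To make the comparison concrete, here is a sketch of the two update strategies (identifiers follow label_regions.cl; these are illustrative kernels, not part of the submission):

__kernel void merge_atomic(__global int *labels, int old_label, int new_label)
{
    // The read-modify-write is one indivisible operation, so concurrent
    // updates of labels[old_label] cannot lose the minimum.
    atomic_min(&labels[old_label], new_label);
}

__kernel void merge_racy(__global int *labels, int old_label, int new_label)
{
    // Two work-items can both read the same old value; the larger write
    // may land last and clobber the smaller one, delaying convergence.
    labels[old_label] = min(labels[old_label], new_label);
}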
50 changes: 48 additions & 2 deletions HW3/P5/label_regions.cl
@@ -17,6 +17,20 @@ initialize_labels(__global __read_only int *image,
}
}

int minn(int x, int y) {
return (x > y) ? y : x;
}

int maxx(int x, int y) {
return (x > y) ? x : y;
}

int
update_label(int old, int l, int r, int t, int b)
{
return minn(minn(minn(minn(old, l), r), t), b);
}

int
get_clamped_value(__global __read_only int *labels,
int w, int h,
@@ -80,18 +94,50 @@ propagate_labels(__global __read_write int *labels,
old_label = buffer[buf_y * buf_w + buf_x];

// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)

// Part 2: replace each foreground label (label < w*h) in the buffer with
// its grandparent, labels[old_label]. Kept for reference; superseded by Part 4 below.
/*
if (old_label < w*h) {
buffer[buf_y * buf_w + buf_x] = labels[old_label];
}
*/

// Part 4: a single work-item per group fetches the grandparent labels for
// the whole buffer, memoizing the last label looked up so runs of equal
// labels do not trigger repeated global reads.
if (idx_1D == 0) {
    int last_fetch = -1;
    int last_label = -1;
    for (int yiter = halo; yiter < buf_h - halo; ++yiter) {
        for (int xiter = halo; xiter < buf_w - halo; ++xiter) {
            int cur_idx = yiter * buf_w + xiter;  // row-major: row stride is buf_w
            if (buffer[cur_idx] >= w*h) continue; // skip background pixels
            if (buffer[cur_idx] != last_label) {
                last_label = buffer[cur_idx];
                last_fetch = labels[buffer[cur_idx]];
            }
            buffer[cur_idx] = last_fetch;
        }
    }
}

barrier(CLK_LOCAL_MEM_FENCE);

// stay in bounds
if ((x < w) && (y < h)) {
// CODE FOR PART 1 HERE
if (old_label >= w * h) new_label = old_label;
else new_label = update_label(old_label,
                              buffer[(buf_y-1)*buf_w + buf_x],
                              buffer[(buf_y+1)*buf_w + buf_x],
                              buffer[buf_y * buf_w + buf_x - 1],
                              buffer[buf_y * buf_w + buf_x + 1]);

Reviewer comment (on the old_label argument): After parts 2 and 4, you should use buffer[buf_w * buf_y + buf_x] instead of old_label.

if (new_label != old_label) {
// CODE FOR PART 3 HERE
// indicate there was a change this iteration.
// multiple threads might write this.
atomic_min(&labels[old_label], new_label);
// Part 5 variant (non-atomic, racy):
// labels[old_label] = min(labels[old_label], new_label);
*(changed_flag) += 1;
labels[y * w + x] = new_label;

Reviewer comment (on the line above): This should also be done using atomic_min.
}