Hw3 #425

Open · wants to merge 4 commits into base: HW3
93 changes: 93 additions & 0 deletions HW3/P3/P3.txt
@@ -0,0 +1,93 @@
Problem 3
3.1 GHz Intel Core i7
16 GB 1867 MHz DDR3
Intel Iris Graphics 6100 1536 MB

#################### Coalesced reads output ########################
coalesced reads, workgroups: 8, num_workers: 4, 0.1509724 seconds
coalesced reads, workgroups: 8, num_workers: 8, 0.0738424 seconds
coalesced reads, workgroups: 8, num_workers: 16, 0.04797616 seconds
coalesced reads, workgroups: 8, num_workers: 32, 0.02097256 seconds
coalesced reads, workgroups: 8, num_workers: 64, 0.01119976 seconds
coalesced reads, workgroups: 8, num_workers: 128, 0.00631752 seconds
coalesced reads, workgroups: 16, num_workers: 4, 0.07417096 seconds
coalesced reads, workgroups: 16, num_workers: 8, 0.0370732 seconds
coalesced reads, workgroups: 16, num_workers: 16, 0.0206708 seconds
coalesced reads, workgroups: 16, num_workers: 32, 0.01128632 seconds
coalesced reads, workgroups: 16, num_workers: 64, 0.00626792 seconds
coalesced reads, workgroups: 16, num_workers: 128, 0.0048784 seconds
coalesced reads, workgroups: 32, num_workers: 4, 0.0385776 seconds
coalesced reads, workgroups: 32, num_workers: 8, 0.01847856 seconds
coalesced reads, workgroups: 32, num_workers: 16, 0.01112848 seconds
coalesced reads, workgroups: 32, num_workers: 32, 0.00626464 seconds
coalesced reads, workgroups: 32, num_workers: 64, 0.00322904 seconds
coalesced reads, workgroups: 32, num_workers: 128, 0.00254848 seconds
coalesced reads, workgroups: 64, num_workers: 4, 0.01852248 seconds
coalesced reads, workgroups: 64, num_workers: 8, 0.01010552 seconds
coalesced reads, workgroups: 64, num_workers: 16, 0.00922304 seconds
coalesced reads, workgroups: 64, num_workers: 32, 0.00324184 seconds
coalesced reads, workgroups: 64, num_workers: 64, 0.00268032 seconds
coalesced reads, workgroups: 64, num_workers: 128, 0.00276096 seconds
coalesced reads, workgroups: 128, num_workers: 4, 0.02534048 seconds
coalesced reads, workgroups: 128, num_workers: 8, 0.011612 seconds
coalesced reads, workgroups: 128, num_workers: 16, 0.00621888 seconds
coalesced reads, workgroups: 128, num_workers: 32, 0.00337832 seconds
coalesced reads, workgroups: 128, num_workers: 64, 0.00275672 seconds
coalesced reads, workgroups: 128, num_workers: 128, 0.00271792 seconds
coalesced reads, workgroups: 256, num_workers: 4, 0.01766056 seconds
coalesced reads, workgroups: 256, num_workers: 8, 0.00872592 seconds
coalesced reads, workgroups: 256, num_workers: 16, 0.0054876 seconds
coalesced reads, workgroups: 256, num_workers: 32, 0.00271448 seconds
coalesced reads, workgroups: 256, num_workers: 64, 0.00279576 seconds
coalesced reads, workgroups: 256, num_workers: 128, 0.00255944 seconds
coalesced reads, workgroups: 512, num_workers: 4, 0.01784168 seconds
coalesced reads, workgroups: 512, num_workers: 8, 0.00853328 seconds
coalesced reads, workgroups: 512, num_workers: 16, 0.00477104 seconds
coalesced reads, workgroups: 512, num_workers: 32, 0.00276224 seconds
coalesced reads, workgroups: 512, num_workers: 64, 0.00294664 seconds
coalesced reads, workgroups: 512, num_workers: 128, 0.00282192 seconds

######################### Blocks output ##############################

blocked reads, workgroups: 8, num_workers: 4, 0.14206752 seconds
blocked reads, workgroups: 8, num_workers: 8, 0.07778488 seconds
blocked reads, workgroups: 8, num_workers: 16, 0.05246424 seconds
blocked reads, workgroups: 8, num_workers: 32, 0.02621472 seconds
blocked reads, workgroups: 8, num_workers: 64, 0.01133264 seconds
blocked reads, workgroups: 8, num_workers: 128, 0.00752464 seconds
blocked reads, workgroups: 16, num_workers: 4, 0.07089848 seconds
blocked reads, workgroups: 16, num_workers: 8, 0.03954336 seconds
blocked reads, workgroups: 16, num_workers: 16, 0.03290536 seconds
blocked reads, workgroups: 16, num_workers: 32, 0.01196296 seconds
blocked reads, workgroups: 16, num_workers: 64, 0.00750312 seconds
blocked reads, workgroups: 16, num_workers: 128, 0.00618032 seconds
blocked reads, workgroups: 32, num_workers: 4, 0.03575768 seconds
blocked reads, workgroups: 32, num_workers: 8, 0.02162808 seconds
blocked reads, workgroups: 32, num_workers: 16, 0.01131608 seconds
blocked reads, workgroups: 32, num_workers: 32, 0.00879392 seconds
blocked reads, workgroups: 32, num_workers: 64, 0.00660272 seconds
blocked reads, workgroups: 32, num_workers: 128, 0.00650592 seconds
blocked reads, workgroups: 64, num_workers: 4, 0.01826832 seconds
blocked reads, workgroups: 64, num_workers: 8, 0.01026856 seconds
blocked reads, workgroups: 64, num_workers: 16, 0.00737376 seconds
blocked reads, workgroups: 64, num_workers: 32, 0.0061656 seconds
blocked reads, workgroups: 64, num_workers: 64, 0.0065076 seconds
blocked reads, workgroups: 64, num_workers: 128, 0.00643744 seconds
blocked reads, workgroups: 128, num_workers: 4, 0.01825976 seconds
blocked reads, workgroups: 128, num_workers: 8, 0.01133552 seconds
blocked reads, workgroups: 128, num_workers: 16, 0.00850736 seconds
blocked reads, workgroups: 128, num_workers: 32, 0.0065948 seconds
blocked reads, workgroups: 128, num_workers: 64, 0.00705792 seconds
blocked reads, workgroups: 128, num_workers: 128, 0.00624128 seconds
blocked reads, workgroups: 256, num_workers: 4, 0.0137916 seconds
blocked reads, workgroups: 256, num_workers: 8, 0.00877536 seconds
blocked reads, workgroups: 256, num_workers: 16, 0.00589888 seconds
blocked reads, workgroups: 256, num_workers: 32, 0.0057244 seconds
blocked reads, workgroups: 256, num_workers: 64, 0.00661848 seconds
blocked reads, workgroups: 256, num_workers: 128, 0.00618208 seconds
blocked reads, workgroups: 512, num_workers: 4, 0.01365016 seconds
blocked reads, workgroups: 512, num_workers: 8, 0.0086716 seconds
blocked reads, workgroups: 512, num_workers: 16, 0.00666688 seconds
blocked reads, workgroups: 512, num_workers: 32, 0.00613208 seconds
blocked reads, workgroups: 512, num_workers: 64, 0.0063936 seconds
blocked reads, workgroups: 512, num_workers: 128, 0.00588088 seconds
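
Reading the two tables together: coalesced reads win at essentially every configuration, with a best time of 0.00254848 s (32 workgroups x 128 workers) against 0.0057244 s for blocked reads (256 workgroups x 32 workers), roughly a 2.2x gap. A minimal Python sketch to pull those best configurations out of the logs (it assumes only the line format printed above):

import re

def best_times(log_text):
    # Lines look like:
    #   coalesced reads, workgroups: 8, num_workers: 4, 0.1509724 seconds
    pattern = re.compile(r"(coalesced|blocked) reads, workgroups: (\d+), "
                         r"num_workers: (\d+), ([0-9.]+) seconds")
    best = {}
    for kind, groups, workers, secs in pattern.findall(log_text):
        t = float(secs)
        if kind not in best or t < best[kind][0]:
            best[kind] = (t, int(groups), int(workers))
    return best

# best_times(open('P3.txt').read()) ->
# {'coalesced': (0.00254848, 32, 128), 'blocked': (0.0057244, 256, 32)}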
29 changes: 20 additions & 9 deletions HW3/P3/sum.cl
Original file line number Diff line number Diff line change
@@ -5,11 +5,12 @@ __kernel void sum_coalesced(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
size_t global_id = get_global_id(0);

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (size_t i = global_id; i < N; i += get_global_size(0)) {
    sum += x[i];
}

fast[local_id] = sum;
@@ -24,8 +25,12 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
size_t local_size = get_local_size(0);
for (size_t s = local_size / 2; s > 0; s >>= 1) {
    if (local_id < s) {
        fast[local_id] += fast[local_id + s];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,7 +43,8 @@ __kernel void sum_blocked(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
int k = ceil((float)N / get_global_size(0));
size_t global_id = get_global_id(0);

// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
@@ -48,8 +54,8 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (size_t i = global_id * k; i < N && i < global_id * k + k; ++i) {
    sum += x[i];
}

fast[local_id] = sum;
@@ -64,8 +70,13 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
size_t local_size = get_local_size(0);
for (size_t s = local_size / 2; s > 0; s >>= 1) {
    if (local_id < s) {
        fast[local_id] += fast[local_id + s];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
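For context, a hedged host-side sketch of how sum_coalesced might be driven from PyOpenCL. Buffer sizes follow tune.py; the kernel argument order (x, partial, fast, N) and N's scalar type are assumptions read off the excerpts above, so match np.uint32 to however N is actually declared in sum.cl (e.g. np.int64 for a long):

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

N = 1 << 20
host_x = np.random.rand(N).astype(np.float32)
num_workgroups, num_workers = 32, 128  # best config from P3.txt

x = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
              hostbuf=host_x)
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
local = cl.LocalMemory(num_workers * 4)  # one float32 slot per worker

program = cl.Program(ctx, open('sum.cl').read()).build()
program.sum_coalesced(queue, (num_workgroups * num_workers,), (num_workers,),
                      x, partial_sums, local, np.uint32(N))

# One partial sum per workgroup comes back; finish the reduction on the host.
host_partial = np.empty(num_workgroups, dtype=np.float32)
cl.enqueue_copy(queue, host_partial, partial_sums)
print host_partial.sum(), host_x.sum()  # should agree to float32 precision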
4 changes: 2 additions & 2 deletions HW3/P3/tune.py
@@ -23,7 +23,7 @@ def create_data(N):
times = {}

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +40,7 @@ def create_data(N):
format(num_workgroups, num_workers, seconds))

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
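The dropped "+ 4" is the point of this change: cl.Buffer takes a size in bytes, and the kernels write exactly one float32 partial sum per workgroup, so 4 bytes per workgroup is exactly enough and the extra float was never read. A tiny sketch of the arithmetic:

import numpy as np

num_workgroups = 512  # largest value swept above
nbytes = np.dtype(np.float32).itemsize * num_workgroups  # == 4 * num_workgroups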
63 changes: 62 additions & 1 deletion HW3/P4/median_filter.cl
@@ -1,5 +1,24 @@
#include "median9.h"

// Helper minimum and maximum functions.
int minn(int x, int y) {
return (x > y) ? y : x;
}

int maxx(int x, int y) {
return (x > y) ? x : y;
}

// Fetch a pixel, clamping out-of-bounds coordinates to the nearest image edge.
float convert(__global __read_only float *in_values,
int w, int h,
int x, int y)
{
int a = maxx(minn(x, w-1), 0);
int b = maxx(minn(y, h-1), 0);
return in_values[b*w + a];
}

// 3x3 median filter
__kernel void
median_3x3(__global __read_only float *in_values,
@@ -13,7 +32,6 @@ median_3x3(__global __read_only float *in_values,
// without using the local buffer, first, then adjust your code to
// use such a buffer after you have that working.


// Load into buffer (with 1-pixel halo).
//
// It may be helpful to consult HW3 Problem 5, and
@@ -31,4 +49,47 @@ median_3x3(__global __read_only float *in_values,

// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.
// Global position of output pixel
const int x = get_global_id(0);
const int y = get_global_id(1);

// Local position relative to (0, 0) in workgroup
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// coordinates of the upper left corner of the buffer in image
// space, including halo
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// coordinates of our pixel in the local buffer
const int buf_x = lx + halo;
const int buf_y = ly + halo;

// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

int row;

if (idx_1D < buf_w) {
    for (row = 0; row < buf_h; row++) {
        buffer[row * buf_w + idx_1D] =
            convert(in_values, w, h, buf_corner_x + idx_1D, buf_corner_y + row);
    }
}

barrier(CLK_LOCAL_MEM_FENCE);

// write output by calling the median9 function from median9.h.
if ((y < h) && (x < w)) // stay in bounds
out_values[y * w + x] = median9(buffer[(buf_y-1) * buf_w + buf_x-1],
buffer[(buf_y-1) * buf_w + buf_x],
buffer[(buf_y-1) * buf_w + buf_x+1],
buffer[(buf_y) * buf_w + buf_x-1],
buffer[(buf_y) * buf_w + buf_x],
buffer[(buf_y) * buf_w + buf_x+1],
buffer[(buf_y+1) * buf_w + buf_x-1],
buffer[(buf_y+1) * buf_w + buf_x],
buffer[(buf_y+1) * buf_w + buf_x+1]);

}
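
As an aside, OpenCL C (1.1 and later) has integer overloads of the built-in clamp, so the minn/maxx helpers could be dropped. A minimal equivalent of convert, as a sketch:

// Equivalent to convert() above, using the built-in integer clamp.
float convert_clamped(__global __read_only float *in_values,
                      int w, int h, int x, int y)
{
    int a = clamp(x, 0, w - 1);  // clamp(x, minval, maxval)
    int b = clamp(y, 0, h - 1);
    return in_values[b * w + a];
}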

5 changes: 3 additions & 2 deletions HW3/P4/median_filter.py
@@ -1,8 +1,8 @@
from __future__ import division
import pyopencl as cl
import numpy as np
import imread
import pylab
import os.path

def round_up(global_size, group_size):
r = global_size % group_size
@@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

curdir = os.path.dirname(os.path.realpath(__file__))
program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])

host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
host_image_filtered = np.zeros_like(host_image)
28 changes: 28 additions & 0 deletions HW3/P5/P5.txt
@@ -0,0 +1,28 @@

Part 1:
Maze 1: Finished after 915 iterations, 207.802 ms total, 0.227106010929 ms per iteration
Maze 2: Finished after 532 iterations, 120.78392 ms total, 0.227037443609 ms per iteration

Part 2:
Maze 1: Finished after 529 iterations, 116.80128 ms total, 0.22079637051 ms per iteration
Maze 2: Finished after 273 iterations, 61.12912 ms total, 0.223916190476 ms per iteration

Part 3:
Maze 1: Finished after 10 iterations, 3.0388 ms total, 0.30388 ms per iteration
Maze 2: Finished after 10 iterations, 2.88184 ms total, 0.288184 ms per iteration

Part 4:
Maze 1: Finished after 10 iterations, 7.55592 ms total, 0.755592 ms per iteration
Maze 2: Finished after 9 iterations, 6.84968 ms total, 0.761075555556 ms per iteration

Justification:
3.1 GHz Intel Core i7
Intel Iris Graphics 6100 1536 MB

On my machine, the single-threaded version (Part 4) is slower than the fully parallel one (Part 3). For this particular problem, the benefit of parallelizing the grandparent-label fetches outweighs the reduction in redundant memory reads and writes, given my hardware configuration. This may vary on other hardware.

Part 5:
Maze 1: Finished after 10 iterations, 7.94048 ms total, 0.794048 ms per iteration
Maze 2: Finished after 9 iterations, 7.23608 ms total, 0.804008888889 ms per iteration

Using min instead of atomic_min increases the total time while leaving the number of iterations unchanged, and therefore increases the time per iteration. This result may vary across machines.
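
To make the comparison concrete, here is a sketch of the two update strategies (identifiers follow label_regions.cl; these are illustrative kernels, not part of the submission):

__kernel void merge_atomic(__global int *labels, int old_label, int new_label)
{
    // The read-modify-write is one indivisible operation, so concurrent
    // updates of labels[old_label] cannot lose the minimum.
    atomic_min(&labels[old_label], new_label);
}

__kernel void merge_racy(__global int *labels, int old_label, int new_label)
{
    // Two work-items can both read the same old value; the larger write
    // may land last and clobber the smaller one, delaying convergence.
    labels[old_label] = min(labels[old_label], new_label);
}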
50 changes: 48 additions & 2 deletions HW3/P5/label_regions.cl
@@ -17,6 +17,20 @@ initialize_labels(__global __read_only int *image,
}
}

int minn(int x, int y) {
return (x > y) ? y : x;
}

int maxx(int x, int y) {
return (x > y) ? x : y;
}

int
update_label(int old, int l, int r, int t, int b)
{
return minn(minn(minn(minn(old, l), r), t), b);
}

int
get_clamped_value(__global __read_only int *labels,
int w, int h,
@@ -80,18 +94,50 @@ propagate_labels(__global __read_write int *labels,
old_label = buffer[buf_y * buf_w + buf_x];

// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)

// Part 2: replace each foreground label (label < w*h) in the buffer with
// its grandparent, labels[old_label]. Kept for reference; superseded by Part 4 below.
/*
if (old_label < w*h) {
buffer[buf_y * buf_w + buf_x] = labels[old_label];
}
*/

// Part 4: a single work-item per group fetches the grandparent labels for
// the whole buffer, memoizing the last label looked up so runs of equal
// labels do not trigger repeated global reads.
if (idx_1D == 0) {
    int last_fetch = -1;
    int last_label = -1;
    for (int yiter = halo; yiter < buf_h - halo; ++yiter) {
        for (int xiter = halo; xiter < buf_w - halo; ++xiter) {
            int cur_idx = yiter * buf_w + xiter;  // row-major: row stride is buf_w
            if (buffer[cur_idx] >= w*h) continue; // skip background pixels
            if (buffer[cur_idx] != last_label) {
                last_label = buffer[cur_idx];
                last_fetch = labels[buffer[cur_idx]];
            }
            buffer[cur_idx] = last_fetch;
        }
    }
}

barrier(CLK_LOCAL_MEM_FENCE);

// stay in bounds
if ((x < w) && (y < h)) {
// CODE FOR PART 1 HERE
if (old_label >= w * h) new_label = old_label;
else new_label = update_label(old_label,
                              buffer[(buf_y-1)*buf_w + buf_x],
                              buffer[(buf_y+1)*buf_w + buf_x],
                              buffer[buf_y * buf_w + buf_x - 1],
                              buffer[buf_y * buf_w + buf_x + 1]);

Reviewer comment (on the old_label argument): After parts 2 and 4, you should use buffer[buf_w * buf_y + buf_x] instead of old_label.

if (new_label != old_label) {
// CODE FOR PART 3 HERE
// indicate there was a change this iteration.
// multiple threads might write this.
atomic_min(&labels[old_label], new_label);
// Part 5 variant (non-atomic, racy):
// labels[old_label] = min(labels[old_label], new_label);
*(changed_flag) += 1;
labels[y * w + x] = new_label;

Reviewer comment (on the line above): This should also be done using atomic_min.
}