From ed6f0167fd1f6097ce9e454c2950c5db00a0c513 Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Mon, 2 Nov 2015 21:49:00 -0500 Subject: [PATCH 1/5] remove extraneous +4 --- HW3/P3/tune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py index c16e9fa6..a0d56da2 100644 --- a/HW3/P3/tune.py +++ b/HW3/P3/tune.py @@ -23,7 +23,7 @@ def create_data(N): times = {} for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) @@ -40,7 +40,7 @@ def create_data(N): format(num_workgroups, num_workers, seconds)) for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) From 6706ee708d97c656caa08b284792b3eee63892cf Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Wed, 4 Nov 2015 21:46:08 -0500 Subject: [PATCH 2/5] typo in typecast --- HW3/P3/sum.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index 4fb771d2..ee914740 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -38,7 +38,7 @@ __kernel void sum_blocked(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); - int k = ceil(float(N) / get_global_size(0)); + int k = ceil((float)N / get_global_size(0)); // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 From f7dc5f66fb34745e3ed74460cdd037836c440df7 Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Thu, 5 Nov 2015 09:43:23 -0500 Subject: [PATCH 3/5] unused module, set include path --- HW3/P4/median_filter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py index 1eda1bb9..a181c05a 100644 --- a/HW3/P4/median_filter.py +++ b/HW3/P4/median_filter.py @@ -1,8 +1,8 @@ from __future__ import division import pyopencl as cl import numpy as np -import imread import pylab +import os.path def round_up(global_size, group_size): r = global_size % group_size @@ -51,7 +51,8 @@ def numpy_median(image, iterations=10): properties=cl.command_queue_properties.PROFILING_ENABLE) print 'The queue is using the device:', queue.device.name - program = cl.Program(context, open('median_filter.cl').read()).build(options='') + curdir = os.path.dirname(os.path.realpath(__file__)) + program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir]) host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy() host_image_filtered = np.zeros_like(host_image) From b1eb609988a0cf462da4de6ec15e19143376b8ee Mon Sep 17 00:00:00 2001 From: Xide Xia Date: Fri, 20 Nov 2015 06:03:38 -0500 Subject: [PATCH 4/5] hw3 --- HW3/P3/P3.txt | 87 +++++++++++++++++++++++++++++++++++++++++ HW3/P3/sum.cl | 42 +++++++++++--------- HW3/P4/median_filter.cl | 46 ++++++++++++++++++++-- HW3/P5/P5.txt | 43 ++++++++++++++++++++ HW3/P5/label_regions.cl | 74 +++++++++++++++++++++++++++-------- 5 files changed, 254 insertions(+), 38 deletions(-) create mode 100644 HW3/P3/P3.txt create mode 100755 HW3/P5/P5.txt diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt new file mode 100644 index 00000000..deb099c6 --- /dev/null +++ b/HW3/P3/P3.txt @@ -0,0 +1,87 @@ +#0: Intel(R) Core(TM) i7-5557U CPU @ 3.10GHz on Apple +#1: Intel(R) Iris(TM) Graphics 6100 on Apple +coalesced reads, workgroups: 8, num_workers: 4, 0.1539116 seconds +coalesced reads, workgroups: 8, num_workers: 8, 0.08159168 seconds +coalesced reads, workgroups: 8, num_workers: 16, 0.05025496 seconds +coalesced reads, workgroups: 8, num_workers: 32, 0.02677488 seconds +coalesced reads, workgroups: 8, num_workers: 64, 0.01586712 seconds +coalesced reads, workgroups: 8, num_workers: 128, 0.00901392 seconds +coalesced reads, workgroups: 16, num_workers: 4, 0.07981168 seconds +coalesced reads, workgroups: 16, num_workers: 8, 0.04508424 seconds +coalesced reads, workgroups: 16, num_workers: 16, 0.0276228 seconds +coalesced reads, workgroups: 16, num_workers: 32, 0.01530312 seconds +coalesced reads, workgroups: 16, num_workers: 64, 0.00905 seconds +coalesced reads, workgroups: 16, num_workers: 128, 0.00379776 seconds +coalesced reads, workgroups: 32, num_workers: 4, 0.04075392 seconds +coalesced reads, workgroups: 32, num_workers: 8, 0.0240396 seconds +coalesced reads, workgroups: 32, num_workers: 16, 0.01509976 seconds +coalesced reads, workgroups: 32, num_workers: 32, 0.00901072 seconds +coalesced reads, workgroups: 32, num_workers: 64, 0.0048736 seconds +coalesced reads, workgroups: 32, num_workers: 128, 0.004536 seconds +coalesced reads, workgroups: 64, num_workers: 4, 0.02377664 seconds +coalesced reads, workgroups: 64, num_workers: 8, 0.01393464 seconds +coalesced reads, workgroups: 64, num_workers: 16, 0.00937744 seconds +coalesced reads, workgroups: 64, num_workers: 32, 0.00493296 seconds +coalesced reads, workgroups: 64, num_workers: 64, 0.00444424 seconds +coalesced reads, workgroups: 64, num_workers: 128, 0.00383528 seconds +coalesced reads, workgroups: 128, num_workers: 4, 0.02512672 seconds +coalesced reads, workgroups: 128, num_workers: 8, 0.01553176 seconds +coalesced reads, workgroups: 128, num_workers: 16, 0.009058 seconds +coalesced reads, workgroups: 128, num_workers: 32, 0.00472944 seconds +coalesced reads, workgroups: 128, num_workers: 64, 0.00383848 seconds +coalesced reads, workgroups: 128, num_workers: 128, 0.00367832 seconds +coalesced reads, workgroups: 256, num_workers: 4, 0.02256088 seconds +coalesced reads, workgroups: 256, num_workers: 8, 0.01199136 seconds +coalesced reads, workgroups: 256, num_workers: 16, 0.00709632 seconds +coalesced reads, workgroups: 256, num_workers: 32, 0.00401824 seconds +coalesced reads, workgroups: 256, num_workers: 64, 0.00396376 seconds +coalesced reads, workgroups: 256, num_workers: 128, 0.0034156 seconds +coalesced reads, workgroups: 512, num_workers: 4, 0.022924 seconds +coalesced reads, workgroups: 512, num_workers: 8, 0.011864 seconds +coalesced reads, workgroups: 512, num_workers: 16, 0.00672344 seconds +coalesced reads, workgroups: 512, num_workers: 32, 0.00368856 seconds +coalesced reads, workgroups: 512, num_workers: 64, 0.00337808 seconds +coalesced reads, workgroups: 512, num_workers: 128, 0.0035404 seconds +blocked reads, workgroups: 8, num_workers: 4, 0.14802352 seconds +blocked reads, workgroups: 8, num_workers: 8, 0.08757504 seconds +blocked reads, workgroups: 8, num_workers: 16, 0.05723184 seconds +blocked reads, workgroups: 8, num_workers: 32, 0.03349552 seconds +blocked reads, workgroups: 8, num_workers: 64, 0.0155752 seconds +blocked reads, workgroups: 8, num_workers: 128, 0.0105428 seconds +blocked reads, workgroups: 16, num_workers: 4, 0.07797992 seconds +blocked reads, workgroups: 16, num_workers: 8, 0.04836992 seconds +blocked reads, workgroups: 16, num_workers: 16, 0.03260912 seconds +blocked reads, workgroups: 16, num_workers: 32, 0.0158576 seconds +blocked reads, workgroups: 16, num_workers: 64, 0.01049312 seconds +blocked reads, workgroups: 16, num_workers: 128, 0.00736264 seconds +blocked reads, workgroups: 32, num_workers: 4, 0.04226272 seconds +blocked reads, workgroups: 32, num_workers: 8, 0.02656104 seconds +blocked reads, workgroups: 32, num_workers: 16, 0.01613416 seconds +blocked reads, workgroups: 32, num_workers: 32, 0.0100472 seconds +blocked reads, workgroups: 32, num_workers: 64, 0.00740984 seconds +blocked reads, workgroups: 32, num_workers: 128, 0.00793808 seconds +blocked reads, workgroups: 64, num_workers: 4, 0.02388056 seconds +blocked reads, workgroups: 64, num_workers: 8, 0.01382888 seconds +blocked reads, workgroups: 64, num_workers: 16, 0.01021512 seconds +blocked reads, workgroups: 64, num_workers: 32, 0.00744776 seconds +blocked reads, workgroups: 64, num_workers: 64, 0.00763472 seconds +blocked reads, workgroups: 64, num_workers: 128, 0.00713896 seconds +blocked reads, workgroups: 128, num_workers: 4, 0.02438576 seconds +blocked reads, workgroups: 128, num_workers: 8, 0.015862 seconds +blocked reads, workgroups: 128, num_workers: 16, 0.01180312 seconds +blocked reads, workgroups: 128, num_workers: 32, 0.00880544 seconds +blocked reads, workgroups: 128, num_workers: 64, 0.00774904 seconds +blocked reads, workgroups: 128, num_workers: 128, 0.00706464 seconds +blocked reads, workgroups: 256, num_workers: 4, 0.02058416 seconds +blocked reads, workgroups: 256, num_workers: 8, 0.0128224 seconds +blocked reads, workgroups: 256, num_workers: 16, 0.00820904 seconds +blocked reads, workgroups: 256, num_workers: 32, 0.00728816 seconds +blocked reads, workgroups: 256, num_workers: 64, 0.00783352 seconds +blocked reads, workgroups: 256, num_workers: 128, 0.00655888 seconds +blocked reads, workgroups: 512, num_workers: 4, 0.01911864 seconds +blocked reads, workgroups: 512, num_workers: 8, 0.01279408 seconds +blocked reads, workgroups: 512, num_workers: 16, 0.00918208 seconds +blocked reads, workgroups: 512, num_workers: 32, 0.00653176 seconds +blocked reads, workgroups: 512, num_workers: 64, 0.00648824 seconds +blocked reads, workgroups: 512, num_workers: 128, 0.00676712 seconds +configuration ('coalesced', 512, 64): 0.00337808 seconds \ No newline at end of file diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index ee914740..2a4a1759 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -5,16 +5,19 @@ __kernel void sum_coalesced(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); - + size_t global_id = get_global_id(0); + int global_size = get_global_size(0); + // thread i (i.e., with i = get_global_id()) should add x[i], // x[i + get_global_size()], ... up to N-1, and store in sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + + for (int i=global_id; i0 ; offset >>= 1) { + if (offset > local_id ) fast[local_id] += fast[local_id + offset]; + barrier(CLK_LOCAL_MEM_FENCE); } - + if (local_id == 0) partial[get_group_id(0)] = fast[0]; } @@ -39,22 +45,21 @@ __kernel void sum_blocked(__global float* x, float sum = 0; size_t local_id = get_local_id(0); int k = ceil((float)N / get_global_size(0)); - + // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 // thread with global_id 2 should add 2k..3k-1 // ... // with k = ceil(N / get_global_size()). - // + // // Be careful that each thread stays in bounds, both relative to - // size of x (i.e., N), and the range it's assigned to sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (int ii = 0; ii < k & get_global_id(0) * k + ii < N; ii++) { // YOUR CODE HERE + sum += x[get_global_id(0) * k + ii]; // YOUR CODE HERE } - + fast[local_id] = sum; barrier(CLK_LOCAL_MEM_FENCE); - + // binary reduction // // thread i should sum fast[i] and fast[i + offset] and store back @@ -64,9 +69,10 @@ __kernel void sum_blocked(__global float* x, // You can assume get_local_size(0) is a power of 2. // // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (uint offset = get_local_size(0)/2; offset>0 ; offset >>= 1) { // YOUR CODE HERE + if (offset > local_id ) fast[local_id] += fast[local_id + offset]; + barrier(CLK_LOCAL_MEM_FENCE); } - + if (local_id == 0) partial[get_group_id(0)] = fast[0]; } diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 07bb294c..bea422dc 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -12,7 +12,24 @@ median_3x3(__global __read_only float *in_values, // Note: It may be easier for you to implement median filtering // without using the local buffer, first, then adjust your code to // use such a buffer after you have that working. + + //get global location + int x = get_global_id(0); + int y = get_global_id(1); + + //get local location + int lx = get_local_id(0); + int ly = get_local_id(1); + + //get corner location of buffer + int buf_corner_x = x - lx - halo; + int buf_corner_y = y - ly - halo; + + //get location of buffer + int buf_x = lx + halo; + int buf_y = ly + halo; + int idx_1D = ly * get_local_size(0) + lx; // Load into buffer (with 1-pixel halo). // @@ -21,14 +38,35 @@ median_3x3(__global __read_only float *in_values, // // Note that globally out-of-bounds pixels should be replaced // with the nearest valid pixel's value. + + if(idx_1D < buf_w){ + // replace x_id to stay in-bounds + int x_id = buf_corner_x + idx_1D; + if(x_id < 0) x_id = 0; + if(x_id >= w) x_id = w - 1; - - // Compute 3x3 median for each pixel in core (non-halo) pixels - // + // replace y_id to stay in-bounds + for(int r = 0; r < buf_h; r++){ + int y_id = buf_corner_y + r; + if(y_id < 0) y_id = 0; + if(y_id >= h) y_id = h-1; + buffer[r*buf_w+idx_1D] = in_values[y_id*w+x_id]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + + + +// Compute 3x3 median for each pixel in core (non-halo) pixels // We've given you median9.h, and included it above, so you can // use the median9() function. - + + float median = median9(buffer[(buf_y-1)*buf_w+buf_x-1], buffer[(buf_y-1)*buf_w+buf_x], buffer[(buf_y-1)*buf_w+buf_x+1], + buffer[(buf_y)*buf_w+buf_x-1], buffer[(buf_y)*buf_w+buf_x], buffer[(buf_y)*buf_w+buf_x+1], + buffer[(buf_y+1)*buf_w+buf_x-1], buffer[(buf_y+1)*buf_w+buf_x], buffer[(buf_y+1)*buf_w+buf_x+1]); // Each thread in the valid region (x < w, y < h) should write // back its 3x3 neighborhood median. + if(x < w && y < h) out_values[x + y * w] = median; } diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt new file mode 100755 index 00000000..4cc51b99 --- /dev/null +++ b/HW3/P5/P5.txt @@ -0,0 +1,43 @@ + +——————————————— Part 1 ———————————————————- + +i) Maze 1 +- Finished after 914 iterations, 222.05546 ms total, 0.242949080963 ms per iteration. +- Found 2 regions. +ii) Maze 2 +- Finished after 532 iterations, 132.33624 ms total, 0.248752348352 ms per iteration. +- Found 35 regions. + +——————————————— Part 2 ———————————————————- + +i) Maze 1 +- Finished after 529 iterations, 128.96085 ms total, 0.243782340345ms per iteration. +- Found 2 regions. +ii) Maze 2 +- Finished after 273 iterations, 65.91784 ms total, 0.241457293493 ms per iteration. +- Found 35 regions. + +——————————————— Part 3 ———————————————————- + +i) Maze 1 +- Finished after 10 iterations, 2.48348 ms total, 0. 248348 ms per iteration. +- Found 2 regions. +ii) Maze +- Finished after 9 iterations, 2.18921 ms total, 0.243246534875 ms per iteration. +- Found 35 regions. + +——————————————— Part 4 ———————————————————- +i) Maze 1 +- Finished after 10 iterations, 4.87712 ms total, 0. 487712 ms per iteration. +- Found 2 regions. +ii) Maze 2 +- Finished after 9 iterations, 4.34102 ms total, 0.482336 ms per iteration. +- Found 35 regions. + +In this part, we used a single thread so that redundant global memory reads could be partly reduced. However, from the results we can see that a single thread is actually not a good choice. I think that’s because most of the labels are different which at the beginning which may require global memory reads. Thus, the time saved by avoiding unnecessary global memory reads is not as much as we expected. + +——————————————— Part 5 ———————————————————- +From the results of this part, I found that min() and atomic_min don’t have much difference on the correctness of the algorithm. However, atomic_min() requires less iterations. I think the reason is that the value in labels will be changed by other threads if we called min(). On the other hand, atomic_min() can guarantee that only one thread could update the label at one time. Thus, I think atomic_min() is a better choice. + + + diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 78b986b3..f06920dd 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -5,7 +5,7 @@ initialize_labels(__global __read_only int *image, { const int x = get_global_id(0); const int y = get_global_id(1); - + if ((x < w) && (y < h)) { if (image[y * w + x] > 0) { // set each pixel > 0 to its linear index @@ -27,6 +27,12 @@ get_clamped_value(__global __read_only int *labels, return labels[y * w + x]; } +int +mymin(int a, int b) +{ + return (a < b) ? a : b; +} + __kernel void propagate_labels(__global __read_write int *labels, __global __write_only int *changed_flag, @@ -36,24 +42,24 @@ propagate_labels(__global __read_write int *labels, const int halo) { // halo is the additional number of cells in one direction - + // Global position of output pixel const int x = get_global_id(0); const int y = get_global_id(1); - + // Local position relative to (0, 0) in workgroup const int lx = get_local_id(0); const int ly = get_local_id(1); - + // coordinates of the upper left corner of the buffer in image // space, including halo const int buf_corner_x = x - lx - halo; const int buf_corner_y = y - ly - halo; - + // coordinates of our pixel in the local buffer const int buf_x = lx + halo; const int buf_y = ly + halo; - + // 1D index of thread within our work-group const int idx_1D = ly * get_local_size(0) + lx; @@ -61,25 +67,52 @@ propagate_labels(__global __read_write int *labels, // Will store the output value int new_label; - // Load the relevant labels to a local buffer with a halo + int last_label = -1; + int last_index = -1; + + // Load the relevant labels to a local buffer with a halo if (idx_1D < buf_w) { for (int row = 0; row < buf_h; row++) { - buffer[row * buf_w + idx_1D] = - get_clamped_value(labels, - w, h, - buf_corner_x + idx_1D, buf_corner_y + row); + buffer[row * buf_w + idx_1D] = + get_clamped_value(labels, + w, h, + buf_corner_x + idx_1D, buf_corner_y + row); } } - + // Make sure all threads reach the next part after // the local buffer is loaded barrier(CLK_LOCAL_MEM_FENCE); - + // Fetch the value from the buffer the corresponds to // the pixel for this thread old_label = buffer[buf_y * buf_w + buf_x]; - - // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) + + + // part 2 + // if (old_label Date: Fri, 20 Nov 2015 06:13:42 -0500 Subject: [PATCH 5/5] p4 change --- HW3/P4/median_filter.cl | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index bea422dc..2015646e 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -62,9 +62,15 @@ median_3x3(__global __read_only float *in_values, // We've given you median9.h, and included it above, so you can // use the median9() function. - float median = median9(buffer[(buf_y-1)*buf_w+buf_x-1], buffer[(buf_y-1)*buf_w+buf_x], buffer[(buf_y-1)*buf_w+buf_x+1], - buffer[(buf_y)*buf_w+buf_x-1], buffer[(buf_y)*buf_w+buf_x], buffer[(buf_y)*buf_w+buf_x+1], - buffer[(buf_y+1)*buf_w+buf_x-1], buffer[(buf_y+1)*buf_w+buf_x], buffer[(buf_y+1)*buf_w+buf_x+1]); + float median = median9(buffer[(buf_y-1)*buf_w+buf_x-1], + buffer[(buf_y-1)*buf_w+buf_x], + buffer[(buf_y-1)*buf_w+buf_x+1], + buffer[(buf_y)*buf_w+buf_x-1], + buffer[(buf_y)*buf_w+buf_x], + buffer[(buf_y)*buf_w+buf_x+1], + buffer[(buf_y+1)*buf_w+buf_x-1], + buffer[(buf_y+1)*buf_w+buf_x], + buffer[(buf_y+1)*buf_w+buf_x+1]); // Each thread in the valid region (x < w, y < h) should write // back its 3x3 neighborhood median.