use the best experiment for max tflops (#32)
stas00 authored Mar 7, 2024
1 parent 56aeee1 commit 233f003
Showing 1 changed file with 8 additions and 8 deletions.
benchmarks/sizing/utils.py (16 changes: 8 additions & 8 deletions)
@@ -72,7 +72,7 @@ def benchmark_mm(m, n, k, num_iterations, num_warmup_iterations):
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000
+    elapsed_time = np.amin(times)/1000
     print(f"Elapsed time for {m}x{n}x{k}: {elapsed_time:.3f}")
     print(f"Throughput (in TFLOP/s) for {m}x{n}x{k}: {(2 * m * n * k) / (elapsed_time * 10**12):.3f}")
     print("-" * 80)
@@ -99,7 +99,7 @@ def benchmark_mm_b(m, n, k, label, b, num_iterations,num_warmup_iterations):
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000
+    elapsed_time = np.amin(times)/1000
     print(f"Elapsed time for {label} ({m}x{n}x{k}, b={b}): {elapsed_time :.4f}")
     print(f"Throughput (in TFLOP/s) for {label} ({m}x{n}x{k}, b={b}): "
           f"{(2 * b * m * n * k) / (elapsed_time * 10**12):.3f}")
@@ -120,7 +120,7 @@ def benchmark_bmm(b, m, n, k, label,num_iterations, num_warmup_iterations):
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000
+    elapsed_time = np.amin(times)/1000
     print(f"Elapsed time for {label} ({b}x{m}x{n}x{k}): {elapsed_time :.4f}")
     print(f"Throughput (in TFLOP/s) for {label} ({b}x{m}x{n}x{k}): "
           f"{(2 * b * m * n * k) / (elapsed_time * 10**12):.3f}")
@@ -141,7 +141,7 @@ def benchmark_dropout(A_dim, label, num_iterations, num_warmup_iterations):
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000
+    elapsed_time = np.amin(times)/1000
     print(f"Elapsed time for {label} ({display(A_dim)}): {elapsed_time :.4f}")
     return elapsed_time

@@ -166,7 +166,7 @@ def benchmark_softmax(scores_shape, seq_length, label, num_iterations,num_warmup_iterations):
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000
+    elapsed_time = np.amin(times)/1000
     print(f"Elapsed time for {label} ({display(scores_shape)}): {elapsed_time :.4f}")
     return elapsed_time

@@ -184,7 +184,7 @@ def benchmark_fused_gelu(A_dim, b_dim, label, num_iterations, num_warmup_iterations):
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000
+    elapsed_time = np.amin(times)/1000
     print(f"Elapsed time for {label} ({display(A_dim)}): {elapsed_time :.4f}")
     return elapsed_time

@@ -202,7 +202,7 @@ def benchmark_layer_norm(A_dim, normalized_shape, label, num_iterations, num_warmup_iterations):
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000
+    elapsed_time = np.amin(times)/1000
     print(f"Elapsed time for {label} ({display(A_dim)}): {elapsed_time :.4f}")
     return elapsed_time

@@ -221,6 +221,6 @@ def benchmark_add_bias_dropout(shape, label, num_iterations, num_warmup_iterations):
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000
+    elapsed_time = np.amin(times)/1000
     print(f"Elapsed time for {label} ({display(shape)}): {elapsed_time :.4f}")
     return elapsed_time
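
For context, here is a minimal, self-contained sketch of the timing pattern all eight functions share after this commit (shapes, dtype, and iteration counts below are illustrative assumptions, not the repository's defaults). Each kernel is timed with CUDA events over warmup plus measured iterations, and the minimum post-warmup time is kept: with np.amax a single slow outlier (clock ramp-up, a competing kernel) set the result, whereas np.amin reports the best run and hence the maximum achievable TFLOP/s.

# Minimal sketch of the post-commit timing pattern; names and defaults
# are illustrative, not the repository's exact code.
import numpy as np
import torch

def benchmark_mm_sketch(m, n, k, num_iterations=100, num_warmup_iterations=10):
    A = torch.randn(m, n, dtype=torch.half, device="cuda")
    B = torch.randn(n, k, dtype=torch.half, device="cuda")
    times = np.zeros(num_iterations + num_warmup_iterations)
    for i in range(num_iterations + num_warmup_iterations):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        torch.mm(A, B)
        end.record()
        # Wait for the kernel to finish before reading the event timer.
        torch.cuda.synchronize()
        times[i] = start.elapsed_time(end)  # milliseconds
    times = times[num_warmup_iterations:]  # drop warmup samples
    # Best (minimum) time -> maximum observed throughput.
    elapsed_time = np.amin(times) / 1000  # seconds
    # An m x n by n x k matmul does m*n*k multiply-adds = 2*m*n*k FLOPs,
    # matching the throughput formula in the print statements above.
    tflops = (2 * m * n * k) / (elapsed_time * 10**12)
    return elapsed_time, tflops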
