Skip to content

Commit

Permalink
Add newlines, large benchmark, and throughput measures to benchmarks
Browse files Browse the repository at this point in the history
Added a throughput measurement to the criterion report. This will make it easier
to measure GB/s for a particular algorithm. Also added a large benchmark that
will test crossing chunk boundaries. Lastly, added the following line separators
to the benchmarks:

- Line Feed
- Vertical Tab
- Form Feed
- Carriage Return
- Line Separator
- Next Line
- Paragraph Separator
  • Loading branch information
CeleritasCelery committed Feb 6, 2023
1 parent e92188e commit a00ab3d
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 45 deletions.
175 changes: 130 additions & 45 deletions benches/all.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use std::fs;

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
use str_indices::{chars, lines, lines_crlf, lines_lf, utf16};

fn all(c: &mut Criterion) {
// Load benchmark strings.
let test_strings: Vec<(&str, String)> = vec![
let test_strings = vec![
("en_0001", "E".into()),
(
"en_0010",
Expand All @@ -19,6 +19,12 @@ fn all(c: &mut Criterion) {
"en_1000",
fs::read_to_string("benches/text/en_1000.txt").expect("Cannot find benchmark text."),
),
(
"en_10000",
fs::read_to_string("benches/text/en_1000.txt")
.expect("Cannot find benchmark text.")
.repeat(10),
),
("jp_0003", "日".into()),
(
"jp_0102",
Expand All @@ -28,6 +34,32 @@ fn all(c: &mut Criterion) {
"jp_1001",
fs::read_to_string("benches/text/jp_1001.txt").expect("Cannot find benchmark text."),
),
(
"jp_10000",
fs::read_to_string("benches/text/jp_1001.txt")
.expect("Cannot find benchmark text.")
.repeat(10),
),
];

let line_strings = vec![
(
"lines_100",
fs::read_to_string("benches/text/lines.txt")
.expect("Cannot find benchmark text."),
),
(
"lines_1000",
fs::read_to_string("benches/text/lines.txt")
.expect("Cannot find benchmark text.")
.repeat(10),
),
(
"lines_10000",
fs::read_to_string("benches/text/lines.txt")
.expect("Cannot find benchmark text.")
.repeat(100),
),
];

//---------------------------------------------------------
Expand All @@ -37,6 +69,7 @@ fn all(c: &mut Criterion) {
{
let mut group = c.benchmark_group("chars::count");
for (text_name, text) in test_strings.iter() {
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(*text_name, |bench| {
bench.iter(|| {
black_box(chars::count(text));
Expand All @@ -49,6 +82,7 @@ fn all(c: &mut Criterion) {
// for performance comparisons.
let mut group = c.benchmark_group("chars::count_std");
for (text_name, text) in test_strings.iter() {
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(*text_name, |bench| {
bench.iter(|| {
black_box(text.chars().count());
Expand All @@ -61,6 +95,7 @@ fn all(c: &mut Criterion) {
{
let mut group = c.benchmark_group("chars::from_byte_idx");
for (text_name, text) in test_strings.iter() {
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(*text_name, |bench| {
let idx = text.len();
bench.iter(|| {
Expand All @@ -74,6 +109,7 @@ fn all(c: &mut Criterion) {
// for performance comparisons.
let mut group = c.benchmark_group("chars::from_byte_idx_std");
for (text_name, text) in test_strings.iter() {
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(format!("std::{}", text_name), |bench| {
let idx = text.len();
bench.iter(|| {
Expand All @@ -96,6 +132,7 @@ fn all(c: &mut Criterion) {
{
let mut group = c.benchmark_group("chars::to_byte_idx");
for (text_name, text) in test_strings.iter() {
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(*text_name, |bench| {
let idx = chars::count(text);
bench.iter(|| {
Expand All @@ -109,6 +146,7 @@ fn all(c: &mut Criterion) {
// for performance comparisons.
let mut group = c.benchmark_group("chars::to_byte_idx_std");
for (text_name, text) in test_strings.iter() {
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(format!("std::{}", text_name), |bench| {
let idx = chars::count(text) - 1; // Minus 1 so we can unwrap below.
bench.iter(|| {
Expand All @@ -125,6 +163,7 @@ fn all(c: &mut Criterion) {
{
let mut group = c.benchmark_group("utf16::count");
for (text_name, text) in test_strings.iter() {
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(*text_name, |bench| {
bench.iter(|| {
black_box(utf16::count(text));
Expand All @@ -137,6 +176,7 @@ fn all(c: &mut Criterion) {
{
let mut group = c.benchmark_group("utf16::count_surrogates");
for (text_name, text) in test_strings.iter() {
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(*text_name, |bench| {
bench.iter(|| {
black_box(utf16::count_surrogates(text));
Expand All @@ -149,6 +189,7 @@ fn all(c: &mut Criterion) {
{
let mut group = c.benchmark_group("utf16::from_byte_idx");
for (text_name, text) in test_strings.iter() {
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(*text_name, |bench| {
let idx = text.len();
bench.iter(|| {
Expand All @@ -162,6 +203,7 @@ fn all(c: &mut Criterion) {
{
let mut group = c.benchmark_group("utf16::to_byte_idx");
for (text_name, text) in test_strings.iter() {
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(*text_name, |bench| {
let idx = utf16::count(text);
bench.iter(|| {
Expand All @@ -173,42 +215,64 @@ fn all(c: &mut Criterion) {

//---------------------------------------------------------
// Lines (unicode).
let unicode_line_breaks = [
("LF", "\u{000A}"),
("VT", "\u{000B}"),
("FF", "\u{000C}"),
("CR", "\u{000D}"),
("NEL", "\u{0085}"),
("LS", "\u{2028}"),
("PS", "\u{2029}"),
("CRLF", "\u{000D}\u{000A}"),
];

// lines::count_breaks()
{
let mut group = c.benchmark_group("lines::count_breaks");
for (text_name, text) in test_strings.iter() {
group.bench_function(*text_name, |bench| {
bench.iter(|| {
black_box(lines::count_breaks(text));
})
});
for (text_name, text) in line_strings.iter() {
for (break_name, line_break) in unicode_line_breaks {
let text = text.replace('\n', line_break);
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(format!("{text_name}_{break_name}"), |bench| {
bench.iter(|| {
black_box(lines::count_breaks(&text));
})
});
}
}
}

// lines::from_byte_idx()
{
let mut group = c.benchmark_group("lines::from_byte_idx");
for (text_name, text) in test_strings.iter() {
group.bench_function(*text_name, |bench| {
let idx = text.len();
bench.iter(|| {
black_box(lines::from_byte_idx(text, idx));
})
});
for (text_name, text) in line_strings.iter() {
for (break_name, line_break) in unicode_line_breaks {
let text = text.replace('\n', line_break);
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(format!("{text_name}_{break_name}"), |bench| {
let idx = text.len();
bench.iter(|| {
black_box(lines::from_byte_idx(&text, idx));
})
});
}
}
}

// lines::to_byte_idx()
{
let mut group = c.benchmark_group("lines::to_byte_idx");
for (text_name, text) in test_strings.iter() {
group.bench_function(*text_name, |bench| {
let idx = lines::count_breaks(text) + 1;
bench.iter(|| {
black_box(lines::to_byte_idx(text, idx));
})
});
for (text_name, text) in line_strings.iter() {
for (break_name, line_break) in unicode_line_breaks {
let text = &text.replace('\n', line_break);
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(format!("{text_name}_{break_name}"), |bench| {
let idx = lines::count_breaks(text) + 1;
bench.iter(|| {
black_box(lines::to_byte_idx(text, idx));
})
});
}
}
}

Expand All @@ -218,7 +282,8 @@ fn all(c: &mut Criterion) {
// lines_lf::count_breaks()
{
let mut group = c.benchmark_group("lines_lf::count_breaks");
for (text_name, text) in test_strings.iter() {
for (text_name, text) in line_strings.iter() {
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(*text_name, |bench| {
bench.iter(|| {
black_box(lines_lf::count_breaks(text));
Expand All @@ -234,7 +299,8 @@ fn all(c: &mut Criterion) {
// But it should be close enough for perf
// comparisons.
let mut group = c.benchmark_group("lines_lf::count_breaks_std");
for (text_name, text) in test_strings.iter() {
for (text_name, text) in line_strings.iter() {
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(*text_name, |bench| {
bench.iter(|| {
black_box(text.lines().count());
Expand All @@ -246,7 +312,8 @@ fn all(c: &mut Criterion) {
// lines_lf::from_byte_idx()
{
let mut group = c.benchmark_group("lines_lf::from_byte_idx");
for (text_name, text) in test_strings.iter() {
for (text_name, text) in line_strings.iter() {
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(*text_name, |bench| {
let idx = text.len();
bench.iter(|| {
Expand All @@ -260,6 +327,7 @@ fn all(c: &mut Criterion) {
{
let mut group = c.benchmark_group("lines_lf::to_byte_idx");
for (text_name, text) in test_strings.iter() {
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(*text_name, |bench| {
let idx = lines_lf::count_breaks(text) + 1;
bench.iter(|| {
Expand All @@ -271,42 +339,59 @@ fn all(c: &mut Criterion) {

//---------------------------------------------------------
// Lines (CRLF).
let crlf_line_breaks = [
("LF", "\u{000A}"),
("CR", "\u{000D}"),
("CRLF", "\u{000D}\u{000A}"),
];

// lines_crlf::count_breaks()
{
let mut group = c.benchmark_group("lines_crlf::count_breaks");
for (text_name, text) in test_strings.iter() {
group.bench_function(*text_name, |bench| {
bench.iter(|| {
black_box(lines_crlf::count_breaks(text));
})
});
for (text_name, text) in line_strings.iter() {
for (break_name, line_break) in crlf_line_breaks {
let text = &text.replace('\n', line_break);
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(format!("{text_name}_{break_name}"), |bench| {
bench.iter(|| {
black_box(lines_crlf::count_breaks(text));
})
});
}
}
}

// lines_crlf::from_byte_idx()
{
let mut group = c.benchmark_group("lines_crlf::from_byte_idx");
for (text_name, text) in test_strings.iter() {
group.bench_function(*text_name, |bench| {
let idx = text.len();
bench.iter(|| {
black_box(lines_crlf::from_byte_idx(text, idx));
})
});
for (text_name, text) in line_strings.iter() {
for (break_name, line_break) in crlf_line_breaks {
let text = &text.replace('\n', line_break);
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(format!("{text_name}_{break_name}"), |bench| {
let idx = text.len();
bench.iter(|| {
black_box(lines_crlf::from_byte_idx(text, idx));
})
});
}
}
}

// lines_crlf::to_byte_idx()
{
let mut group = c.benchmark_group("lines_crlf::to_byte_idx");
for (text_name, text) in test_strings.iter() {
group.bench_function(*text_name, |bench| {
let idx = lines_crlf::count_breaks(text) + 1;
bench.iter(|| {
black_box(lines_crlf::to_byte_idx(text, idx));
})
});
for (text_name, text) in line_strings.iter() {
for (break_name, line_break) in crlf_line_breaks {
let text = &text.replace('\n', line_break);
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_function(format!("{text_name}_{break_name}"), |bench| {
let idx = lines_crlf::count_breaks(text) + 1;
bench.iter(|| {
black_box(lines_crlf::to_byte_idx(text, idx));
})
});
}
}
}
}
Expand Down
5 changes: 5 additions & 0 deletions benches/text/lines.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
日本 is a West Germanic ー百 that
出典: spoken in early
medieval 日本 eventually
became commons england

0 comments on commit a00ab3d

Please sign in to comment.