Skip to content

Commit

Permalink
apply: optimize
Browse files Browse the repository at this point in the history
- use lazy OnceLock initialization for GENDER_GUESER
- lazily init SENTIMENT_ANALYZER
- use itoa and ryu for accelerated integer and float to alpha conversions
- use more to_owned, into_owned and clone_into instead of to_string to minimize allocations
  • Loading branch information
jqnatividad committed Oct 19, 2024
1 parent cd00daa commit bc837ae
Showing 1 changed file with 55 additions and 24 deletions.
79 changes: 55 additions & 24 deletions src/cmd/apply.rs
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,7 @@ static SENTIMENT_ANALYZER: OnceLock<SentimentIntensityAnalyzer> = OnceLock::new(
static THOUSANDS_POLICY: OnceLock<SeparatorPolicy> = OnceLock::new();
static ROUND_PLACES: OnceLock<u32> = OnceLock::new();
static WHATLANG_CONFIDENCE_THRESHOLD: OnceLock<f64> = OnceLock::new();
static GENDER_GUESSER: OnceLock<gender_guesser::Detector> = OnceLock::new();

// default confidence threshold for whatlang language detection - 90% confidence
const DEFAULT_THRESHOLD: f64 = 0.9;
Expand Down Expand Up @@ -831,6 +832,15 @@ fn validate_operations(
"--new_column (-c) is required for sentiment operation."
);
}
if sentiment_invokes == 0 {
if SENTIMENT_ANALYZER
.set(SentimentIntensityAnalyzer::new())
.is_err()
{
return fail!("Cannot initialize Sentiment Analyzer.");
}
}

sentiment_invokes = sentiment_invokes.saturating_add(1);
},
Operations::Simdl
Expand Down Expand Up @@ -932,6 +942,9 @@ fn validate_operations(
"--new_column (-c) is required for Gender_Guess"
);
}
if GENDER_GUESSER.set(gender_guesser::Detector::new()).is_err() {
return fail!("Cannot initialize Gender Detector.");
}
},
_ => {},
}
Expand Down Expand Up @@ -970,7 +983,7 @@ fn apply_operations(
for op in ops_vec {
match op {
Operations::Len => {
*cell = cell.len().to_string();
itoa::Buffer::new().format(cell.len()).clone_into(cell);
},
Operations::Lower => {
*cell = cell.to_lowercase();
Expand All @@ -980,11 +993,11 @@ fn apply_operations(
},
Operations::Squeeze => {
let squeezer: &'static Regex = regex_oncelock!(r"\s+");
*cell = squeezer.replace_all(cell, " ").to_string();
*cell = squeezer.replace_all(cell, " ").into_owned();
},
Operations::Squeeze0 => {
let squeezer: &'static Regex = regex_oncelock!(r"\s+");
*cell = squeezer.replace_all(cell, "").to_string();
*cell = squeezer.replace_all(cell, "").into_owned();
},
Operations::Trim => {
*cell = String::from(cell.trim());
Expand Down Expand Up @@ -1038,18 +1051,21 @@ fn apply_operations(
// including selection of the best algorithm repeatedly at runtime
let mut crc32_hasher = CRC32.get().unwrap().clone();
crc32_hasher.update(cell.as_bytes());
*cell = crc32_hasher.finalize().to_string();
itoa::Buffer::new()
.format(crc32_hasher.finalize())
.clone_into(cell);
},
Operations::Gender_Guess => {
let gender_detector = gender_guesser::Detector::new();
// safety: we set GENDER_GUESSER in validate_operations()
let gender_detector = GENDER_GUESSER.get().unwrap();
*cell = match gender_detector.get_gender(cell) {
Gender::Male => "Male".to_string(),
Gender::Female => "Female".to_string(),
Gender::MayBeMale => "MayBeMale".to_string(),
Gender::MayBeFemale => "MayBeFemale".to_string(),
Gender::BothMaleFemale => "BothMaleFemale".to_string(),
Gender::NotSure => "NotSure".to_string(),
Gender::NotFound => "NotFound".to_string(),
Gender::Male => "Male".to_owned(),
Gender::Female => "Female".to_owned(),
Gender::MayBeMale => "MayBeMale".to_owned(),
Gender::MayBeFemale => "MayBeFemale".to_owned(),
Gender::BothMaleFemale => "BothMaleFemale".to_owned(),
Gender::NotSure => "NotSure".to_owned(),
Gender::NotFound => "NotFound".to_owned(),
};
},
Operations::Escape => {
Expand All @@ -1074,7 +1090,7 @@ fn apply_operations(
Operations::Regex_Replace => {
// safety: we set REGEX_REPLACE in validate_operations()
let regexreplace = REGEX_REPLACE.get().unwrap();
*cell = regexreplace.replace_all(cell, replacement).to_string();
*cell = regexreplace.replace_all(cell, replacement).into_owned();
},
Operations::Censor => {
// safety: we set CENSOR in validate_operations()
Expand All @@ -1084,12 +1100,14 @@ fn apply_operations(
Operations::Censor_Check => {
// safety: we set CENSOR in validate_operations()
let censor = CENSOR.get().unwrap();
*cell = censor.check(cell).to_string();
if censor.check(cell) { "true" } else { "false" }.clone_into(cell);
},
Operations::Censor_Count => {
// safety: we set CENSOR in validate_operations()
let censor = CENSOR.get().unwrap();
*cell = censor.count(cell).to_string();
itoa::Buffer::new()
.format(censor.count(cell))
.clone_into(cell);
},
Operations::Thousands => {
if let Ok(num) = cell.parse::<f64>() {
Expand Down Expand Up @@ -1170,25 +1188,37 @@ fn apply_operations(
}
},
Operations::Simdl => {
*cell = damerau_levenshtein(cell, comparand).to_string();
itoa::Buffer::new()
.format(damerau_levenshtein(cell, comparand))
.clone_into(cell);
},
Operations::Simdln => {
*cell = normalized_damerau_levenshtein(cell, comparand).to_string();
ryu::Buffer::new()
.format_finite(normalized_damerau_levenshtein(cell, comparand))
.clone_into(cell);
},
Operations::Simjw => {
*cell = jaro_winkler(cell, comparand).to_string();
ryu::Buffer::new()
.format_finite(jaro_winkler(cell, comparand))
.clone_into(cell);
},
Operations::Simsd => {
*cell = sorensen_dice(cell, comparand).to_string();
ryu::Buffer::new()
.format_finite(sorensen_dice(cell, comparand))
.clone_into(cell);
},
Operations::Simhm => {
let ham_val = hamming(cell, comparand);
match ham_val {
Ok(val) => *cell = val.to_string(),
Ok(val) => itoa::Buffer::new().format(val).clone_into(cell),
Err(_) => *cell = String::from("ERROR: Different lengths"),
}
},
Operations::Simod => *cell = osa_distance(cell, comparand).to_string(),
Operations::Simod => {
itoa::Buffer::new()
.format(osa_distance(cell, comparand))
.clone_into(cell);
},
Operations::Eudex => {
// safety: we set EUDEX_COMPARAND_HASH in validate_operations()
let eudex_comparand_hash = EUDEX_COMPARAND_HASH.get().unwrap();
Expand All @@ -1197,10 +1227,11 @@ fn apply_operations(
},
Operations::Sentiment => {
// safety: we set SENTIMENT_ANALYZER in validate_operations()
let sentiment_analyzer =
SENTIMENT_ANALYZER.get_or_init(SentimentIntensityAnalyzer::new);
let sentiment_analyzer = SENTIMENT_ANALYZER.get().unwrap();
let sentiment_scores = sentiment_analyzer.polarity_scores(cell);
*cell = sentiment_scores.get("compound").unwrap_or(&0.0).to_string();
ryu::Buffer::new()
.format_finite(*sentiment_scores.get("compound").unwrap_or(&0.0))
.clone_into(cell);
},
Operations::Whatlang => {
let lang_info = detect(cell);
Expand Down

0 comments on commit bc837ae

Please sign in to comment.