Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance find_outliers and identify_outliers performance by avoiding duplication and filtering columns #140

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion media/coverage-badge.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
134 changes: 69 additions & 65 deletions src/cosmicqc/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,80 +60,83 @@ def identify_outliers(
or not for use within other functions.
"""

# interpret the df as CytoDataFrame
# Ensure the input is a CytoDataFrame, converting if necessary
df = CytoDataFrame(data=df)

# create a copy of the dataframe to ensure
# we don't modify the supplied dataframe inplace.
outlier_df = df.copy()
# reference df directly as outlier_df (no copy is made; z-score columns are added to it)
outlier_df = df

# Define the naming scheme for z-score columns based on thresholds
thresholds_name = (
f"cqc.{feature_thresholds}"
if isinstance(feature_thresholds, str)
else "cqc.custom"
)

# If feature_thresholds is a string, load the thresholds from the specified file
if isinstance(feature_thresholds, str):
feature_thresholds = read_thresholds_set_from_file(
feature_thresholds=feature_thresholds,
feature_thresholds_file=feature_thresholds_file,
)

# Create z-score columns for each feature to reference during outlier detection
# Dictionary to store mappings of features to their z-score column names
zscore_columns = {}
for feature in feature_thresholds:
# Ensure the feature exists in the DataFrame
if feature not in df.columns:
raise ValueError(f"Feature '{feature}' does not exist in the DataFrame.")
outlier_df[(colname := f"{thresholds_name}.Z_Score.{feature}")] = scipy_zscore(
df[feature]
)
zscore_columns[feature] = colname

# Create outlier detection conditions for each feature
conditions = []
for feature, threshold in feature_thresholds.items():
# For positive thresholds, look for outliers that are
# that number of std "above" the mean
# Construct the z-score column name
zscore_col = f"{thresholds_name}.Z_Score.{feature}"

# Calculate and store z-scores only if not already present
if zscore_col not in outlier_df:
outlier_df[zscore_col] = scipy_zscore(df[feature])

# Add the column name to the zscore_columns dictionary
zscore_columns[feature] = zscore_col

# Helper function to create outlier detection conditions
def create_condition(feature: str, threshold: float) -> pd.Series:
# Positive threshold checks for outliers above the mean
if threshold > 0:
condition = outlier_df[zscore_columns[feature]] > threshold
# For negative thresholds, look for outliers that are
# that number of std "below" the mean
else:
condition = outlier_df[zscore_columns[feature]] < threshold
conditions.append(condition)

result = (
# create a boolean pd.series identifier for dataframe
# based on all conditions for use within other functions.
reduce(operator.and_, conditions)
if not include_threshold_scores
# otherwise, provide the threshold zscore col and the above column
else CytoDataFrame(
data=pd.concat(
[
# grab only the outlier zscore columns from the outlier_df
outlier_df[zscore_columns.values()],
CytoDataFrame(
{
f"{thresholds_name}.is_outlier": reduce(
operator.and_, conditions
)
}
),
],
axis=1,
),
return outlier_df[zscore_columns[feature]] > threshold
# Negative threshold checks for outliers below the mean
return outlier_df[zscore_columns[feature]] < threshold

# Generate outlier detection conditions for all features
conditions = [
create_condition(feature, threshold)
for feature, threshold in feature_thresholds.items()
]

# Construct the result based on whether threshold scores should be included
if include_threshold_scores:
# Extract z-score columns for each feature
zscore_df = outlier_df[list(zscore_columns.values())]

# Combine conditions into a single Series indicating outlier status
is_outlier_series = reduce(operator.and_, conditions).rename(
f"{thresholds_name}.is_outlier"
)

# Combine z-scores and outlier status into a single DataFrame
result = CytoDataFrame(
data=pd.concat([zscore_df, is_outlier_series], axis=1),
data_context_dir=df._custom_attrs["data_context_dir"],
data_mask_context_dir=df._custom_attrs["data_mask_context_dir"],
)
)
else:
# Combine conditions into a single Series of boolean values
result = reduce(operator.and_, conditions)

# Export the result if an export path is specified
if export_path is not None:
if isinstance(result, pd.Series):
CytoDataFrame(result).export(file_path=export_path)
else:
result.export(file_path=export_path)
export_df = CytoDataFrame(result) if isinstance(result, pd.Series) else result
export_df.export(file_path=export_path)

# Return the resulting Series or DataFrame
return result


Expand Down Expand Up @@ -175,26 +178,29 @@ def find_outliers(
Outlier data frame for the given conditions.
"""

# interpret the df as CytoDataFrame
df = CytoDataFrame(data=df)

# Resolve feature_thresholds if provided as a string
if isinstance(feature_thresholds, str):
feature_thresholds = read_thresholds_set_from_file(
feature_thresholds=feature_thresholds,
feature_thresholds_file=feature_thresholds_file,
)

# Filter DataFrame for outliers using all conditions
outliers_df = df[
# use identify outliers as a mask on the full dataframe
identify_outliers(
df=df,
feature_thresholds=feature_thresholds,
feature_thresholds_file=feature_thresholds_file,
)
]
# Determine the columns required for processing
required_columns = list(feature_thresholds.keys()) + metadata_columns

# Print outliers count and range for each feature
# Interpret the df as CytoDataFrame
df = CytoDataFrame(data=df)[required_columns]

# Filter DataFrame for outliers using identify_outliers
outliers_mask = identify_outliers(
# df was already restricted to the required columns above
df=df,
feature_thresholds=feature_thresholds,
feature_thresholds_file=feature_thresholds_file,
)
outliers_df = df[outliers_mask]

# Print outlier count and range for each feature
print(
"Number of outliers:",
outliers_df.shape[0],
Expand All @@ -206,15 +212,13 @@ def find_outliers(
print(f"{feature} Max:", outliers_df[feature].max())

# Include metadata columns in the output DataFrame
columns_to_include = list(feature_thresholds.keys()) + metadata_columns

result = outliers_df[columns_to_include]
result = outliers_df[required_columns]

# export the file if specified
# Export the file if specified
if export_path is not None:
result.export(file_path=export_path)

# Return outliers DataFrame with specified columns
# Return the resulting DataFrame
return result


Expand Down
132 changes: 0 additions & 132 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,8 @@

import pathlib

import numpy as np
import pandas as pd
import plotly.colors as pc
import pytest
import skimage
from PIL import Image

import cosmicqc


@pytest.fixture(name="cytotable_CFReT_data_df")
Expand All @@ -26,25 +20,6 @@ def fixture_cytotable_CFReT_df():
)


@pytest.fixture(name="cytotable_NF1_data_parquet_shrunken")
def fixture_cytotable_NF1_data_parquet_shrunken():
    """
    Provide the relative path (as a string) of the shrunken
    CytoTable NF1 parquet file used by tests.
    """
    # adjacent string literals are joined into one path string
    nf1_parquet_path = (
        "tests/data/cytotable/NF1_cellpainting_data_shrunken/"
        "Plate_2_with_image_data_shrunken.parquet"
    )
    return nf1_parquet_path


@pytest.fixture(name="cytotable_nuclear_speckles_data_parquet")
def fixture_cytotable_nuclear_speckle_data_parquet():
    """
    Provide the relative path (as a string) of the CytoTable
    nuclear speckles parquet file used by tests.
    """
    speckles_parquet_path = (
        "tests/data/cytotable/nuclear_speckles/test_slide1_converted.parquet"
    )
    return speckles_parquet_path


@pytest.fixture(name="basic_outlier_dataframe")
def fixture_basic_outlier_dataframe():
"""
Expand All @@ -66,110 +41,3 @@ def fixture_basic_outlier_csv(
)

return csv_path


@pytest.fixture(name="basic_outlier_csv_gz")
def fixture_basic_outlier_csv_gz(
    tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame
):
    """
    Write the basic example dataframe to a gzip-compressed csv
    under the temporary test directory and return the file path.
    """
    csv_gz_path = tmp_path / "example.csv.gz"

    # omit the index so only the data columns are written
    basic_outlier_dataframe.to_csv(csv_gz_path, index=False, compression="gzip")

    return csv_gz_path


@pytest.fixture(name="basic_outlier_tsv")
def fixture_basic_outlier_tsv(
    tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame
):
    """
    Write the basic example dataframe to a tab-separated file
    under the temporary test directory and return the file path.
    """
    tsv_path = tmp_path / "example.tsv"

    # a tab separator turns the csv writer's output into tsv
    basic_outlier_dataframe.to_csv(tsv_path, sep="\t", index=False)

    return tsv_path


@pytest.fixture(name="basic_outlier_parquet")
def fixture_basic_outlier_parquet(
    tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame
):
    """
    Write the basic example dataframe to a parquet file
    under the temporary test directory and return the file path.
    """
    parquet_path = tmp_path / "example.parquet"

    # omit the index so only the data columns are written
    basic_outlier_dataframe.to_parquet(parquet_path, index=False)

    return parquet_path


@pytest.fixture(name="generate_show_report_html_output")
def fixture_generate_show_report_html_output(cytotable_CFReT_data_df: pd.DataFrame):
    """
    Generate report output from the CFReT example data for use
    with other tests and return the report path.
    """

    # label outliers, requesting the threshold scores as well
    labeled_df = cosmicqc.analyze.label_outliers(
        df=cytotable_CFReT_data_df,
        include_threshold_scores=True,
    )

    # report location, relative to the directory holding this file
    report_path = pathlib.Path(__file__).parent.joinpath(
        "data", "coSMicQC", "show_report", "cosmicqc_example_report.html"
    )

    # show the report using the first two Dark24 colors
    # (auto_open=False presumably keeps it from launching a viewer)
    labeled_df.show_report(
        report_path=report_path,
        color_palette=pc.qualitative.Dark24[:2],
        auto_open=False,
    )

    return report_path


@pytest.fixture
def fixture_dark_image():
    """
    Return a 50x50 three-channel PIL image whose pixels are all zero.
    """
    # every channel value is 0, i.e. a fully dark image
    dark_pixels = np.zeros(shape=(50, 50, 3), dtype=np.uint8)
    dark_image = Image.fromarray(dark_pixels)
    return dark_image


@pytest.fixture
def fixture_mid_brightness_image():
    """
    Return a 50x50 three-channel PIL image filled with the value 128.
    """
    # 128 sits in the middle of the uint8 range (mid gray)
    mid_gray_pixels = np.full(shape=(50, 50, 3), fill_value=128, dtype=np.uint8)
    mid_gray_image = Image.fromarray(mid_gray_pixels)
    return mid_gray_image


@pytest.fixture
def fixture_bright_image():
    """
    Return a 50x50 three-channel PIL image filled with the value 255.
    """
    # 255 is the uint8 maximum, i.e. a fully bright image
    bright_pixels = np.full(shape=(50, 50, 3), fill_value=255, dtype=np.uint8)
    bright_image = Image.fromarray(bright_pixels)
    return bright_image


@pytest.fixture
def fixture_nuclear_speckle_example_image():
    """
    Return an RGBA PIL image built from the example nuclear speckle tiff.
    """
    # read the example tiff into an array
    tiff_pixels = skimage.io.imread(
        "tests/data/cytotable/nuclear_speckles/images/plate1/slide1_A1_M10_CH0_Z09_illumcorrect.tiff"
    )

    # divide by 256 and cast to uint8 before handing the array to PIL
    scaled_pixels = (tiff_pixels / 256).astype(np.uint8)

    return Image.fromarray(scaled_pixels).convert("RGBA")