Commit df8586b

Merge pull request #19 from fraunhoferportugal/dev
Tabular docs and PyPI dependency fix
ivo-facoco authored Nov 27, 2024
2 parents 2e0c160 + 43d6519 commit df8586b
Showing 9 changed files with 141 additions and 86 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -225,6 +225,6 @@ If you publish work that uses pyMDMA, please cite pyMDMA as follows:
```

## Acknowledgments
This work was funded by AISym4Med project number 101095387, supported by the European Heath and Digital Executive Agency (HADEA), granting authority under the powers delegated by the Europeam Commision. More information on this project can be found [here](https://aisym4med.eu/).
This work was funded by AISym4Med project number 101095387, supported by the European Health and Digital Executive Agency (HADEA), granting authority under the powers delegated by the European Commission. More information on this project can be found [here](https://aisym4med.eu/).

This work was supported by European funds through the Recovery and Resilience Plan, project "Center for Responsible AI", project number C645008882-00000055. Learn more about this project [here](https://centerforresponsible.ai/).
1 change: 0 additions & 1 deletion docs/image/input_val.md
@@ -2,7 +2,6 @@

## Data-based
### Quality (No-reference)
::: pymdma.image.measures.input_val.DOM
::: pymdma.image.measures.input_val.Tenengrad
::: pymdma.image.measures.input_val.TenengradRelative
::: pymdma.image.measures.input_val.EME
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -43,7 +43,7 @@ gudhi = {version = ">=3.9.0, <=4.0.0"}
scikit-learn = {version = ">1.4.0"}

# Image dependencies
pydom = {git = "https://github.com/umang-singhal/pydom.git", rev = "2554af8d0", optional = true}
# pydom = {git = "https://github.com/umang-singhal/pydom.git", rev = "2554af8d0", optional = true}
torchvision = {version = ">=0.15.2, <0.19.0", optional = true}
torchmetrics = {version = ">=1.3.2, <1.4.0", extras = ["image"], optional = true}
pycocotools = {version = ">=2.0.8", optional = true}
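Context for this change: PyPI rejects published packages whose dependency metadata contains direct Git references, which is presumably the dependency fix named in the commit title — note that the pydom requirement is commented out rather than removed. A minimal sketch of how downstream code can treat pydom as a truly optional dependency (an assumption about usage, not code from this commit):

```python
# Guarded optional import: pydom is no longer declared in pyproject.toml,
# so it may or may not be present at runtime.
try:
    import dom as _dom  # module distributed by the pydom package
except ImportError:
    _dom = None  # DOM sharpness metric becomes unavailable

def dom_available() -> bool:
    """True if the manually installed pydom package can be imported."""
    return _dom is not None
```

Users who still want the DOM metric can install the package by hand, mirroring the commented-out requirement, e.g. `pip install "git+https://github.com/umang-singhal/pydom.git@2554af8d0"`.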
159 changes: 80 additions & 79 deletions src/pymdma/image/measures/input_val/data/no_reference.py
@@ -1,7 +1,7 @@
from typing import Literal, Tuple, Union

import cv2
import dom as _dom
# import dom as _dom
import numpy as np
import torch
from PIL import Image, ImageEnhance
@@ -17,84 +17,84 @@
# TODO review documentations and attributes


class DOM(Metric):
"""Computes DOM sharpness score for an image. It is effective in detecting
motion-blur, de-focused images or inherent properties of imaging system.
**Objective**: Sharpness
Parameters
----------
width : int, optional, default=2
Width of the edge filter.
sharpness_threshold : int, optional, default=2
Threshold for considering if a pixel is sharp or not.
edge_threshold : float, optional, default=0.0001
Threshold for edge.
**kwargs : dict, optional
Additional keyword arguments for compatibility.
References
----------
Kumar et al., Sharpness estimation for document and scene images (2012).
https://ieeexplore.ieee.org/document/6460868
Code was adapted from:
pydom, Sharpness Estimation for Document and Scene Images.
https://github.com/umang-singhal/pydom
Examples
--------
>>> dom = DOM()
>>> imgs = np.random.rand(20, 100, 100, 3) # (N, H, W, C)
>>> result: MetricResult = dom.compute(imgs)
"""

reference_type = ReferenceType.NONE
evaluation_level = EvaluationLevel.INSTANCE
metric_group = MetricGroup.QUALITY

higher_is_better: bool = True
min_value: float = 0.0
max_value: float = 1.0

def __init__(
self,
width: int = 2,
sharpness_threshold: int = 2,
edge_threshold: float = 0.0001,
**kwargs,
):
super().__init__(**kwargs)
self._dom = _dom.DOM()
self.width = width
self.sharpness_threshold = sharpness_threshold
self.edge_threshold = edge_threshold

def compute(
self,
imgs: np.ndarray,
**kwargs,
) -> MetricResult:
"""Computes DOM score for an image.
Parameters
----------
imgs : {(N, H, W, C) ndarray, (N, H, W) ndarray}
List of arrays representing RGB or grayscale image of shape (H, W, C) or (H, W), respectively.
Returns
-------
result: MetricResult
DOM score for each image.
"""
scores = [
self._dom.get_sharpness(img, self.width, self.sharpness_threshold, self.edge_threshold) for img in imgs
]

return DistributionResult(
instance_level={"dtype": OutputsTypes.ARRAY, "subtype": "float", "value": scores},
)
# class DOM(Metric):
# """Computes DOM sharpness score for an image. It is effective in detecting
# motion-blur, de-focused images or inherent properties of imaging system.

# **Objective**: Sharpness

# Parameters
# ----------
# width : int, optional, default=2
# Width of the edge filter.
# sharpness_threshold : int, optional, default=2
# Threshold for considering if a pixel is sharp or not.
# edge_threshold : float, optional, default=0.0001
# Threshold for edge.
# **kwargs : dict, optional
# Additional keyword arguments for compatibility.

# References
# ----------
# Kumar et al., Sharpness estimation for document and scene images (2012).
# https://ieeexplore.ieee.org/document/6460868

# Code was adapted from:
# pydom, Sharpness Estimation for Document and Scene Images.
# https://github.com/umang-singhal/pydom

# Examples
# --------
# >>> dom = DOM()
# >>> imgs = np.random.rand(20, 100, 100, 3) # (N, H, W, C)
# >>> result: MetricResult = dom.compute(imgs)
# """

# reference_type = ReferenceType.NONE
# evaluation_level = EvaluationLevel.INSTANCE
# metric_group = MetricGroup.QUALITY

# higher_is_better: bool = True
# min_value: float = 0.0
# max_value: float = 1.0

# def __init__(
# self,
# width: int = 2,
# sharpness_threshold: int = 2,
# edge_threshold: float = 0.0001,
# **kwargs,
# ):
# super().__init__(**kwargs)
# self._dom = _dom.DOM()
# self.width = width
# self.sharpness_threshold = sharpness_threshold
# self.edge_threshold = edge_threshold

# def compute(
# self,
# imgs: np.ndarray,
# **kwargs,
# ) -> MetricResult:
# """Computes DOM score for an image.

# Parameters
# ----------
# imgs : {(N, H, W, C) ndarray, (N, H, W) ndarray}
# List of arrays representing RGB or grayscale image of shape (H, W, C) or (H, W), respectively.

# Returns
# -------
# result: MetricResult
# DOM score for each image.
# """
# scores = [
# self._dom.get_sharpness(img, self.width, self.sharpness_threshold, self.edge_threshold) for img in imgs
# ]

# return DistributionResult(
# instance_level={"dtype": OutputsTypes.ARRAY, "subtype": "float", "value": scores},
# )


class Tenengrad(Metric):
@@ -380,6 +380,7 @@ def compute(
# TODO documentation
class ExposureBrightness(Metric):
"""Computes Exposure and Brightness level Metric.
Values higher than 1 indicate overexposure, while values closer to 0 indicate underexposure.
**Objective**: Exposure and Brightness
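Earlier in this file's diff, the DOM class is commented out along with its `import dom as _dom` line, consistent with the pyproject.toml change. For reference, a minimal usage sketch of the underlying library, assuming pydom has been installed manually and that `dom.DOM.get_sharpness` keeps the positional signature used in the removed wrapper:

```python
import numpy as np
import dom as _dom  # from the manually installed pydom package

estimator = _dom.DOM()
imgs = np.random.rand(4, 100, 100, 3)  # (N, H, W, C), as in the old docstring
# Arguments mirror the removed wrapper's defaults:
# width=2, sharpness_threshold=2, edge_threshold=0.0001
scores = [estimator.get_sharpness(img, 2, 2, 0.0001) for img in imgs]
print(scores)  # one sharpness score per image; higher means sharper
```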
5 changes: 5 additions & 0 deletions src/pymdma/tabular/measures/input_val/data/privacy.py
@@ -25,6 +25,11 @@ class KAnonymityScore(Metric):
**kwargs : dict
Additional keyword arguments passed to the parent class.
References
----------
Díaz and García, A python library to check the level of anonymity of a dataset. (2022).
http://dx.doi.org/10.1038/s41597-022-01894-2
Returns
-------
MetricResult
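Background for the reference added to KAnonymityScore: a table is k-anonymous when every combination of quasi-identifier values occurs in at least k rows. A minimal sketch of that computation (`k_anonymity` is a hypothetical helper, not pymdma's implementation):

```python
import pandas as pd

def k_anonymity(df: pd.DataFrame, quasi_identifiers: list[str]) -> int:
    """Smallest equivalence-class size across the quasi-identifier columns."""
    return int(df.groupby(quasi_identifiers).size().min())

toy = pd.DataFrame({
    "age_band": ["20-30", "20-30", "30-40", "30-40"],
    "zip3":     ["150",   "150",   "150",   "150"],
})
print(k_anonymity(toy, ["age_band", "zip3"]))  # 2 -> every group has >= 2 rows
```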
25 changes: 25 additions & 0 deletions src/pymdma/tabular/measures/input_val/data/quality.py
@@ -34,6 +34,11 @@ class CorrelationScore(Metric):
**kwargs : dict
Additional keyword arguments for compatibility or future use.
References
----------
Shrestha, Detecting multicollinearity in regression analysis (2020).
http://pubs.sciepub.com/ajams/8/2/1
Returns
-------
MetricResult
@@ -148,6 +153,11 @@ class UniquenessScore(Metric):
**kwargs : dict
Additional keyword arguments for compatibility or future use.
References
----------
Sukhobok, Tabular data anomaly patterns (2017).
https://ieeexplore.ieee.org/document/8316296
Returns
-------
MetricResult
@@ -390,6 +400,11 @@ class OutlierScore(Metric):
**kwargs : dict
Additional keyword arguments passed to the parent class.
References
----------
Iglewicz, B. and Hoaglin, D. (1993) The ASQC Basic References in Quality Control: Statistical Techniques.
In: Mykytka, E.F., Eds., How to Detect and Handle Outliers, ASQC Quality Press, Milwaukee, Vol. 16
Returns
-------
MetricResult
@@ -516,6 +531,11 @@ class MissingScore(Metric):
**kwargs : dict
Additional keyword arguments passed to the parent class.
References
----------
Taleb et al., Big data quality: A quality dimensions evaluation (2016).
https://ieeexplore.ieee.org/document/7816918
Returns
-------
MetricResult
@@ -691,6 +711,11 @@ class VIFactorScore(Metric):
**kwargs : dict
Additional keyword arguments passed to the parent class.
References
----------
Marcoulides and Raykov, Evaluation of variance inflation factors in regression models using latent variable modeling methods (2019).
https://pmc.ncbi.nlm.nih.gov/articles/PMC6713981/
Returns
-------
MetricResult
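The Iglewicz and Hoaglin reference added to OutlierScore describes the modified Z-score, which replaces the mean and standard deviation with the median and the median absolute deviation (MAD); values with |M| > 3.5 are conventionally flagged. A minimal sketch of that test (an assumption about the method behind the metric, not pymdma's code):

```python
import numpy as np

def modified_z_scores(x: np.ndarray) -> np.ndarray:
    """Iglewicz-Hoaglin modified Z-score: 0.6745 * (x - median) / MAD."""
    med = np.median(x)
    mad = np.median(np.abs(x - med))
    return 0.6745 * (x - med) / mad

x = np.array([9.8, 10.1, 10.0, 9.9, 30.0])
print(np.abs(modified_z_scores(x)) > 3.5)  # only 30.0 is flagged as an outlier
```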
21 changes: 18 additions & 3 deletions src/pymdma/tabular/measures/synthesis_val/data/similarity.py
@@ -18,7 +18,7 @@ class StatisticalSimScore(Metric):
This metric assesses how closely the statistical properties of the synthetic dataset
resemble those of the real dataset, providing a fidelity measure for synthetic data generation.
**Objective**: Similarity
**Objective**: Fidelity
Parameters
----------
@@ -28,6 +28,11 @@ class StatisticalSimScore(Metric):
**kwargs : dict
Additional keyword arguments passed to the parent class.
References
----------
Yang et al., Structured evaluation of synthetic tabular data (2024).
https://arxiv.org/abs/2403.10424
Returns
-------
MetricResult
@@ -204,7 +209,7 @@ class StatisiticalDivergenceScore(Metric):
"""Computes a statistical divergence score for each column, specifically
the Jensen-Shannon (JS) and Kullback-Leibler (KL) divergence scores.
**Objective**: Similarity
**Objective**: Fidelity
Parameters
----------
@@ -216,6 +221,11 @@ class StatisiticalDivergenceScore(Metric):
**kwargs : dict
Additional keyword arguments passed to the parent class.
References
----------
Fonseca and Bacao, Tabular and latent space synthetic data generation: a literature review (2023).
https://doi.org/10.1186/s40537-023-00792-7
Returns
-------
MetricResult
@@ -400,7 +410,7 @@ class CoherenceScore(Metric):
target and synthetic datasets. A higher coherence score indicates better
fidelity between the datasets in terms of their correlation structures.
**Objective**: Similarity
**Objective**: Fidelity
Parameters
----------
@@ -413,6 +423,11 @@ class CoherenceScore(Metric):
**kwargs : dict
Additional keyword arguments passed to the parent class.
References
----------
Yang et al., Structured evaluation of synthetic tabular data (2024).
https://arxiv.org/abs/2403.10424
Returns
-------
MetricResult
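StatisiticalDivergenceScore in this diff is described as computing per-column Jensen-Shannon (JS) and Kullback-Leibler (KL) divergences; JS is symmetric and bounded, unlike raw KL. A minimal per-column sketch over histogram estimates (an assumption about the approach; `js_divergence_per_column` is a hypothetical helper, not pymdma's API):

```python
import numpy as np
from scipy.spatial.distance import jensenshannon

def js_divergence_per_column(real: np.ndarray, synth: np.ndarray, bins: int = 20) -> list[float]:
    """JS divergence between real and synthetic marginals, column by column."""
    scores = []
    for j in range(real.shape[1]):
        lo = min(real[:, j].min(), synth[:, j].min())
        hi = max(real[:, j].max(), synth[:, j].max())
        p, _ = np.histogram(real[:, j], bins=bins, range=(lo, hi), density=True)
        q, _ = np.histogram(synth[:, j], bins=bins, range=(lo, hi), density=True)
        scores.append(jensenshannon(p, q) ** 2)  # squared JS distance = divergence
    return scores

rng = np.random.default_rng(0)
real = rng.normal(size=(500, 3))
synth = rng.normal(loc=0.1, size=(500, 3))
print(js_divergence_per_column(real, synth))  # near 0 for well-matched columns
```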
10 changes: 10 additions & 0 deletions src/pymdma/tabular/measures/synthesis_val/feature/privacy.py
@@ -22,6 +22,11 @@ class NNDRPrivacy(FeatureMetric):
**kwargs : dict
Additional keyword arguments passed to the parent class.
References
----------
Liu et al., Scaling while privacy preserving: A comprehensive synthetic tabular data generation and evaluation in learning analytics (2024).
https://doi.org/10.1145/3636555.3636921
Returns
-------
MetricResult
@@ -123,6 +128,11 @@ class DCRPrivacy(FeatureMetric):
**kwargs : dict
Additional keyword arguments passed to the parent class.
References
----------
Liu et al., Scaling while privacy preserving: A comprehensive synthetic tabular data generation and evaluation in learning analytics (2024).
https://doi.org/10.1145/3636555.3636921
Returns
-------
MetricResult
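For the two privacy metrics above: distance to closest record (DCR) measures how near each synthetic row lies to its nearest real row, and the nearest-neighbour distance ratio (NNDR) divides that distance by the distance to the second-nearest real row; values near zero suggest memorized training records. A minimal sketch under those standard definitions (not pymdma's implementation):

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
real = rng.normal(size=(200, 5))
synth = rng.normal(size=(100, 5))

nn = NearestNeighbors(n_neighbors=2).fit(real)
dists, _ = nn.kneighbors(synth)   # (100, 2): nearest and second-nearest distances
dcr = dists[:, 0]                 # distance to closest real record
nndr = dists[:, 0] / dists[:, 1]  # ratio in [0, 1]; ~0 flags near-copies
print(dcr.mean(), nndr.mean())
```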
2 changes: 1 addition & 1 deletion tests/test_ts_import.py
@@ -539,7 +539,7 @@ def test_distribution_shift(metric_name, sample_distribution, expected_upper, si
(synth_distance_metrics.CosineSimilarity, 0.8370494332671239),
(synth_shared_metrics.PrecisionRecallDistribution, (0.6881853042325229, 0.6920392785323591)),
(synth_shared_metrics.FrechetDistance, 0.5000000060902672),
(synth_shared_metrics.MultiScaleIntrinsicDistance, 24.476226229017197),
# (synth_shared_metrics.MultiScaleIntrinsicDistance, 24.476226229017197),
(synth_shared_metrics.Authenticity, 0.5),
(synth_shared_metrics.ImprovedPrecision, 1.0),
(synth_shared_metrics.ImprovedRecall, 0.8),
