Merge pull request #75 from datamol-io/ruff

Improve docs on how to contribute and fix hanging test issues.
datamol-io · Aug 1, 2023 · 4390f9f · 4390f9f
2 parents 3335596 + b801bac
commit 4390f9f
Show file tree

Hide file tree

Showing 26 changed files with 151 additions and 65 deletions.
diff --git a/.github/workflows/code-check.yml b/.github/workflows/code-check.yml
@@ -10,7 +10,7 @@ on:
 
 jobs:
   python-format-black:
-    name: Python lint [black]
+    name: Python format [black]
     runs-on: ubuntu-latest
     steps:
       - name: Checkout the code
@@ -27,3 +27,22 @@ jobs:
 
       - name: Lint
         run: black --check .
+
+  python-lint-ruff:
+    name: Python lint [ruff]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout the code
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.9"
+
+      - name: Install ruff
+        run: |
+          pip install ruff
+
+      - name: Lint
+        run: ruff .
diff --git a/docs/developers/contribute.md b/docs/developers/contribute.md
@@ -10,14 +10,54 @@ The rest of this page details the development lifecycle of molfeat.
 
 ## Setup a dev environment
 
-First you'll need to fork and clone the repository. Once you have a local copy, install the dependencies.
-It is strongly recommended that you do so in a new conda environment.
+To contribute, you will first need to setup a dev environment. Follow the steps below:
 
-```bash
-mamba env create -n molfeat -f env.yml
-mamba activate molfeat
-pip install -e .
-```
+1. Fork the [repository](https://github.com/datamol-io/molfeat) by
+   clicking on the **[Fork](https://github.com/datamol-io/molfeat/fork)** button on the repository's page. This creates a copy of the code under your GitHub user account.
+
+2. Clone your fork to your local disk, and add the base repository as a remote:
+
+   ```bash
+   git clone [email protected]:<your Github handle>/molfeat.git
+   cd molfeat
+   git remote add upstream https://github.com/datamol-io/molfeat.git
+   ```
+
+3. Create a new branch to hold your development changes:
+
+   ```bash
+   git checkout -b useful-branch-name
+   ```
+
+   **Do not** work on the `main` branch!
+
+4. Once you have a local copy, setup a development environment and install the dependencies. 
+
+   It is strongly recommended that you do so in a new **conda environment**. 
+
+    ```bash
+    mamba env create -n molfeat -f env.yml
+    conda activate molfeat
+    pip install -e . --no-deps
+    ```
+
+    If you absolutely cannot use `conda/mamba`, please use the following pip install command in your virtual environment:
+
+    ```bash
+    pip install -e ".[dev]"
+    ```
+
+   If molfeat was already installed in the virtual environment, remove it with `pip uninstall molfeat` first, before reinstalling it in editable mode with the `-e` flag.
+
+5. Make your changes and modifications on your branch.
+
+   As you work on your code, you should make sure the test suite passes. Run the tests impacted by your changes like this:
+
+   ```bash
+   pytest tests/<TEST_TO_RUN>.py
+   ```
+
+6. Commit your code, push it to your forked repository and open a pull request with a detailed description of your changes and why they are valuable. 
 
 ## Continuous Integration
 
@@ -27,7 +67,7 @@ molfeat uses Github Actions to:
 - **Check code formating** the code: `black`.
 - **Documentation**: build and deploy the documentation on `main` and for every new git tag.
 
-## Run tests
+## Run tests globally
 
 ```bash
 pytest

diff --git a/env.yml b/env.yml
@@ -57,6 +57,7 @@ dependencies:
   - pytest-timeout
   - pytest-xdist
   - black >=22
+  - ruff
   - jupyterlab
   - nbconvert
 

diff --git a/molfeat/calc/base.py b/molfeat/calc/base.py
@@ -6,7 +6,6 @@
 import joblib
 import yaml
 import fsspec
-import importlib
 from loguru import logger
 from molfeat._version import __version__ as MOLFEAT_VERSION
 

diff --git a/molfeat/calc/bond.py b/molfeat/calc/bond.py
@@ -19,7 +19,6 @@
 from molfeat.calc._atom_bond_features import bond_direction_one_hot
 from molfeat.calc._atom_bond_features import bond_stereo_one_hot
 from molfeat.calc._atom_bond_features import pairwise_ring_membership
-from molfeat.calc._atom_bond_features import pairwise_3D_dist
 from molfeat.calc._atom_bond_features import pairwise_2D_dist
 from molfeat.calc._atom_bond_features import pairwise_bond_indicator
 from molfeat.calc._atom_bond_features import pairwise_dist_indicator

diff --git a/molfeat/calc/descriptors.py b/molfeat/calc/descriptors.py
@@ -55,6 +55,13 @@ class RDKitDescriptors2D(SerializableCalculator):
     r"""
     Compute a list of available  rdkit 2D descriptors for a molecule.
     The descriptor calculator does not mask errors in featurization and will propagate them
+
+    !!! note
+        Due to recent RDKit changes, this calculator could hang for a while for large molecules (>900).
+        The main culprit is the `Ipc` descriptor which requires some computation on the full adjacency matrix.
+        You may also consider using `ignore_descrs=['AvgIpc', 'Ipc']` as a workaround when you have large molecules.
+        Alternatively, You may wish to use an alternative featurizer
+
     """
 
     DESCRIPTORS_FN = {name: fn for (name, fn) in Descriptors.descList}
@@ -66,6 +73,7 @@ def __init__(
         descrs: List = None,
         avg_ipc: Optional[bool] = True,
         do_not_standardize: Optional[bool] = False,
+        ignore_descrs: Optional[List] = None,
         **kwargs,
     ):
         """RDKit descriptor computation
@@ -75,6 +83,8 @@ def __init__(
             augment: Whether to augment the descriptors with some additional custom features
             descrs: Subset of available features to consider if not None
             avg_ipc: Whether to average IPC values or to use rdkit original
+            ignore_descrs: optional list of descriptors to ignore. You can get the full list of default descriptors
+                by calling `calculator.columns`.
             do_not_standardize: Whether to force standardization of molecule before computation of the descriptor.
                 Set to True if you want molfeat<=0.5.3 behaviour
         """
@@ -84,6 +94,7 @@ def __init__(
         self.avg_ipc = avg_ipc
         self.do_not_standardize = do_not_standardize
         all_features = [d[0] for d in Descriptors.descList]
+        self.ignore_descrs = ignore_descrs or []
         if self.augment:
             all_features += [
                 "NumAtomStereoCenters",
@@ -101,6 +112,8 @@ def __init__(
         else:
             self._columns = all_features
 
+        self._columns = [x for x in self._columns if x not in self.ignore_descrs]
+
     def __getstate__(self):
         """Serialize the class for pickling."""
         state = {}
@@ -266,15 +279,15 @@ def __call__(self, mol: Union[dm.Mol, str], conformer_id: Optional[int] = -1) ->
             if desc not in self.ignore_descrs:
                 try:
                     val = getattr(Descriptors3D.rdMolDescriptors, desc)(mol, confId=conformer_id)
-                except:
+                except Exception:
                     pass
                 desc_val.append(val)
         for i, desc in enumerate(self._vec_descr):
             val = [float("nan")] * self._vec_descr_length[i]
             if desc not in self.ignore_descrs:
                 try:
                     val = getattr(Descriptors3D.rdMolDescriptors, desc)(mol, confId=conformer_id)
-                except:
+                except Exception:
                     pass
                 desc_val.extend(val)
 

diff --git a/molfeat/calc/fingerprints.py b/molfeat/calc/fingerprints.py
@@ -221,7 +221,7 @@ def __init__(
         """
         self.method = method.lower()
         self.counting = counting or "-count" in self.method
-        if self.counting and not "-count" in self.method:
+        if self.counting and "-count" not in self.method:
             self.method = self.method + "-count"
         self.input_length = length
         if self.method not in FP_FUNCS:

diff --git a/molfeat/calc/tree.py b/molfeat/calc/tree.py
@@ -1,7 +1,5 @@
-from typing import Iterable
 from typing import List
 from typing import Union
-from typing import Tuple
 from typing import Optional
 from collections import defaultdict
 

diff --git a/molfeat/store/loader.py b/molfeat/store/loader.py
@@ -98,7 +98,7 @@ def _load_or_raise(
         try:
             modelcard = store.search(name=name)[0]
             artifact_dir = store.download(modelcard, download_path, **kwargs)
-        except Exception as e:
+        except Exception:
             mess = f"Can't retrieve model {name} from the store !"
             raise ModelStoreError(mess)
         return artifact_dir

diff --git a/molfeat/store/modelcard.py b/molfeat/store/modelcard.py
@@ -15,7 +15,7 @@ def get_model_init(card):
         import_statement = "from molfeat.trans import MoleculeTransformer"
         loader_statement = f"MoleculeTransformer(featurizer='{card.name}', dtype=float)"
     elif card.group in ["rdkit", "fp", "shape"]:
-        import_statement = f"from molfeat.trans.fp import FPVecTransformer"
+        import_statement = "from molfeat.trans.fp import FPVecTransformer"
         loader_statement = f"FPVecTransformer(kind='{card.name}', dtype=float)"
     elif card.group == "dgllife":
         import_statement = "from molfeat.trans.pretrained import PretrainedDGLTransformer"
@@ -25,7 +25,7 @@ def get_model_init(card):
         loader_statement = f"GraphormerTransformer(kind='{card.name}', dtype=float)"
     elif card.group == "fcd":
         import_statement = "from molfeat.trans.pretrained import FCDTransformer"
-        loader_statement = f"FCDTransformer()"
+        loader_statement = "FCDTransformer()"
     elif card.group == "pharmacophore":
         name = card.name.split("-")[-1]
         if card.require_3D:

diff --git a/molfeat/store/modelstore.py b/molfeat/store/modelstore.py
@@ -123,7 +123,7 @@ def register(
         if save_fn is None:
             if not isinstance(model, (pathlib.Path, os.PathLike)):
                 local_model_path = tempfile.NamedTemporaryFile(delete=False)
-                with local_model_path as f:
+                with local_model_path:
                     joblib.dump(model, local_model_path)
                 model_path = local_model_path.name
             # Upload the artifact to the bucket
@@ -152,10 +152,10 @@ def _filelock(self, lock_name: str):
         lock_path = dm.fs.join(
             str(platformdirs.user_cache_dir("molfeat")), "_lock_files", lock_name
         )
-        mapper = dm.fs.get_mapper(lock_path)
+        dm.fs.get_mapper(lock_path)
         # ensure file is created
         # out = mapper.fs.touch(lock_path) # does not work  -_-
-        with fsspec.open(lock_path, "w", auto_mkdir=True) as f:
+        with fsspec.open(lock_path, "w", auto_mkdir=True):
             pass
 
         return filelock.FileLock(lock_path)

diff --git a/molfeat/trans/base.py b/molfeat/trans/base.py
@@ -36,8 +36,6 @@
 from molfeat.utils.parsing import get_input_args
 from molfeat.utils.parsing import import_from_string
 from molfeat.utils.state import map_dtype
-from molfeat.utils.state import ATOM_FEATURIZER_MAPPING
-from molfeat.utils.state import BOND_FEATURIZER_MAPPING
 from molfeat.utils.state import ATOM_FEATURIZER_MAPPING_REVERSE
 from molfeat.utils.state import BOND_FEATURIZER_MAPPING_REVERSE
 

diff --git a/molfeat/trans/graph/adj.py b/molfeat/trans/graph/adj.py
@@ -19,7 +19,6 @@
 from molfeat.utils.commons import requires_conformer
 from molfeat.utils.commons import pack_graph
 from molfeat.calc.atom import AtomCalculator
-from molfeat.calc.bond import BondCalculator
 from molfeat.calc.bond import EdgeMatCalculator
 
 if requires.check("dgl"):
@@ -103,7 +102,7 @@ def preprocess(self, inputs, labels=None):
                 mol = dm.to_mol(
                     m, add_hs=self.explicit_hydrogens, ordered=self.canonical_atom_order
                 )
-            except:
+            except Exception:
                 mol = None
             new_inputs.append(mol)
 
@@ -126,7 +125,7 @@ def atom_dim(self):
         if self._atom_dim is None:
             try:
                 self._atom_dim = len(self.atom_featurizer)
-            except:
+            except Exception:
                 _toy_mol = dm.to_mol("C")
                 out = self.atom_featurizer(_toy_mol)
                 self._atom_dim = sum([x.shape[-1] for x in out.values()])
@@ -145,7 +144,7 @@ def bond_dim(self):
         if self._bond_dim is None:
             try:
                 self._bond_dim = len(self.bond_featurizer)
-            except:
+            except Exception:
                 _toy_mol = dm.to_mol("CO")
                 out = self.bond_featurizer(_toy_mol)
                 self._bond_dim = sum([x.shape[-1] for x in out.values()])

diff --git a/molfeat/trans/pretrained/base.py b/molfeat/trans/pretrained/base.py
@@ -3,7 +3,6 @@
 from typing import Callable
 from typing import List
 from collections.abc import Iterable
-from functools import lru_cache
 
 import copy
 import datamol as dm
@@ -131,7 +130,7 @@ def preprocess(self, inputs: list, labels: Optional[list] = None):
         if self.precompute_cache not in [False, None]:
             try:
                 self.transform(inputs)
-            except:
+            except Exception:
                 pass
         return out
 

diff --git a/molfeat/trans/struct/esm.py b/molfeat/trans/struct/esm.py
@@ -57,7 +57,7 @@ def __init__(
         self._max_layers = int(max_layer_pattern.match(featurizer).group(1))
         if layers is None:
             self.repr_layers = [self._max_layers]
-        if any(l > self._max_layers for l in self.repr_layers):
+        if any(lay > self._max_layers for lay in self.repr_layers):
             raise ValueError(
                 "You are requesting more layers than available for this pretrained model"
             )

diff --git a/molfeat/utils/cache.py b/molfeat/utils/cache.py
@@ -260,7 +260,7 @@ def fetch(
         try:
             cacher = copy.deepcopy(self)
             n_jobs = self.n_jobs
-        except:  # noqa
+        except Exception:  # noqa
             # cannot parallelize process, ensure n_jobs is 0
             cacher = self
             n_jobs = 0
@@ -357,7 +357,7 @@ def clear(self, delete: bool = False):
                 for path in glob.glob(str(self.cache_file) + "*"):
                     try:
                         os.unlink(path)
-                    except:  # noqa
+                    except Exception:  # noqa
                         pass
         else:
             self._initialize_cache()

diff --git a/molfeat/utils/converters.py b/molfeat/utils/converters.py
@@ -45,7 +45,7 @@ def decode(self, inp: str):
             try:
                 decoded = self.converter.decode(inp)
                 return decoded.strip()
-            except:  # (deepsmiles.DecodeError, ValueError, AttributeError, IndexError):
+            except Exception:  # (deepsmiles.DecodeError, ValueError, AttributeError, IndexError):
                 return None
 
     def encode(self, smiles: str):
@@ -60,5 +60,5 @@ def encode(self, smiles: str):
             try:
                 encoded = self.converter.encode(smiles)
                 return encoded.strip()
-            except:
+            except Exception:
                 return None
diff --git a/molfeat/utils/datatype.py b/molfeat/utils/datatype.py
@@ -72,7 +72,7 @@ def to_tensor(x, gpu=False, dtype=None):
         try:
             if torch.is_tensor(x[0]):
                 x = torch.stack(x)
-        except:
+        except Exception:
             pass
         x = torch.as_tensor(x)
     if dtype is not None:
@@ -205,11 +205,11 @@ def is_null(obj):
     try:
         tmp = to_numpy(obj)
         array_nan = np.all(np.isnan(tmp))
-    except:
+    except Exception:
         pass
     try:
         all_none = all(x is None for x in obj)
-    except:
+    except Exception:
         pass
     return obj is None or all_none or array_nan