Merge pull request #21 from usnistgov/develop

Develop
usnistgov · Jul 11, 2021 · 257e78b · 257e78b
2 parents 147d64a + cf5d459
commit 257e78b
Show file tree

Hide file tree

Showing 13 changed files with 124 additions and 143 deletions.
diff --git a/README.md b/README.md
@@ -31,25 +31,37 @@ python setup.py develop
 Examples
 ---------
 Users can keep their structure files in POSCAR, .cif, or .xyz files in a directory. In the examples below we will use POSCAR format files. In the same directory, there should be an `id_prop.csv` file.
-In this id_prop.csv, the filenames, and correponding target values are kept in comma separated values (csv) format.
-Here is an example of training OptB88vdw bandgaps of 50 materials from JARVIS-DFT. The example is created using the script provided in the script folder.
-Users can modify the script more than 50 data, or make their own dataset in this format. The dataset in split in 80:10:10 as training-validation-test set.
-With the configuration parameters given in config_example_regrssion.json, the model is trained.
+
+In `id_prop.csv`, the filenames and corresponding target values are kept in comma-separated values (csv) format.
+
+Here is an example of training OptB88vdw bandgaps of 50 materials from the JARVIS-DFT database. The example is created using the examples/sample_data/scripts/generate_sample_data_reg.py script. Users can modify the script to use more than 50 data points, or make their own dataset in this format.
+
+The dataset is split 80:10:10 into training, validation, and test sets (controlled by `train_ratio`, `val_ratio`, `test_ratio`). To change the split proportions and other parameters, edit the `config_example.json` file. If users want to train on certain sets and validate/test on another dataset, set `n_train`, `n_val`, `n_test` manually in `config_example.json` and also set `keep_data_order` to True there so that random shuffling is disabled.
+
+A brief help guide can be obtained as:
 
 ```
-python alignn/scripts/train_folder.py --root_dir "alignn/examples/sample_data" --config "alignn/examples/sample_data/config_example_regrssion.json"
+python alignn/scripts/train_folder.py -h
 ```
+
+Now, the model can be trained with:
+
+```
+python alignn/scripts/train_folder.py --root_dir "alignn/examples/sample_data" --config "alignn/examples/sample_data/config_example.json"
+```
+
 While the above example is for regression, the following example shows a classification task for metal/non-metal based on the above bandgap values. We transform the dataset
-into 1 or 0 based on a threshold of 0.01 eV (controlled by the parameter, 'classification_threshold') and train a similar classification model.
+into 1 or 0 based on a threshold of 0.01 eV (controlled by the parameter, `classification_threshold`) and train a similar classification model. Currently, the script allows binary classification tasks only.
 ```
-python alignn/scripts/train_folder.py --root_dir "alignn/examples/sample_data" --config "alignn/examples/sample_data/config_example_classification.json"
+python alignn/scripts/train_folder.py --root_dir "alignn/examples/sample_data" --classification_threshold 0.01 --config "alignn/examples/sample_data/config_example.json"
 ```
+
+
 While the above regression example was for single-output values, we can train multi-output regression models as well.
-An example is given below for training formation energy per atom, bandgap and total energy per atom simulataneously. The script to generate the example data is provided in the script folder of the sample_data_multi_prop. 
-Another example of training electron and phonon density of states is provided also.
+An example is given below for training formation energy per atom, bandgap and total energy per atom simultaneously. The script to generate the example data is provided in the script folder of the sample_data_multi_prop. Another example of training electron and phonon density of states is also provided.
 ```
-python alignn/scripts/train_folder_multi_prop.py --root_dir "alignn/examples/sample_data_multi_prop" --config "alignn/examples/sample_data/config_example_regrssion.json"
+python alignn/scripts/train_folder.py --root_dir "alignn/examples/sample_data_multi_prop" --config "alignn/examples/sample_data/config_example.json"
 ```
 
-You can also try multiple example scripts to run multiple dataset training. Look into the 'scripts' folder. 
-These scripts automatically download datasets from jarvis.db.fighshare module in JARVIS-Tools and train several models. Make sure you specify your specific queuing system details in the scripts. 
+Users can also try the example scripts that train on multiple datasets (such as JARVIS-DFT, Materials Project, QM9_JCTC etc.). Look into the 'alignn/scripts' folder. This is done primarily to make the training more automated, rather than requiring users to create folders, csv files, etc. by hand.  
+These scripts automatically download datasets using the `jarvis.db.figshare` module in the `jarvis-tools` package and train several models. Make sure you specify your specific queuing system details in the scripts.
diff --git a/alignn/__init__.py b/alignn/__init__.py
@@ -1,2 +1,2 @@
 """Version number."""
-__version__ = "2021.07.05"
+__version__ = "2021.07.10"
diff --git a/alignn/config.py b/alignn/config.py
@@ -104,6 +104,13 @@
     "bandgap",
     "energy_total",
     "net_magmom",
+    "b3lyp_homo",
+    "b3lyp_lumo",
+    "b3lyp_gap",
+    "b3lyp_scharber_pce",
+    "b3lyp_scharber_voc",
+    "b3lyp_scharber_jsc",
+    "log_kd_ki",
 ]
 
 
@@ -127,6 +134,8 @@ class TrainingConfig(BaseSettings):
         "edos_up",
         "edos_pdos",
         "qmof",
+        "hpov",
+        "pdbbind",
     ] = "dft_3d"
     target: TARGET_ENUM = "formation_energy_peratom"
     atom_features: Literal["basic", "atomic_number", "cfid", "cgcnn"] = "cgcnn"
@@ -167,6 +176,7 @@ class TrainingConfig(BaseSettings):
     num_workers: int = 4
     cutoff: float = 8.0
     max_neighbors: int = 12
+    keep_data_order: bool = False
 
     # model configuration
     model: Union[

diff --git a/alignn/data.py b/alignn/data.py
@@ -124,6 +124,7 @@ def get_id_train_val_test(
     n_train=None,
     n_test=None,
     n_val=None,
+    keep_data_order=False,
 ):
     """Get train, val, test IDs."""
     if (
@@ -145,9 +146,9 @@ def get_id_train_val_test(
     if n_val is None:
         n_val = int(val_ratio * total_size)
     ids = list(np.arange(total_size))
-
-    random.seed(split_seed)
-    random.shuffle(ids)
+    if not keep_data_order:
+        random.seed(split_seed)
+        random.shuffle(ids)
     if n_train + n_val + n_test > total_size:
         raise ValueError(
             "Check total number of samples.",
@@ -242,6 +243,7 @@ def get_train_val_loaders(
     classification_threshold: Optional[float] = None,
     target_multiplication_factor: Optional[float] = None,
     standard_scalar_and_pca=False,
+    keep_data_order=False,
     output_features=1,
 ):
     """Help function to set up Jarvis train and val dataloaders."""
@@ -358,6 +360,7 @@ def get_train_val_loaders(
             n_train=n_train,
             n_test=n_test,
             n_val=n_val,
+            keep_data_order=keep_data_order,
         )
         ids_train_val_test = {}
         ids_train_val_test["id_train"] = [dat[i][id_tag] for i in id_train]

diff --git a/...sample_data/config_example_regrssion.json → .../examples/sample_data/config_example.json b/...sample_data/config_example_regrssion.json → .../examples/sample_data/config_example.json
diff --git a/alignn/examples/sample_data/config_example_classification.json b/alignn/examples/sample_data/config_example_classification.json
diff --git a/.../sample_data_multi_prop/id_multi_prop.csv → ...amples/sample_data_multi_prop/id_prop.csv b/.../sample_data_multi_prop/id_multi_prop.csv → ...amples/sample_data_multi_prop/id_prop.csv
diff --git a/alignn/scripts/train_folder.py b/alignn/scripts/train_folder.py
@@ -18,17 +18,33 @@
 parser.add_argument(
     "--root_dir",
     default="./",
-    help="Folder with id_props.csv, poscars and config*.json",
+    help="Folder with id_props.csv, poscars",
 )
 parser.add_argument(
     "--config_name",
-    default="config_example_regrssion.json",
+    default="alignn/examples/sample_data/config_example.json",
     help="Name of the config file",
 )
 
+parser.add_argument(
+    "--keep_data_order",
+    default=False,
+    help="Whether to randomly shuffle samples, True/False",
+)
+
+parser.add_argument(
+    "--classification_threshold",
+    default=None,
+    help="Floating point threshold for converting into 0/1 class"
+    + ", use only for classification tasks",
+)
+
 
 def train_for_folder(
-    root_dir="examples/sample_data", config_name="config.json"
+    root_dir="examples/sample_data",
+    config_name="config.json",
+    keep_data_order=False,
+    classification_threshold=None,
 ):
     """Train for a folder."""
     # config_dat=os.path.join(root_dir,config_name)
@@ -40,20 +56,49 @@ def train_for_folder(
         except Exception as exp:
             print("Check", exp)
 
+    config.keep_data_order = keep_data_order
+    if classification_threshold is not None:
+        config.classification_threshold = float(classification_threshold)
     with open(id_prop_dat, "r") as f:
         reader = csv.reader(f)
         data = [row for row in reader]
 
     dataset = []
+    n_outputs = []
+    multioutput = False
+    lists_length_equal = True
     for i in data:
         info = {}
         poscar_name = i[0]
         poscar_path = os.path.join(root_dir, poscar_name)
         atoms = Atoms.from_poscar(poscar_path)
         info["atoms"] = atoms.to_dict()
         info["jid"] = poscar_name
-        info["target"] = float(i[1])
+
+        tmp = [float(j) for j in i[1:]]  # float(i[1])
+        if len(tmp) == 1:
+            tmp = tmp[0]
+        else:
+            multioutput = True
+        info["target"] = tmp  # float(i[1])
+        n_outputs.append(info["target"])
         dataset.append(info)
+    if multioutput:
+        lists_length_equal = False not in [
+            len(i) == len(n_outputs[0]) for i in n_outputs
+        ]
+
+    # print ('n_outputs',n_outputs[0])
+    if multioutput and classification_threshold is not None:
+        raise ValueError("Classification for multi-output not implemented.")
+    if multioutput and lists_length_equal:
+        config.model.output_features = len(n_outputs[0])
+    else:
+        # TODO: Pad with NaN
+        if not lists_length_equal:
+            raise ValueError("Make sure the outputs are of same size.")
+        else:
+            config.model.output_features = 1
     (
         train_loader,
         val_loader,
@@ -98,4 +143,9 @@ def train_for_folder(
 
 if __name__ == "__main__":
     args = parser.parse_args(sys.argv[1:])
-    train_for_folder(root_dir=args.root_dir, config_name=args.config_name)
+    train_for_folder(
+        root_dir=args.root_dir,
+        config_name=args.config_name,
+        keep_data_order=args.keep_data_order,
+        classification_threshold=args.classification_threshold,
+    )
diff --git a/alignn/scripts/train_folder_multi_prop.py b/alignn/scripts/train_folder_multi_prop.py
diff --git a/alignn/tests/test_prop.py b/alignn/tests/test_prop.py
@@ -3,9 +3,10 @@
 import matplotlib.pyplot as plt
 import numpy as np
 from alignn.train import train_dgl
+
 plt.switch_backend("agg")
 from sklearn.metrics import mean_absolute_error
-
+import os
 
 config = {
     "dataset": "dft_2d",
@@ -32,6 +33,15 @@
 }
 
 
+def test_runtime_training():
+    cmd1 = 'python alignn/scripts/train_folder.py --root_dir "alignn/examples/sample_data" --config "alignn/examples/sample_data/config_example.json"'
+    os.system(cmd1)
+    cmd2 = 'python alignn/scripts/train_folder.py --root_dir "alignn/examples/sample_data" --classification_threshold 0.01 --config "alignn/examples/sample_data/config_example.json"'
+    os.system(cmd2)
+    cmd3 = 'python alignn/scripts/train_folder.py --root_dir "alignn/examples/sample_data_multi_prop" --config "alignn/examples/sample_data/config_example.json"'
+    os.system(cmd3)
+
+
 def test_models():
     """Test CGCNN end to end training."""
     config["model"]["name"] = "dense_alignn"
@@ -192,4 +202,5 @@ def test_models():
     """
 
 
+# test_runtime_training()
 # test_models()