Add new models (Neural Networks, Ridge Regression, Gradient Boosting) + Ensemble method #54

Open · wants to merge 31 commits into base: main

Commits (31, all by tztsai)
- d72354d: update tests and requirement.txt (Sep 11, 2024)
- d6ea910: enlarge dataset by increasing Nc and expanding year dim (Sep 11, 2024)
- 60cfbdf: fix minor issues and add CHANGES.txt (Sep 12, 2024)
- 5a8790c: remove old test log files (Sep 12, 2024)
- c46cb68: add docstrings and performance check after ML evaluation (Sep 12, 2024)
- f27f764: Reduce Ncc (Sep 13, 2024)
- ac934f6: implement multithread parallelization (Sep 16, 2024)
- d1a934a: add more estimators (Sep 16, 2024)
- 7556550: update CHANGES.txt (Sep 17, 2024)
- fe9c508: add standard scaling (Sep 17, 2024)
- cd83184: fix sample weight argument (Sep 17, 2024)
- 2f64489: Updated CHANGES.txt and add model score logging (Sep 17, 2024)
- 62dd7a5: run benchmarking (Sep 17, 2024)
- 0d51ed0: Update NN args and added performance to CHANGES.txt (Sep 18, 2024)
- 4e69bd7: Add take_year_average in config (Sep 18, 2024)
- eee043d: Reformat CHANGES.txt into markdown (Sep 18, 2024)
- 7b09be8: Reformat benchmark table (Sep 18, 2024)
- 2fadec7: Pass config instead of resultpath and loocv to ML functions (Sep 18, 2024)
- d7bc5de: Add benchmark for training without BAT (Sep 18, 2024)
- e00b304: fix eval plotting (Sep 19, 2024)
- 095303b: tune hyperparams (Sep 20, 2024)
- 3d21149: add "best" alg selection (Sep 23, 2024)
- 002124f: fix bug (Sep 23, 2024)
- a37a23c: rerun benchmark (Sep 24, 2024)
- 0cedc5f: trying ridge and update CHANGES.md (Sep 24, 2024)
- 8d25f6c: update select_best_model (Sep 24, 2024)
- e40693b: update select_best_model (Sep 24, 2024)
- 852babd: improve select_best_model (Sep 24, 2024)
- c229433: fix MLacc_results index (Oct 15, 2024)
- 85be85f: fix issues with ipft and ivar in labels (Oct 16, 2024)
- 94d289b: replace np.arr with np.ma (Oct 17, 2024)
CHANGES.md (+50, -0, new file)

# Summary of Changes

- Converted the dataset format to xarray.Dataset (merged auxil into packdata, etc.)
- Reformatted documentation and added more comments
- Rewrote tests as pytest files
- Reformatted the config file as a Python file
- Used ruff for reformatting and linting
- Added pre-commit checks and a GitHub workflow for CI checks
- Added input features (std of some variables like Qair, Psurf, etc.)

Collaborator:
Does this improve performance? What are Qair, Psurf? Why did we take std of some variables before and not now? Are these now used in training or just stored in Packdata.nc?

Collaborator (Author):
The original code did not take std of these variables, and I added them.
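
For context, a minimal sketch of how such std features could be computed with xarray. The helper name `add_std_features` and the `month` dimension name are assumptions for illustration; the PR's actual preprocessing may differ:

```python
import xarray as xr

def add_std_features(packdata: xr.Dataset, variables=("Qair", "Psurf")) -> xr.Dataset:
    """Add the standard deviation over the monthly dimension as extra input features."""
    for name in variables:
        if name in packdata:
            # e.g. Qair -> Qair_std, capturing the temporal variability of the variable
            packdata[f"{name}_std"] = packdata[name].std("month")
    return packdata
```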

- Increase Nc values (by a factor of 2)

Collaborator:
I think this should be removed as it shouldn't be relied on for improving performance.
- Instead of averaging over the whole time span, take only monthly averages and keep each year separate to increase the dataset size
- Simplified readvar.py
- Added new ML algorithm options: XGBoost, RandomForest, MLP, Lasso, Stacking Ensemble
- Combined all ML evaluation results into a single CSV table
- Implemented multithreading to train the ML models for different target variables in parallel
- Added standard scaling to preprocess the data before ML training
- Updated README.md and CONTRIBUTING.md
- Added explanation of the varlist.json file

Collaborator:
Please add a bit more content to this file as outlined below.
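
A minimal sketch of the multithreaded per-target training with standard scaling described in the changes list above. The function names and the RandomForest choice are illustrative assumptions, not the PR's actual API; the seed mirrors the config's `random_seed = 1000`:

```python
from concurrent.futures import ThreadPoolExecutor

from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def train_one(X, y):
    # Scale features before fitting, as described in the changes above.
    model = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=1000))
    return model.fit(X, y)

def train_all(X, targets):
    # Train one model per target variable, each in its own thread.
    with ThreadPoolExecutor() as pool:
        futures = {name: pool.submit(train_one, X, y) for name, y in targets.items()}
        return {name: fut.result() for name, fut in futures.items()}
```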


## TODO
- For variables with poor performance, investigate feature importance, target-variable correlation, and sample size
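
One way the feature-importance item could be approached, as a hedged sketch using scikit-learn's permutation importance (this function and its arguments are illustrative, not part of this PR):

```python
from sklearn.inspection import permutation_importance

def rank_features(model, X_val, y_val, feature_names):
    # Rank input features by how much shuffling each one degrades R2
    # on held-out data; large drops indicate influential features.
    result = permutation_importance(
        model, X_val, y_val, scoring="r2", n_repeats=10, random_state=1000
    )
    order = result.importances_mean.argsort()[::-1]
    return [(feature_names[i], result.importances_mean[i]) for i in order]
```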

## Performance Benchmark

### Separated Years
| Algorithm | R2 | slope |
|-----------|--------------------|--------------------|
| bt | 0.6528848053359372 | 0.9542832780095561 |
| rf | **0.6530125681385772** | 0.9540960891785613 |
| gbm | 0.6512312928704228 | 0.950301142771462 |
| lasso | 0.3712954690899892 | **0.9877068027413212** |
| stack | 0.6525452816395577 | 0.9525996582374604 |

### Separated Years without BAT
| Algorithm | R2 | slope |
|-----------|--------------------|--------------------|
| bt | **0.9324186841654837** | 0.9485227574788934 |
| rf | 0.932251880129098 | 0.9485025443202926 |
| gbm | 0.9302603471413085 | 0.9438685125142494 |
| lasso | 0.8361626491858671 | **0.9557082517475748** |
| stack | 0.9314316345860048 | 0.9493522690938773 |

### Averaged Years
| Algorithm | R2 | slope |
|-----------|--------------------|--------------------|
| bt | 0.31926695871812644| 0.9591009540297856 |
| rf | 0.321636483649443 | 0.9590895662312189 |
| gbm | **0.328685618778433** | 0.9583401949919101 |
| lasso | 0.09302916905772492| **0.9930868709808638** |
| stack | 0.3225905817064779 | 0.9753542956777028 |
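
For reference, a plausible computation of the two columns above, assuming R2 is scikit-learn's coefficient of determination and slope is the least-squares slope of predicted vs. true values (the benchmark's exact definitions live in the evaluation code):

```python
import numpy as np
from sklearn.metrics import r2_score

def benchmark_metrics(y_true, y_pred):
    # R2: coefficient of determination; slope: least-squares slope of the
    # predicted-vs-true fit, where 1.0 indicates a well-calibrated model.
    slope, _intercept = np.polyfit(y_true, y_pred, 1)
    return r2_score(y_true, y_pred), slope
```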
DEF_Trunk/config.py (+17, -6)

Collaborator:
Can this file be presented in a better state to the user? i.e. a good set of defaults.

@@ -2,17 +2,28 @@

 logfile = "log.MLacc_Trunk"
 tasks = [
-    2,
+    # 2,
     4,
     5,
 ]  # 1=test clustering, 2=clustering, 3=compress forcing, 4=ML, 5=evaluation
 results_dir = "./EXE_DIR/"
 reference_dir = "/home/surface10/mrasolon/files_for_zenodo/reference/EXE_DIR/"
-start_from_scratch = True
+start_from_scratch = False
+take_year_average = False
+smote_bat = False
 kmeans_clusters = 4
 max_kmeans_clusters = 9
 random_seed = 1000
+algorithms = [
+    # "bt",
+    # "rf",
+    # "gbm",
+    # "nn",
+    # "ridge",
+    "best",
+]  # bt: BaggingTrees, rf: RandomForest, nn: MLPRegressor, gbm: XGBRegressor, lasso: Lasso, best: SelectBestModel
+leave_one_out_cv = False
-repro_test_task_1 = True
-repro_test_task_2 = True
-repro_test_task_3 = True
-repro_test_task_4 = True
+repro_test_task_1 = False
+repro_test_task_2 = False
+repro_test_task_3 = False
+repro_test_task_4 = False
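
The algorithm keys above map to estimators roughly as follows. This is a hypothetical sketch: the PR's actual factory and hyperparameters may differ, and `bt` (BaggingTrees) is taken here to mean scikit-learn's BaggingRegressor:

```python
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

# Hypothetical key-to-estimator mapping for the config's `algorithms` list.
ESTIMATORS = {
    "bt": BaggingRegressor,       # BaggingTrees
    "rf": RandomForestRegressor,  # RandomForest
    "gbm": XGBRegressor,          # gradient boosting
    "nn": MLPRegressor,           # neural network
    "lasso": Lasso,
    "ridge": Ridge,
}

def make_model(name, seed=1000):
    # "best" is not a single estimator: it trains the candidates and keeps
    # the top scorer (see the select_best_model commits in this PR).
    if name == "best":
        raise ValueError("'best' is resolved by model selection, not here")
    return ESTIMATORS[name](random_state=seed)
```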
DEF_Trunk/varlist.json (+1, -1)

Collaborator:
This needs reverting to the original values.

@@ -44,7 +44,7 @@
 {
 "test_K":[2,3,4,5,6,7,8,9],
 "pfts":[2,3,4,5,6,7,8,9,10,11,12,13,14,15],
-"Ncc":[10,20,10,10,10,20,20,20,10,10,10,10,10,10]
+"Ncc": [20, 40, 20, 20, 20, 40, 40, 40, 20, 20, 20, 20, 20, 20]
 },
 "resp":
 {
Tools/Cluster.py (+50, -22)

@@ -17,17 +17,26 @@
 from Tools import *


-##@param[in] packdata packaged data
-##@param[in] PFT_mask PFT mask where PFT fraction >0.01
-##@param[in] ipft ith PFT to deal with
-##@param[in] var_pred predicting variables
-##@param[in] var_pred_name names of predicting variables
-##@param[in] K K
-##@param[in] Nc number of sites of select
-##@retval cluster_dic # to be complete by Yan
-##@retval distance # to be complete by Yan
-##@retval All_selectedID # to be complete by Yan
 def Cluster_Ana(packdata, PFT_mask, ipft, var_pred_name, K, Nc):
+    """
+    Perform clustering analysis on the data for a specific Plant Functional Type (PFT).
+
+    Args:
+        packdata (xarray.Dataset): Dataset containing input variables.
+        PFT_mask (numpy.ndarray): Mask for Plant Functional Types.
+        ipft (int): Index of the current Plant Functional Type.
+        var_pred_name (list): List of predictor variable names.
+        K (int): Number of clusters.
+        Nc (int): Number of sites to select from each cluster.
+
+    Returns:
+        tuple:
+            - cluster_dic (dict): Dictionary containing cluster information.
+            - distance (float): Sum of squared distances of samples to their closest cluster center.
+            - All_selectedID (numpy.ndarray): Array of selected site IDs.
+    """
+    if "year" in packdata.dims:
+        packdata = packdata.mean("year", keep_attrs=True)
     if "Ndep_nhx_pft" in var_pred_name:
         packdata.Ndep_nhx_pft = packdata.Ndep_nhx[ipft - 1]
     if "Ndep_noy_pft" in var_pred_name:
@@ -56,17 +65,27 @@ def Cluster_Ana(packdata, PFT_mask, ipft, var_pred_name, K, Nc):
             SelectedID = locations[RandomS]
         else:
             SelectedID = locations
+        print(
+            f"Selected {len(SelectedID)} ({len(SelectedID)/len(locations):.2%}) sites in cluster {clus}"
+        )
         cluster_dic["clus_%.2i_loc_select" % clus] = SelectedID
         All_selectedID = np.append(All_selectedID, SelectedID, axis=0)

     return cluster_dic, distance, All_selectedID


-##@param[in] packdata packaged data
-##@param[in] varlist list of variables, including name of source files, variable names, etc.
-##@param[in] logfile logfile
-##@retval dis_all # Eulerian (?) distance corresponding to different number of Ks
 def Cluster_test(packdata, varlist, logfile):
+    """
+    Test clustering with different K values for all specified PFTs.
+
+    Args:
+        packdata (xarray.Dataset): Dataset containing input variables.
+        varlist (dict): Dictionary of variable information.
+        logfile (file): File object for logging.
+
+    Returns:
+        numpy.ndarray: Array of distances for different K values and PFTs.
+    """
     # 1.clustering def
     # Make a mask map according to PFT fractions: nan - <0.00000001; 1 - >=0.00000001
     # I used the output 'VEGET_COV_MAX' by ORCHIDEE-CNP with run the spin-up for 1 year.
@@ -92,22 +111,31 @@ def Cluster_test(packdata, varlist, logfile):
     return dis_all


-##@param[in] packdata packaged data
-##@param[in] varlist list of variables, including name of source files, variable names, etc.
-##@param[in] KK K value chosen to do final clustering
-##@param[in] logfile logfile
-##@retval IDx chosen IDs of pixels for MLacc
-##@retval IDloc # to be complete by Yan (just for plotting)
-##@retval IDsel # to be complete by Yan (just for plotting)
 def Cluster_all(packdata, varlist, KK, logfile):
+    """
+    Perform clustering for all specified PFTs with a chosen K value.
+
+    Args:
+        packdata (xarray.Dataset): Dataset containing input variables.
+        varlist (dict): Dictionary of variable information.
+        KK (int): Chosen K value for clustering.
+        logfile (file): File object for logging.
+
+    Returns:
+        tuple:
+            - IDx (numpy.ndarray): Array of chosen pixel IDs for MLacc.
+            - IDloc (numpy.ndarray): Array of cluster locations (for plotting).
+            - IDsel (numpy.ndarray): Array of selected cluster locations (for plotting).
+    """
     adict = locals()
     kpfts = varlist["clustering"]["pfts"]
     Ncc = varlist["clustering"]["Ncc"]
     PFT_mask, PFT_mask_lai = genMask.PFT(
         packdata, varlist, varlist["PFTmask"]["cluster_thres"]
     )

-    var_pred_name = varlist["pred"]["clustering"]
+    # var_pred_name = varlist["pred"]["clustering"]
+    var_pred_name = [k for k, v in packdata.items() if "veget" not in v.dims]
     for veg in kpfts:
         ClusD, disx, training_ID = Cluster_Ana(
             packdata,
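
To make the selection logic above concrete, a self-contained sketch of the pattern Cluster_Ana implements: K-means over the valid pixels, then up to Nc representative sites drawn from each cluster. This is simplified; the real function also handles PFT masks and xarray inputs:

```python
import numpy as np
from sklearn.cluster import KMeans

def cluster_and_select(features, K, Nc, seed=1000):
    # K-means over the pixel feature matrix of shape (n_pixels, n_features).
    km = KMeans(n_clusters=K, random_state=seed).fit(features)
    rng = np.random.default_rng(seed)
    selected = []
    for clus in range(K):
        locations = np.flatnonzero(km.labels_ == clus)
        if len(locations) > Nc:
            # Randomly subsample Nc representative sites, as in Cluster_Ana.
            locations = rng.choice(locations, size=Nc, replace=False)
        selected.append(locations)
    # km.inertia_ corresponds to the "distance" return value in the docstring.
    return km, km.inertia_, np.concatenate(selected)
```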