diff --git a/CHANGES.md b/CHANGES.md
new file mode 100644
index 0000000..97f6945
--- /dev/null
+++ b/CHANGES.md
@@ -0,0 +1,50 @@
+# Summary of Changes
+
+- Converted dataset format to xarray.Dataset - merged auxil into packdata, etc.
+- Reformat documentation and added more comments
+- Rewrote tests as pytest files
+- Reformated the config file as a python file
+- Used ruff for reformatting and linting
+- Added pre-commit checks and a Github workflow for CI checks
+- Added input features (std of some variables like Qair, Psurf, etc.)
+- Increase Nc values (by a factor of 2)
+- Instead of averaging over the whole time span, only take monthly averages and separate data per year to increase dataset size
+- Simplified readvar.py
+- Added new ML algorithm options: XGBoost, RandomForest, MLP, Lasso, Stacking Ensemble
+- Combined all ML evaluation results into a single CSV table
+- Implemented multithread parallelization to train a ML model per target variable in parallelization
+- Added standard scaling to preprocess the data before ML training
+- Updated README.md and CONTRIBUTING.md
+- Added explanation of the varlist.json file
+
+## TODO
+- Investigate (on bad-performance variables): feature importance, target variable correlation, sample size
+
+## Performance Benchmark
+
+### Separated Years
+| Algorithm | R2                 | slope              |
+|-----------|--------------------|--------------------|
+| bt        | 0.6528848053359372 | 0.9542832780095561 |
+| rf        | **0.6530125681385772** | 0.9540960891785613 |
+| gbm       | 0.6512312928704228 | 0.950301142771462  |
+| lasso     | 0.3712954690899892 | **0.9877068027413212** |
+| stack     | 0.6525452816395577 | 0.9525996582374604 |
+
+### Separated Years without BAT
+| Algorithm | R2                 | slope              |
+|-----------|--------------------|--------------------|
+|bt|**0.9324186841654837**|0.9485227574788934|
+|rf|0.932251880129098|0.9485025443202926|
+|gbm|0.9302603471413085|0.9438685125142494|
+|lasso|0.8361626491858671|**0.9557082517475748**|
+|stack|0.9314316345860048|0.9493522690938773|
+
+### Averaged Years
+| Algorithm | R2                 | slope              |
+|-----------|--------------------|--------------------|
+| bt        | 0.31926695871812644| 0.9591009540297856 |
+| rf        | 0.321636483649443  | 0.9590895662312189 |
+| gbm       | **0.328685618778433**  | 0.9583401949919101 |
+| lasso     | 0.09302916905772492| **0.9930868709808638** |
+| stack     | 0.3225905817064779 | 0.9753542956777028 |
diff --git a/DEF_Trunk/config.py b/DEF_Trunk/config.py
index 53a791d..df22a38 100644
--- a/DEF_Trunk/config.py
+++ b/DEF_Trunk/config.py
@@ -2,17 +2,28 @@
 
 logfile = "log.MLacc_Trunk"
 tasks = [
-    2,
+    # 2,
     4,
+    5,
 ]  # 1=test clustering, 2=clustering, 3=compress forcing, 4=ML, 5=evaluation
 results_dir = "./EXE_DIR/"
 reference_dir = "/home/surface10/mrasolon/files_for_zenodo/reference/EXE_DIR/"
-start_from_scratch = True
+start_from_scratch = False
+take_year_average = False
+smote_bat = False
 kmeans_clusters = 4
 max_kmeans_clusters = 9
 random_seed = 1000
+algorithms = [
+    # "bt",
+    # "rf",
+    # "gbm",
+    # "nn",
+    # "ridge",
+    "best",
+]  # bt: BaggingTrees, rf: RandomForest, nn: MLPRegressor, gbm: XGBRegressor, lasso: Lasso, best: SelectBestModel
 leave_one_out_cv = False
-repro_test_task_1 = True
-repro_test_task_2 = True
-repro_test_task_3 = True
-repro_test_task_4 = True
+repro_test_task_1 = False
+repro_test_task_2 = False
+repro_test_task_3 = False
+repro_test_task_4 = False
diff --git a/DEF_Trunk/varlist.json b/DEF_Trunk/varlist.json
index 6bbf74e..a11d6b7 100644
--- a/DEF_Trunk/varlist.json
+++ b/DEF_Trunk/varlist.json
@@ -44,7 +44,7 @@
   {
     "test_K":[2,3,4,5,6,7,8,9],
     "pfts":[2,3,4,5,6,7,8,9,10,11,12,13,14,15],
-    "Ncc":[10,20,10,10,10,20,20,20,10,10,10,10,10,10]
+    "Ncc": [20, 40, 20, 20, 20, 40, 40, 40, 20, 20, 20, 20, 20, 20]
   },
   "resp":
   {
diff --git a/Tools/Cluster.py b/Tools/Cluster.py
index 72b00f9..11e4b4e 100644
--- a/Tools/Cluster.py
+++ b/Tools/Cluster.py
@@ -17,17 +17,26 @@
 from Tools import *
 
 
-##@param[in]   packdata               packaged data
-##@param[in]   PFT_mask               PFT mask where PFT fraction >0.01
-##@param[in]   ipft                   ith PFT to deal with
-##@param[in]   var_pred               predicting variables
-##@param[in]   var_pred_name          names of predicting variables
-##@param[in]   K                      K
-##@param[in]   Nc                     number of sites of select
-##@retval      cluster_dic            # to be complete by Yan
-##@retval      distance               # to be complete by Yan
-##@retval      All_selectedID         # to be complete by Yan
 def Cluster_Ana(packdata, PFT_mask, ipft, var_pred_name, K, Nc):
+    """
+    Perform clustering analysis on the data for a specific Plant Functional Type (PFT).
+
+    Args:
+        packdata (xarray.Dataset): Dataset containing input variables.
+        PFT_mask (numpy.ndarray): Mask for Plant Functional Types.
+        ipft (int): Index of the current Plant Functional Type.
+        var_pred_name (list): List of predictor variable names.
+        K (int): Number of clusters.
+        Nc (int): Number of sites to select from each cluster.
+
+    Returns:
+        tuple:
+            - cluster_dic (dict): Dictionary containing cluster information.
+            - distance (float): Sum of squared distances of samples to their closest cluster center.
+            - All_selectedID (numpy.ndarray): Array of selected site IDs.
+    """
+    if "year" in packdata.dims:
+        packdata = packdata.mean("year", keep_attrs=True)
     if "Ndep_nhx_pft" in var_pred_name:
         packdata.Ndep_nhx_pft = packdata.Ndep_nhx[ipft - 1]
     if "Ndep_noy_pft" in var_pred_name:
@@ -56,17 +65,27 @@ def Cluster_Ana(packdata, PFT_mask, ipft, var_pred_name, K, Nc):
             SelectedID = locations[RandomS]
         else:
             SelectedID = locations
+        print(
+            f"Selected {len(SelectedID)} ({len(SelectedID)/len(locations):.2%}) sites in cluster {clus}"
+        )
         cluster_dic["clus_%.2i_loc_select" % clus] = SelectedID
         All_selectedID = np.append(All_selectedID, SelectedID, axis=0)
 
     return cluster_dic, distance, All_selectedID
 
 
-##@param[in]   packdata               packaged data
-##@param[in]   varlist                list of variables, including name of source files, variable names, etc.
-##@param[in]   logfile                logfile
-##@retval      dis_all                # Eulerian (?) distance corresponding to different number of Ks
 def Cluster_test(packdata, varlist, logfile):
+    """
+    Test clustering with different K values for all specified PFTs.
+
+    Args:
+        packdata (xarray.Dataset): Dataset containing input variables.
+        varlist (dict): Dictionary of variable information.
+        logfile (file): File object for logging.
+
+    Returns:
+        numpy.ndarray: Array of distances for different K values and PFTs.
+    """
     # 1.clustering def
     # Make a mask map according to PFT fractions: nan - <0.00000001; 1 - >=0.00000001
     # I used the output 'VEGET_COV_MAX' by ORCHIDEE-CNP with run the spin-up for 1 year.
@@ -92,14 +111,22 @@ def Cluster_test(packdata, varlist, logfile):
     return dis_all
 
 
-##@param[in]   packdata               packaged data
-##@param[in]   varlist                list of variables, including name of source files, variable names, etc.
-##@param[in]   KK                     K value chosen to do final clustering
-##@param[in]   logfile                logfile
-##@retval      IDx                    chosen IDs of pixels for MLacc
-##@retval      IDloc                  # to be complete by Yan (just for plotting)
-##@retval      IDsel                  # to be complete by Yan (just for plotting)
 def Cluster_all(packdata, varlist, KK, logfile):
+    """
+    Perform clustering for all specified PFTs with a chosen K value.
+
+    Args:
+        packdata (xarray.Dataset): Dataset containing input variables.
+        varlist (dict): Dictionary of variable information.
+        KK (int): Chosen K value for clustering.
+        logfile (file): File object for logging.
+
+    Returns:
+        tuple:
+            - IDx (numpy.ndarray): Array of chosen pixel IDs for MLacc.
+            - IDloc (numpy.ndarray): Array of cluster locations (for plotting).
+            - IDsel (numpy.ndarray): Array of selected cluster locations (for plotting).
+    """
     adict = locals()
     kpfts = varlist["clustering"]["pfts"]
     Ncc = varlist["clustering"]["Ncc"]
@@ -107,7 +134,8 @@ def Cluster_all(packdata, varlist, KK, logfile):
         packdata, varlist, varlist["PFTmask"]["cluster_thres"]
     )
 
-    var_pred_name = varlist["pred"]["clustering"]
+    # var_pred_name = varlist["pred"]["clustering"]
+    var_pred_name = [k for k, v in packdata.items() if "veget" not in v.dims]
     for veg in kpfts:
         ClusD, disx, training_ID = Cluster_Ana(
             packdata,
diff --git a/Tools/ML.py b/Tools/ML.py
index 245dfac..50faa4d 100644
--- a/Tools/ML.py
+++ b/Tools/ML.py
@@ -15,169 +15,11 @@
 from Tools import *
 
 
-def MLmap(
-    packdata,
-    ivar,
-    PFT_mask,
-    PFT_mask_lai,
-    var_pred_name,
-    ipool,
-    ipft,
-    logfile,
-    varname,
-    varlist,
-    labx,
-    ii,
-    resultpath,
-    loocv,
-    restvar,
-    missVal,
-):
-    check.display("processing %s, variable %s..." % (ipool, varname), logfile)
-
-    # extract data
-    extr_var = extract_X.var(packdata, ipft)
-    # extract PFT map
-    pft_ny = extract_X.pft(packdata, PFT_mask_lai, ipft).reshape(len(packdata.Nlat), 1)
-
-    # extract Y
-    pool_arr = np.full(len(packdata.Nlat), np.nan)
-    pool_map = np.squeeze(
-        ivar
-    )  # [tuple(ind-1)] # all indices start from 1, but python loop starts from 0
-    pool_map[pool_map == 1e20] = np.nan
-    if "format" in varlist["resp"] and varlist["resp"]["format"] == "compressed":
-        for cc in range(len(packdata.Nlat)):
-            pool_arr[cc] = pool_map.flatten()[cc]
-    else:
-        for cc in range(len(packdata.Nlat)):
-            pool_arr[cc] = pool_map[packdata.Nlat[cc], packdata.Nlon[cc]]
-    # end extract Y
-    extracted_Y = np.reshape(pool_arr, (len(packdata.Nlat), 1))
-    extr_all = np.concatenate((extracted_Y, extr_var, pft_ny), axis=1)
-    df_data = DataFrame(extr_all, columns=[labx])  # convert the array into dataframe
-    # df_data.ix[:,22]=(df_data.ix[:,22].astype(int)).astype(str)
-    combine_XY = df_data.dropna()  # delete pft=nan
-    combine_XY = combine_XY.drop(["pft"], axis=1)
-    if len(combine_XY) == 0:
-        check.display(
-            "%s, variable %s : NO DATA in training set!" % (ipool, varname), logfile
-        )
-        return None
-    # need Yan Sun to modify it
-    if "allname_type" in varlist["pred"].keys():
-        col_type = labx.index(varlist["pred"]["allname_type"])
-        type_val = varlist["pred"]["type_code"]
-        combineXY = encode.en_code(combine_XY, col_type, type_val)
-    else:
-        col_type = "None"
-        type_val = "None"
-        combineXY = combine_XY
-    # combine_XY=pd.get_dummies(combine_XY) # one-hot encoded
-    (
-        Tree_Ens,
-        predY_train,
-        loocv_R2,
-        loocv_reMSE,
-        loocv_slope,
-        loocv_dNRMSE,
-        loocv_sNRMSE,
-        loocv_iNRMSE,
-        loocv_f_SB,
-        loocv_f_SDSD,
-        loocv_f_LSC,
-    ) = train.training_BAT(combineXY, logfile, loocv)
-
-    if not Tree_Ens:
-        # only one value
-        predY = np.where(pool_map == pool_map, predY_train[0], np.nan)
-        Global_Predicted_Y_map = predY
-    else:
-        Global_Predicted_Y_map, predY = mapGlobe.extrp_global(
-            packdata,
-            ipft,
-            PFT_mask,
-            var_pred_name,
-            Tree_Ens,
-            col_type,
-            type_val,
-            var_pred_name,
-        )
-        # write to restart file
-        pmask = np.nansum(PFT_mask, axis=0)
-        pmask[np.isnan(pmask)] == 0
-        # set all ocean pixels to missVal
-        Pred_Y_out = np.where(pmask == 0, missVal, Global_Predicted_Y_map[:])
-        # some pixel with nan remain, so set them zero
-        Pred_Y_out = np.nan_to_num(Pred_Y_out)
-
-        restvar[:] = Pred_Y_out[:]
-
-    if "format" in varlist["resp"] and varlist["resp"]["format"] == "compressed":
-        return None
-
-    if (PFT_mask[ipft - 1] > 0).any():
-        return MLeval.evaluation_map(Global_Predicted_Y_map, pool_map, ipft, PFT_mask)
-        # evaluation
-        R2, RMSE, slope, reMSE, dNRMSE, sNRMSE, iNRMSE, f_SB, f_SDSD, f_LSC = (
-            MLeval.evaluation_map(Global_Predicted_Y_map, pool_map, ipft, PFT_mask)
-        )
-        check.display(
-            "%s, variable %s : R2=%.3f , RMSE=%.2f, slope=%.2f, reMSE=%.2f"
-            % (ipool, varname, R2, RMSE, slope, reMSE),
-            logfile,
-        )
-        # save R2, RMSE, slope to txt files
-        # fx.write('%.2f' % R2+',')
-        # plot the results
-        fig = plt.figure(figsize=[12, 12])
-        # training dat
-        ax1 = plt.subplot(221)
-        ax1.scatter(combineXY.iloc[:, 0].values, predY_train)
-        # global dta
-        ax2 = plt.subplot(222)
-        #    predY=Global_Predicted_Y_map.flatten()
-        #    simuY=pool_map.flatten()
-        ax2.scatter(
-            pool_map[PFT_mask[ipft - 1] > 0],
-            Global_Predicted_Y_map[PFT_mask[ipft - 1] > 0],
-        )
-        xx = np.linspace(0, np.nanmax(pool_map), 10)
-        yy = np.linspace(0, np.nanmax(pool_map), 10)
-        ax2.text(
-            0.1 * np.nanmax(pool_map),
-            0.7 * np.nanmax(Global_Predicted_Y_map),
-            "R2=%.2f" % R2,
-        )
-        ax2.text(
-            0.1 * np.nanmax(pool_map),
-            0.8 * np.nanmax(Global_Predicted_Y_map),
-            "RMSE=%i" % RMSE,
-        )
-        ax2.plot(xx, yy, "k--")
-        ax2.set_xlabel("ORCHIDEE simulated")
-        ax2.set_ylabel("Machine-learning predicted")
-        ax3 = plt.subplot(223)
-        im = ax3.imshow(pool_map, vmin=0, vmax=0.8 * np.nanmax(pool_map))
-        ax3.set_title("ORCHIDEE simulated")
-        plt.colorbar(im, orientation="horizontal")
-        ax4 = plt.subplot(224)
-        im = ax4.imshow(Global_Predicted_Y_map, vmin=0, vmax=0.8 * np.nanmax(pool_map))
-        ax4.set_title("Machine-learning predicted")
-        plt.colorbar(im, orientation="horizontal")
-
-        fig.savefig(resultpath + "Eval_%s" % varname + ".png")
-        plt.close("all")
-    else:
-        check.display("%s, variable %s : NO DATA!" % (ipool, varname), logfile)
-
-
 def MLmap_multidim(
     packdata,
     ivar,
     PFT_mask,
     PFT_mask_lai,
-    var_pred_name,
     ipool,
     ipft,
     logfile,
@@ -186,11 +28,37 @@ def MLmap_multidim(
     labx,
     ind,
     ii,
-    resultpath,
-    loocv,
+    config,
     restvar,
     missVal,
+    alg,
 ):
+    """
+    Perform multi-dimensional machine learning mapping.
+
+    Args:
+        packdata (xarray.Dataset): Dataset containing input variables.
+        ivar (numpy.ndarray): Array of response variable.
+        PFT_mask (numpy.ndarray): Mask for Plant Functional Types.
+        PFT_mask_lai (numpy.ndarray): Mask for Plant Functional Types based on LAI.
+        ipool (str): Name of the current pool.
+        ipft (int): Index of current Plant Functional Type.
+        logfile (file): File object for logging.
+        varname (str): Name of the current variable.
+        varlist (dict): Dictionary of variable information.
+        labx (list): List of column labels.
+        ind (tuple): Index tuple for multi-dimensional variables.
+        ii (dict): Dictionary containing dimension information.
+        config (module): module of config.
+        restvar (numpy.ndarray): Restart variable.
+        missVal (float): Missing value to use.
+        alg (str): ML algorithm to use.
+
+    Returns:
+        tuple:
+            - Global_Predicted_Y_map (numpy.ndarray): Globally predicted Y map.
+            - model: Trained machine learning model.
+    """
     check.display(
         "processing %s, variable %s, index %s (dim: %s)..."
         % (ipool, varname, ind, ii["dim_loop"]),
@@ -199,28 +67,29 @@ def MLmap_multidim(
 
     # extract data
     extr_var = extract_X.var(packdata, ipft)
+
     # extract PFT map
-    pft_ny = extract_X.pft(packdata, PFT_mask_lai, ipft).reshape(len(packdata.Nlat), 1)
+    pft_ny = extract_X.pft(packdata, PFT_mask_lai, ipft)
+    pft_ny = np.resize(pft_ny, (*extr_var.shape[:-1], 1))
 
     # extract Y
-    pool_arr = np.full(len(packdata.Nlat), np.nan)
     pool_map = np.squeeze(ivar)[
         tuple(i - 1 for i in ind)
     ]  # all indices start from 1, but python loop starts from 0
-    pool_map[pool_map == 1e20] = np.nan
+    pool_map[pool_map >= 1e20] = np.nan
+    # Y_map[ind[0]] = pool_map
+
     if "format" in varlist["resp"] and varlist["resp"]["format"] == "compressed":
-        for cc in range(len(packdata.Nlat)):
-            pool_arr[cc] = pool_map.flatten()[cc]
+        pool_arr = pool_map.flatten()
     else:
-        for cc in range(len(packdata.Nlat)):
-            pool_arr[cc] = pool_map[packdata.Nlat[cc], packdata.Nlon[cc]]
-    # end extract Y
-    extracted_Y = np.reshape(pool_arr, (len(packdata.Nlat), 1))
-    extr_all = np.concatenate((extracted_Y, extr_var, pft_ny), axis=1)
-    df_data = DataFrame(extr_all, columns=[labx])  # convert the array into dataframe
-    # df_data.ix[:,22]=(df_data.ix[:,22].astype(int)).astype(str)
-    combine_XY = df_data.dropna()  # delete pft=nan
-    combine_XY = combine_XY.drop(["pft"], axis=1)
+        pool_arr = pool_map[packdata.Nlat, packdata.Nlon]
+    extracted_Y = np.resize(pool_arr, (*extr_var.shape[:-1], 1))
+
+    extr_all = np.concatenate((extracted_Y, extr_var, pft_ny), axis=-1)
+    extr_all = extr_all.reshape(-1, extr_all.shape[-1])
+
+    df_data = DataFrame(extr_all, columns=labx)  # convert the array into dataframe
+    combine_XY = df_data.dropna().drop(["pft"], axis=1)
     if len(combine_XY) == 0:
         check.display(
             "%s, variable %s, index %s (dim: %s) : NO DATA in training set!"
@@ -241,7 +110,7 @@ def MLmap_multidim(
         combineXY = combine_XY
     # combine_XY=pd.get_dummies(combine_XY) # one-hot encoded
     (
-        Tree_Ens,
+        model,
         predY_train,
         loocv_R2,
         loocv_reMSE,
@@ -252,22 +121,21 @@ def MLmap_multidim(
         loocv_f_SB,
         loocv_f_SDSD,
         loocv_f_LSC,
-    ) = train.training_BAT(combineXY, logfile, loocv)
+    ) = train.training_BAT(combineXY, logfile, config, alg)
 
-    if not Tree_Ens:
+    if not model:
         # only one value
-        predY = np.where(pool_map == pool_map, predY_train[0], np.nan)
+        predY = np.where(pool_map == pool_map, predY_train.iloc[0], np.nan)
         Global_Predicted_Y_map = predY
     else:
         Global_Predicted_Y_map, predY = mapGlobe.extrp_global(
             packdata,
             ipft,
             PFT_mask,
-            var_pred_name,
-            Tree_Ens,
+            combine_XY.columns.drop("Y"),
+            model,
             col_type,
             type_val,
-            var_pred_name,
         )
         # write to restart file
         pmask = np.nansum(PFT_mask, axis=0)
@@ -285,64 +153,97 @@ def MLmap_multidim(
         return None
 
     if (PFT_mask[ipft - 1] > 0).any():
-        return MLeval.evaluation_map(Global_Predicted_Y_map, pool_map, ipft, PFT_mask)
-        # evaluation
-        R2, RMSE, slope, reMSE, dNRMSE, sNRMSE, iNRMSE, f_SB, f_SDSD, f_LSC = (
-            MLeval.evaluation_map(Global_Predicted_Y_map, pool_map, ipft, PFT_mask)
-        )
-        check.display(
-            "%s, variable %s, index %s (dim: %s) : R2=%.3f , RMSE=%.2f, slope=%.2f, reMSE=%.2f"
-            % (ipool, varname, ind, ii["dim_loop"], R2, RMSE, slope, reMSE),
-            logfile,
-        )
-        # save R2, RMSE, slope to txt files
-        # fx.write('%.2f' % R2+',')
-        # plot the results
-        fig = plt.figure(figsize=[12, 12])
-        # training dat
-        ax1 = plt.subplot(221)
-        ax1.scatter(combineXY.iloc[:, 0].values, predY_train)
-        # global dta
-        ax2 = plt.subplot(222)
-        #    predY=Global_Predicted_Y_map.flatten()
-        #    simuY=pool_map.flatten()
-        ax2.scatter(
-            pool_map[PFT_mask[ipft - 1] > 0],
-            Global_Predicted_Y_map[PFT_mask[ipft - 1] > 0],
-        )
-        xx = np.linspace(0, np.nanmax(pool_map), 10)
-        yy = np.linspace(0, np.nanmax(pool_map), 10)
-        ax2.text(
-            0.1 * np.nanmax(pool_map),
-            0.7 * np.nanmax(Global_Predicted_Y_map),
-            "R2=%.2f" % R2,
-        )
-        ax2.text(
-            0.1 * np.nanmax(pool_map),
-            0.8 * np.nanmax(Global_Predicted_Y_map),
-            "RMSE=%i" % RMSE,
-        )
-        ax2.plot(xx, yy, "k--")
-        ax2.set_xlabel("ORCHIDEE simulated")
-        ax2.set_ylabel("Machine-learning predicted")
-        ax3 = plt.subplot(223)
-        im = ax3.imshow(pool_map, vmin=0, vmax=0.8 * np.nanmax(pool_map))
-        ax3.set_title("ORCHIDEE simulated")
-        plt.colorbar(im, orientation="horizontal")
-        ax4 = plt.subplot(224)
-        im = ax4.imshow(Global_Predicted_Y_map, vmin=0, vmax=0.8 * np.nanmax(pool_map))
-        ax4.set_title("Machine-learning predicted")
-        plt.colorbar(im, orientation="horizontal")
+        res = MLeval.evaluation_map(Global_Predicted_Y_map, pool_map, ipft, PFT_mask)
+        if varname.startswith("biomass"):
+            ipft = ind[0]
+            ivar = int(varname.split("_")[1])
+        else:
+            ipft = int(varname.split("_")[1])
+            ivar = ind[0]
+            if varname.startswith("litter"):
+                j = ["ab", "be"].index(varname.split("_")[2])
+                ivar = ivar * 2 + j - 1
+        if type(model).__name__ == "Pipeline":
+            alg = type(model.named_steps["estimator"]).__name__
+        else:
+            alg = type(model).__name__
+        res["varname"] = varname
+        res["ipft"] = ipft
+        res["pft"] = [
+            "TrENF",
+            "TrEBF",
+            "TrDBF",
+            "TeENF",
+            "TeEBF",
+            "TeDBF",
+            "BoENF",
+            "BoDBF",
+            "BoDNF",
+            "C3G",
+            "C4G",
+            "C3C",
+            "C4C",
+            "_",
+            "_",
+        ][ipft - 1]
+        res["ivar"] = ivar
+        res["var"] = varlist["resp"][f"pool_name_{ipool}"][ivar - 1]
+        res["dim"] = ii["dim_loop"][0]
+        res["alg"] = alg
+        return res
+        # check.display(
+        #     "%s, variable %s, index %s (dim: %s) : R2=%.3f , RMSE=%.2f, slope=%.2f, reMSE=%.2f"
+        #     % (ipool, varname, ind, ii["dim_loop"], R2, RMSE, slope, reMSE),
+        #     logfile,
+        # )
+        # # save R2, RMSE, slope to txt files
+        # # fx.write('%.2f' % R2+',')
+        # # plot the results
+        # fig = plt.figure(figsize=[12, 12])
+        # # training dat
+        # ax1 = plt.subplot(221)
+        # ax1.scatter(combineXY.iloc[:, 0].values, predY_train)
+        # # global dta
+        # ax2 = plt.subplot(222)
+        # #    predY=Global_Predicted_Y_map.flatten()
+        # #    simuY=pool_map.flatten()
+        # ax2.scatter(
+        #     pool_map[PFT_mask[ipft - 1] > 0],
+        #     Global_Predicted_Y_map[PFT_mask[ipft - 1] > 0],
+        # )
+        # xx = np.linspace(0, np.nanmax(pool_map), 10)
+        # yy = np.linspace(0, np.nanmax(pool_map), 10)
+        # ax2.text(
+        #     0.1 * np.nanmax(pool_map),
+        #     0.7 * np.nanmax(Global_Predicted_Y_map),
+        #     "R2=%.2f" % R2,
+        # )
+        # ax2.text(
+        #     0.1 * np.nanmax(pool_map),
+        #     0.8 * np.nanmax(Global_Predicted_Y_map),
+        #     "RMSE=%i" % RMSE,
+        # )
+        # ax2.plot(xx, yy, "k--")
+        # ax2.set_xlabel("ORCHIDEE simulated")
+        # ax2.set_ylabel("Machine-learning predicted")
+        # ax3 = plt.subplot(223)
+        # im = ax3.imshow(pool_map, vmin=0, vmax=0.8 * np.nanmax(pool_map))
+        # ax3.set_title("ORCHIDEE simulated")
+        # plt.colorbar(im, orientation="horizontal")
+        # ax4 = plt.subplot(224)
+        # im = ax4.imshow(Global_Predicted_Y_map, vmin=0, vmax=0.8 * np.nanmax(pool_map))
+        # ax4.set_title("Machine-learning predicted")
+        # plt.colorbar(im, orientation="horizontal")
 
-        fig.savefig(
-            resultpath
-            + "Eval_%s" % varname
-            + "".join(
-                ["_" + ii["dim_loop"][ll] + "%2.2i" % ind[ll] for ll in range(len(ind))]
-                + [".png"]
-            )
-        )
-        plt.close("all")
+        # fig.savefig(
+        #     resultpath
+        #     + "Eval_%s" % varname
+        #     + "".join(
+        #         ["_" + ii["dim_loop"][ll] + "%2.2i" % ind[ll] for ll in range(len(ind))]
+        #         + [".png"]
+        #     )
+        # )
+        # plt.close("all")
     else:
         check.display(
             "%s, variable %s, index %s (dim: %s) : NO DATA!"
@@ -350,28 +251,35 @@ def MLmap_multidim(
             logfile,
         )
     if ind[-1] == ii["loops"][ii["dim_loop"][-1]][-1]:
-        print(varname, ind)
+        raise Exception
 
 
-##@param[in]   packdata               packaged data
-##@param[in]   ipool                  'som','biomass' or 'litter'
-##@param[in]   logfile                logfile
-##@
-##@param[in]
 def MLloop(
     packdata,
     ipool,
     logfile,
     varlist,
     labx,
-    resultpath,
-    loocv,
+    config,
     restfile,
+    alg,
 ):
-    var_pred_name1 = varlist["pred"]["allname"]
-    var_pred_name2 = varlist["pred"]["allname_pft"]
-    var_pred_name = var_pred_name1 + var_pred_name2
+    """
+    Main loop for machine learning processing.
+
+    Args:
+        packdata (xarray.Dataset): Dataset containing input variables.
+        ipool (str): Name of the current pool.
+        logfile (file): File object for logging.
+        varlist (dict): Dictionary of variable information.
+        labx (list): List of column labels.
+        config (module): module of config.
+        restfile (str): Path to restart file.
+        alg (str): ML algorithm to use.
 
+    Returns:
+        pandas.DataFrame: Results of machine learning evaluations.
+    """
     responseY = Dataset(varlist["resp"]["sourcefile"], "r")
     PFT_mask, PFT_mask_lai = genMask.PFT(
         packdata, varlist, varlist["PFTmask"]["pred_thres"]
@@ -393,37 +301,14 @@ def MLloop(
                 varname = jj + ("_%2.2i" % kk if kk else "") + ii["name_postfix"]
                 if ii["name_loop"] == "pft":
                     ipft = kk
-                #        print(responseY.variables.keys())
                 ivar = responseY[varname]
 
                 # open restart file and select variable (memory is exceeded if open outside this loop)
                 restnc = Dataset(restfile, "a")
                 restvar = restnc[varname]
 
-                if ii["dim_loop"] == ["null"]:
-                    if ipft in ii["skip_loop"]["pft"]:
-                        continue
-                    res = MLmap(
-                        packdata,
-                        ivar,
-                        PFT_mask,
-                        PFT_mask_lai,
-                        var_pred_name,
-                        ipool,
-                        ipft,
-                        logfile,
-                        varname,
-                        varlist,
-                        labx,
-                        ii,
-                        resultpath,
-                        loocv,
-                        restvar,
-                        missVal,
-                    )
-                    if res:
-                        res["var"] = varname
-                        result.append(res)
+                if ii["dim_loop"] == ["null"] and ipft in ii["skip_loop"]["pft"]:
+                    continue
                 else:
                     index = itertools.product(
                         *[ii["loops"][ll] for ll in ii["dim_loop"]]
@@ -434,32 +319,37 @@ def MLloop(
                             ipft = ind[ii["dim_loop"].index("pft")]
                         if ipft in ii["skip_loop"]["pft"]:
                             continue
-                        res = MLmap_multidim(
-                            packdata,
-                            ivar,
-                            PFT_mask,
-                            PFT_mask_lai,
-                            var_pred_name,
-                            ipool,
-                            ipft,
-                            logfile,
-                            varname,
-                            varlist,
-                            labx,
-                            ind,
-                            ii,
-                            resultpath,
-                            loocv,
-                            restvar,
-                            missVal,
+                        result.append(
+                            (
+                                packdata,
+                                ivar[:],
+                                PFT_mask,
+                                PFT_mask_lai,
+                                ipool,
+                                ipft,
+                                None,  # logfile
+                                varname,
+                                varlist,
+                                labx,
+                                ind,
+                                ii,
+                                config,
+                                restvar[:],
+                                missVal,
+                                alg,
+                            )
                         )
-                        if res:
-                            res["var"] = varname
-                            for i, (k, v) in enumerate(dim_ind):
-                                res[f"dim_{i+1}"] = k
-                                res[f"ind_{i+1}"] = v
-                            result.append(res)
+                #         break
+                #     if result:
+                #         break
+                # if result:
+                #     break
+
                 # close&save netCDF file
                 restnc.close()
 
-    return pd.DataFrame(result).set_index("var")
+    # result = result[:3]
+    with ThreadPoolExecutor() as pool:
+        result = list(filter(None, pool.map(MLmap_multidim, *zip(*result))))
+
+    return pd.DataFrame(result).set_index(["ivar", "ipft"]).sort_index()
diff --git a/Tools/MLeval.py b/Tools/MLeval.py
index f7d2de9..9ee0df3 100644
--- a/Tools/MLeval.py
+++ b/Tools/MLeval.py
@@ -17,17 +17,29 @@
 from Tools import *
 
 
-##@param[in]   ipft                   index of PFT
-##@param[in]   PFTmask                PFT mask
-##@param[in]   XVarName               input variables
-##@param[in]   Tree_Ens               tree ensemble
-##@param[in]   colum
-##@param[in]   Nm
-##@param[in]   labx
-##@retval      R2                     R2 between predicted Y and target Y
-##@retval      RMSE                   RMSE of prediceted Y
-##@retval      slope                  regression slope of target Y and predicted Y
 def evaluation_map(Global_Predicted_Y_map, Y_map, ipft, PFTmask):
+    """
+    Evaluate the performance of the machine learning predictions.
+
+    Args:
+        Global_Predicted_Y_map (numpy.ndarray): Predicted values.
+        Y_map (numpy.ndarray): Actual values.
+        ipft (int): Index of current Plant Functional Type.
+        PFT_mask (numpy.ndarray): Mask for Plant Functional Types.
+
+    Returns:
+        dict: Dictionary containing various evaluation metrics:
+            - R2: R-squared value
+            - RMSE: Root Mean Squared Error
+            - slope: Regression slope
+            - reMSE: Relative Mean Squared Error
+            - dNRMSE: Normalized RMSE (by range)
+            - sNRMSE: Normalized RMSE (by standard deviation)
+            - iNRMSE: Normalized RMSE (by interquartile range)
+            - f_SB: Fraction of MSE due to bias
+            - f_SDSD: Fraction of MSE due to difference in standard deviations
+            - f_LSC: Fraction of MSE due to lack of correlation
+    """
     pmask = np.squeeze(PFTmask[ipft - 1][:])
     all_predY = np.reshape(Global_Predicted_Y_map, (-1, 1))
     all_Y = np.reshape(Y_map * pmask, (-1, 1))
diff --git a/Tools/__init__.py b/Tools/__init__.py
index f251a46..021aafc 100644
--- a/Tools/__init__.py
+++ b/Tools/__init__.py
@@ -22,6 +22,7 @@
 import matplotlib
 import numpy as np
 import pandas as pd
+from copy import deepcopy
 from netCDF4 import Dataset
 
 matplotlib.use("Agg")
@@ -30,18 +31,26 @@
 import random
 import sys
 from collections import Counter
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 
 import matplotlib.colors as mcolors
 import matplotlib.pyplot as plt
 from imblearn.over_sampling import SMOTE
 from mpl_toolkits.basemap import Basemap
 from pandas import DataFrame, Series
+from pathlib import Path
 from scipy import stats
 from sklearn.cluster import Birch, KMeans
 from sklearn.datasets import make_regression
-from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
+from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, StackingRegressor
+from sklearn.linear_model import Lasso, RidgeCV
 from sklearn.metrics import mean_squared_error, r2_score
-from sklearn.model_selection import LeaveOneOut
+from sklearn.model_selection import (
+    LeaveOneOut,
+    train_test_split,
+)
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
 from sklearn.tree import DecisionTreeRegressor
 from sklearn.neural_network import MLPRegressor
 from xgboost import XGBRegressor
diff --git a/Tools/check.py b/Tools/check.py
index 121abc9..4010992 100644
--- a/Tools/check.py
+++ b/Tools/check.py
@@ -21,13 +21,15 @@
 ##@param[in] sss string to be written
 ##@param[in] logfile verbose file
 def verbose(sss, logfile):
-    logfile.write(sss + "\n")
+    if logfile:
+        logfile.write(sss + "\n")
 
 
 ##@param[in] sss string to be written
 ##@param[in] logfile verbose file
 def display(sss, logfile):
-    logfile.write(sss + "\n")
+    if logfile:
+        logfile.write(sss + "\n")
     print(sss)
 
 
diff --git a/Tools/eval_plot_loocv_un.py b/Tools/eval_plot_loocv_un.py
index 2b6388c..4f39b5b 100644
--- a/Tools/eval_plot_loocv_un.py
+++ b/Tools/eval_plot_loocv_un.py
@@ -32,11 +32,11 @@ def plot_metric(data_path, npfts, ipool, subLabel, dims, sect_n, xTickLabel):
     # print(subps)
     loop_n = len(subLabel)
     shape = np.array([npfts, subps])
-    R22 = np.genfromtxt(data_path + ipool + "_loocv_R2.txt", delimiter=",")[:, :-1]
-    slope = np.genfromtxt(data_path + ipool + "_loocv_slope.txt", delimiter=",")[:, :-1]
-    dNRMSE = np.genfromtxt(data_path + ipool + "_loocv_dNRMSE.txt", delimiter=",")[
-        :, :-1
-    ]
+    df = pd.read_csv(data_path + "MLacc_results.csv", index_col=[0, 1, 2])
+    df = df.loc[ipool].round(2)
+    R22 = df["R2"].unstack().values
+    slope = df["slope"].unstack().values
+    dNRMSE = df["dNRMSE"].unstack().values
 
     # print(R22)
     yTickLabel = [
diff --git a/Tools/eval_plot_un.py b/Tools/eval_plot_un.py
index fd42912..d84c610 100644
--- a/Tools/eval_plot_un.py
+++ b/Tools/eval_plot_un.py
@@ -28,18 +28,20 @@
 # data_path='/home/orchidee04/ysun/MLacc_Python_Tool/'
 def plot_metric(data_path, npfts, ipool, subLabel, dims, sect_n, xTickLabel):
     subps = len(xTickLabel)
-    print(subps)
+    # print(subps)
     # subLabel=['C']
     loop_n = len(subLabel)
     # print(loop_n)
     # dims=np.array([0,1])# biomass: [1,0]
     # shape=np.array([14*loop_n,4])# biomass: [14,8sect_n,
     shape = np.array([npfts, subps])
-    R22 = np.genfromtxt(data_path + ipool + "_R2.txt", delimiter=",")[:, :-1]
-    slope = np.genfromtxt(data_path + ipool + "_slope.txt", delimiter=",")[:, :-1]
-    dNRMSE = np.genfromtxt(data_path + ipool + "_dNRMSE.txt", delimiter=",")[:, :-1]
+    df = pd.read_csv(data_path + "MLacc_results.csv", index_col=[0, 1, 2])
+    df = df.loc[ipool].round(2)
+    R22 = df["R2"].unstack().values
+    slope = df["slope"].unstack().values
+    dNRMSE = df["dNRMSE"].unstack().values
 
-    print(R22)
+    # print(R22)
     yTickLabel = [
         "PFT02",
         "PFT03",
@@ -86,15 +88,15 @@ def plot_metric(data_path, npfts, ipool, subLabel, dims, sect_n, xTickLabel):
     mymap_rmse = mcolors.LinearSegmentedColormap.from_list("mylist", mycolor_rmse, N=5)
     # clip
     # print(npfts)
-    print(ipool)
-    print(sect_n)
+    # print(ipool)
+    # print(sect_n)
     if sect_n > 1:
         jq = sect_n * npfts
     else:
         jq = npfts
     for n in range(0, loop_n):
-        print(n)
-        print(dims)
+        # print(n)
+        # print(dims)
         if dims[0] == 0:
             R22_n = R22[n * jq : (n + 1) * jq, :]
             slope_n = slope[n * jq : (n + 1) * jq, :]
@@ -103,9 +105,9 @@ def plot_metric(data_path, npfts, ipool, subLabel, dims, sect_n, xTickLabel):
             R22_n = R22[n * subps : (n + 1) * subps, :]
             slope_n = slope[n * subps : (n + 1) * subps, :]
             dNRMSE_n = dNRMSE[n * subps : (n + 1) * subps, :]
-        print(R22_n)
+        # print(R22_n)
         R22_n = np.transpose(R22_n, dims).reshape(shape, order="F")
-        print(R22_n)
+        # print(R22_n)
         slope_n = np.transpose(slope_n, dims).reshape(shape, order="F")
         dNRMSE_n = np.transpose(dNRMSE_n, dims).reshape(shape, order="F")
         # R2_Cpools
diff --git a/Tools/extract_X.py b/Tools/extract_X.py
index 00746c3..de34d1a 100644
--- a/Tools/extract_X.py
+++ b/Tools/extract_X.py
@@ -17,65 +17,70 @@
 from Tools import *
 
 
-##@param[in]   packdata               packaged data
-##@param[in]   var_ind                index of variable
-##@param[in]   VarName                list of variables' names
-##@retval      VarN                   variable values of selected pixels
-def extract_X(packdata, var_ind, VarName):
-    Nlat = packdata.Nlat
-    Nlon = packdata.Nlon
-    varN = np.full(len(Nlat), np.nan)
-    var_data = packdata[VarName[var_ind]]
-    for cc in range(0, len(Nlat)):
-        varN[cc] = var_data[Nlat[cc], Nlon[cc]]
-    return varN
-
-
-##@param[in]   packdata               packaged data
-##@param[in]   var_ind                index of variable
-##@param[in]   VarName                list of variables' names
-##@param[in]   px                     index of PFT
-##@retval      VarN                   variable values of selected pixels
-def extract_XN(packdata, var_ind, VarName, px):
-    Nlat = packdata.Nlat
-    Nlon = packdata.Nlon
-    varN = np.full(len(Nlat), np.nan)
-    var_data = packdata[VarName[var_ind]]
-    var_pft_map = np.squeeze(var_data[px - 1][:][:])
-    for cc in range(0, len(Nlat)):
-        varN[cc] = var_pft_map[Nlat[cc], Nlon[cc]]
-    return varN
-
-
-##@param[in]   packdata               packaged data
-##@param[in]   PFT_mask               PFT mask
-##@param[in]   px                     index of PFT
-##@retval      VarN                   variable values of selected pixels
+def extract_X(packdata, var_name):
+    """
+    Extract variable values for selected pixels.
+
+    Args:
+        packdata (xarray.Dataset): Dataset containing input variables.
+        var_name (str): Name of the variable to extract.
+
+    Returns:
+        numpy.ndarray: Variable values for selected pixels.
+    """
+    var = packdata[var_name].values
+    return var[..., packdata.Nlat, packdata.Nlon]
+
+
+def extract_XN(packdata, var_name, px):
+    """
+    Extract variable values for selected pixels for a specific PFT.
+
+    Args:
+        packdata (xarray.Dataset): Dataset containing input variables.
+        var_name (str): Name of the variable to extract.
+        px (int): Index of the PFT.
+
+    Returns:
+        numpy.ndarray: Variable values for selected pixels and PFT.
+    """
+    var = packdata[var_name].values
+    return var[px - 1, packdata.Nlat, packdata.Nlon]
+
+
 def pft(packdata, PFT_mask, px):
-    Nlat = packdata.Nlat
-    Nlon = packdata.Nlon
-    varN = np.full(len(Nlat), np.nan)
-    for cc in range(0, len(Nlat)):
-        varN[cc] = PFT_mask[px - 1, Nlat[cc], Nlon[cc]]
-    return varN
+    """
+    Extract PFT mask for selected pixels.
+
+    Args:
+        packdata (xarray.Dataset): Dataset containing input variables.
+        PFT_mask (numpy.ndarray): Mask for Plant Functional Types.
+        px (int): Index of the PFT.
+
+    Returns:
+        numpy.ndarray: PFT mask values for selected pixels.
+    """
+    return PFT_mask[px - 1, packdata.Nlat, packdata.Nlon]
 
 
-##@param[in]   packdata               packaged data
-##@param[in]   ipft                   ith pft
-##@retval      extr_var               extracked data
 def var(packdata, ipft):
-    extr_var = np.zeros(shape=(len(packdata.Nlat), 0))
-    for indx in range(packdata.Nv_total):
-        if indx < packdata.Nv_nopft:
-            extracted_var = np.reshape(
-                extract_X(packdata, indx, packdata.var_pred_name),
-                (len(packdata.Nlat), 1),
-            )
-            extr_var = np.concatenate((extr_var, extracted_var), axis=1)
+    """
+    Extract all variables for a specific PFT.
+
+    Args:
+        packdata (xarray.Dataset): Dataset containing input variables.
+        ipft (int): Index of the PFT.
+
+    Returns:
+        numpy.ndarray: Array of extracted variable values for the specified PFT.
+    """
+    extr_var = []
+    for var_name in packdata.data_vars:
+        if "veget" not in packdata[var_name].dims:
+            extracted_var = extract_X(packdata, var_name)
         else:
-            extracted_var = np.reshape(
-                extract_XN(packdata, indx, packdata.var_pred_name, ipft),
-                (len(packdata.Nlat), 1),
-            )
-            extr_var = np.concatenate((extr_var, extracted_var), axis=1)
-    return extr_var
+            extracted_var = extract_XN(packdata, var_name, ipft)
+        extr_var.append(extracted_var.reshape(-1, len(packdata.Nlat), 1))
+    com_shape = max(map(np.shape, extr_var))
+    extr_var = [np.resize(a, com_shape) for a in extr_var]
+    return np.concatenate(extr_var, axis=-1)
diff --git a/Tools/genMask.py b/Tools/genMask.py
index f63f36e..4f5ea7f 100644
--- a/Tools/genMask.py
+++ b/Tools/genMask.py
@@ -17,12 +17,20 @@
 from Tools import *
 
 
-##@param[in]   packdata               packaged data
-##@param[in]   varlist                list of variables, including name of source files, variable names, etc.
-##@param[in]   thres                  threshold for PFT fraction to be defined as valid pixel
-##@retval      PFT_mask               PFT mask where PFT fraction < thres
-##@retval      PFT_mask_lai           PFT mask where LAI <0
 def PFT(packdata, varlist, thres):
+    """
+    Generate Plant Functional Type (PFT) masks based on PFT fraction and LAI.
+
+    Args:
+        packdata (xarray.Dataset): Dataset containing input variables, including LAI.
+        varlist (dict): Dictionary of variable information, including PFT mask source file and variable name.
+        thres (float): Threshold for PFT fraction to be defined as a valid pixel.
+
+    Returns:
+        tuple:
+            - PFT_mask (numpy.ndarray): Mask where PFT fraction > threshold (1 for valid, NaN for invalid).
+            - PFT_mask_lai (numpy.ndarray): Mask where PFT fraction > threshold and LAI >= 0 (1 for valid, NaN for invalid).
+    """
     f = Dataset(varlist["PFTmask"]["sourcefile"], "r")
     mkk = f[varlist["PFTmask"]["var"]][-1].filled(np.nan)
     PFT_fraction = np.squeeze(mkk)
diff --git a/Tools/mapGlobe.py b/Tools/mapGlobe.py
index 58aee5b..ee8df66 100644
--- a/Tools/mapGlobe.py
+++ b/Tools/mapGlobe.py
@@ -17,33 +17,42 @@
 from Tools import *
 
 
-##@param[in]   packdata               packaged data
-##@param[in]   ipft                   index of PFT
-##@param[in]   PFTmask                PFT mask
-##@param[in]   XVarName               input variables
-##@param[in]   Tree_Ens               tree ensemble
-##@param[in]   colum
-##@param[in]   Nm
-##@param[in]   labx
-##@retval      Pred_Y_map             predicted map of target variables, masking nan pixels
-##@retval      Pred_Y                 predicted map of target variables, without masking
-def extrp_global(packdata, ipft, PFTmask, XVarName, Tree_Ens, colum, Nm, labx):
+def extrp_global(packdata, ipft, PFTmask, XVarName, model, colum, Nm):
+    """
+    Extrapolate predictions globally using a trained model.
+
+    Args:
+        packdata (xarray.Dataset): Dataset containing input variables.
+        ipft (int): index of PFT
+        PFTmask (numpy.ndarray): Mask for Plant Functional Types.
+        XVarName (list): List of input variable names.
+        model: Trained machine learning model.
+        colum (str): Column name for encoding, or "None".
+        Nm (int): Number of categories for encoding.
+
+    Returns:
+        tuple:
+            - Pred_Y_map (numpy.ndarray): Predicted map of target variables, masking nan pixels.
+            - Pred_Y (numpy.ndarray): Predicted map of target variables, without masking.
+    """
+    if "year" in packdata.dims:
+        packdata = packdata.mean("year", keep_attrs=True)
     global_X_map = np.full((len(XVarName), packdata.nlat, packdata.nlon), np.nan)
     # PFTmask[np.isnan(PFTmask)]=0
     pmask = np.squeeze(PFTmask[ipft - 1][:])
     Pred_Y = np.full(PFTmask[0].shape, np.nan)
     # global metrics -> dataframe
-    for ii in range(len(XVarName)):
-        if ii < packdata.Nv_nopft:
-            global_X_map[ii] = packdata[XVarName[ii]][:]
+    for ii, arr in enumerate(packdata.values()):
+        if arr.ndim == 2:
+            global_X_map[ii] = arr.values
         else:
-            global_X_map[ii] = np.squeeze(packdata[XVarName[ii]][ipft - 1][:])
+            global_X_map[ii] = arr[ipft - 1].values
         #    global_X_map=lc['global_X_map']
         das = global_X_map.transpose(1, 2, 0)
     for llat in range(packdata.nlat):
         Xllat = das[llat][:][:]
         # Xllat[np.isnan(Xllat)]=-9999
-        Xtr = DataFrame(Xllat, columns=[labx])
+        Xtr = DataFrame(Xllat, columns=XVarName)
         ind = Xtr.index
         Xtr = Xtr.dropna()
         if len(Xtr) > 0:
@@ -53,7 +62,7 @@ def extrp_global(packdata, ipft, PFTmask, XVarName, Tree_Ens, colum, Nm, labx):
                 Xtr_encode = Xtr
             # Xtr.ix[:,colum]=(Xtr.ix[:,colum].astype(int)).astype(str)
             # Xtrr=pd.get_dummies(Xtr)
-            Ym = DataFrame(Tree_Ens.predict(Xtr_encode))
+            Ym = DataFrame(model.predict(Xtr_encode))
             Ym.index = Xtr_encode.index
             Ymm = Ym.reindex(index=range(max(ind) + 1))
             Pred_Y[llat][:] = np.squeeze(Ymm)
diff --git a/Tools/readvar.py b/Tools/readvar.py
index 1e22773..efa35f6 100644
--- a/Tools/readvar.py
+++ b/Tools/readvar.py
@@ -23,10 +23,18 @@ class PackData(dict):
     __getattr__ = dict.__getitem__
 
 
-##@param[in]   varlist                list of variables, including name of source files, variable names, etc.
-##@param[in]   config                 configurations
-##@param[in]   logfile                logfile
 def readvar(varlist, config, logfile):
+    """
+    Read and process variables from input files.
+
+    Args:
+        varlist (dict): Dictionary of variable information.
+        config (object): Configuration object.
+        logfile (file): File object for logging.
+
+    Returns:
+        xarray.Dataset: Dataset containing processed variables.
+    """
     adict = locals()
     # 0 initialize latitude and longitudes
     f = Dataset(varlist["coord_ref"], "r")
@@ -49,7 +57,6 @@ def readvar(varlist, config, logfile):
             )
             # DSG bugfix_start: remove hardcode
             # filename='crujra_twodeg_v2_'+str(year)+'.nc'
-            # f=Dataset(climvar['sourcepath']+filename,'r')
             f = Dataset(
                 climvar["sourcepath"] + climvar["filename"] + str(year) + ".nc", "r"
             )
@@ -76,35 +83,19 @@ def readvar(varlist, config, logfile):
                 zstart = count + 1
 
             var_month_year[year - climvar["year_start"]] = var_month
-            # eval('MY'+varname_clim[index]+'=var_month_year')
         adict["MY%s" % varname_clim[index]] = var_month_year[:]
 
     # 0.1.1 Tair (Tmax, Tmin, Tmean,Tstd,AMT)
-    Tair = adict["MYTair"] - 273.15
-    packdata.Tmean = np.mean(Tair, axis=(0, 1))
-    #  pyplot.imshow(Tmean)
-    packdata.Tstd = np.std(np.mean(Tair, axis=0), axis=0)
-    packdata.Tmin = np.min(np.mean(Tair, axis=0), axis=0)
-    packdata.Tmax = np.max(np.mean(Tair, axis=0), axis=0)
-    packdata.Tamp = packdata.Tmax - packdata.Tmin
+    packdata.Tair = (["year", "month", "lat", "lon"], adict["MYTair"] - 273.15)
 
     # 0.1.2 Other climatic variables (Rainf,Snowf,Qair,Psurf,SWdown,LWdown)
-    for index in range(len(varname_clim)):
-        if varname_clim[index] == "Tair":
-            continue
-        trav = adict["MY" + varname_clim[index]]
-        if varname_clim[index] in ["Rainf", "Snowf"]:
-            meanv = 365 * 24 * 3600 * np.mean(np.mean(trav, axis=0), axis=0)
-            stdv = np.std(30 * 24 * 3600 * np.mean(trav, axis=0), axis=0)
-        else:
-            meanv = np.mean(np.mean(trav, axis=0), axis=0)
-            stdv = np.std(np.mean(trav, axis=0), axis=0)
-        packdata[varname_clim[index] + "_std"] = stdv
-        packdata[varname_clim[index] + "_mean"] = meanv
+    packdata.update(
+        (k, (["year", "month", "lat", "lon"], adict[f"MY{k}"])) for k in varname_clim
+    )
 
     # 0.1.3 P and T for growing season (Pre_GS, Temp_GS, GS_length)
     pre = 30 * 24 * 3600 * np.mean(adict["MYRainf"], axis=0)
-    temp = np.mean(Tair, axis=0)
+    temp = np.mean(packdata.Tair[1], axis=0)
     Pre_GS_v = np.full((12, nlat, nlon), np.nan)
     Temp_GS_v = np.full((12, nlat, nlon), np.nan)
     GS_length_v = np.full((12, nlat, nlon), np.nan)
@@ -118,10 +109,9 @@ def readvar(varlist, config, logfile):
         Pre_GS_v[month - 1] = GS_mask * pre[month - 1]
         Temp_GS_v[month - 1] = GS_mask * temp[month - 1]
         GS_length_v[month - 1] = GS_mask * land
-    packdata.GS_length = np.sum(GS_length_v, axis=0)
-    # np.where(np.isnan(Tair[0]),GS_length,np.nan)
-    packdata.Pre_GS = np.sum(Pre_GS_v, axis=0)
-    packdata.Temp_GS = np.sum(Temp_GS_v, axis=0)
+    packdata.GS_length = (("lat", "lon"), np.sum(GS_length_v, axis=0))
+    packdata.Pre_GS = (("lat", "lon"), np.sum(Pre_GS_v, axis=0))
+    packdata.Temp_GS = (("lat", "lon"), np.sum(Temp_GS_v, axis=0))
 
     # 0.2 read other variables, including Edaphic variables, N and P deposition variables
     predvar = varlist["pred"]
@@ -167,20 +157,38 @@ def readvar(varlist, config, logfile):
                 if "missing_value" in predvar[ipred].keys():
                     da[da == predvar[ipred]["missing_value"]] = np.nan
                 if isinstance(da, np.ma.masked_array):
-                    packdata[rename[ivar]] = da.filled(np.nan)
-                else:
-                    packdata[rename[ivar]] = da
+                    da = da.filled(np.nan)
+                packdata[rename[ivar]] = (["veget", "lat", "lon"][-da.ndim :], da)
 
-    # 0.3 Interactions between variables
-    packdata.interx1 = packdata.Tmean * packdata.Rainf_mean
-    packdata.interx2 = packdata.Temp_GS * packdata.Pre_GS
+    ds = xarray.Dataset(packdata)
 
-    # insert dimension names for each variable
-    for k, v in packdata.items():
-        if k not in ["lat", "lon"]:
-            packdata[k] = (["veget", "lat", "lon"][-v.ndim :], v)
+    for var, arr in ds.data_vars.items():
+        if "year" in arr.dims:
+            if config.take_year_average:
+                arr = arr.mean("year")
+            if var in ("Rainf", "Snowf"):
+                stats = {
+                    f"{var}_mean": 365 * 24 * 3600 * arr.mean("month"),
+                    f"{var}_std": 30 * 24 * 3600 * arr.std("month"),
+                }
+            elif var == "Tair":
+                stats = dict(
+                    Tmean=arr.mean("month"),
+                    Tstd=arr.std("month"),
+                    Tmin=arr.min("month"),
+                    Tmax=arr.max("month"),
+                    # Tamp=arr.max("month") - arr.min("month"),
+                )
+            else:
+                stats = {
+                    f"{var}_mean": arr.mean("month"),
+                    f"{var}_std": arr.std("month"),
+                }
+            ds = ds.drop_vars(var).assign(stats)
 
-    ds = xarray.Dataset(packdata)
+    # 0.3 Interactions between variables
+    ds["interx1"] = ds.Tmean * ds.Rainf_mean
+    ds["interx2"] = ds.Temp_GS * ds.Pre_GS
 
     ds.attrs.update(
         nlat=nlat, nlon=nlon, lat_reso=varlist["lat_reso"], lon_reso=varlist["lon_reso"]
diff --git a/Tools/train.py b/Tools/train.py
index 6768b8c..3d93be2 100644
--- a/Tools/train.py
+++ b/Tools/train.py
@@ -17,30 +17,41 @@
 from Tools import *
 
 
-##@param[in]   XY_train               latitudes of selected pixels
-##@param[in]   logfile                logfile
-##@param[in]   loocv                  do leave-one-out-cross-validation(1) or not (0)
-##@retval      TreeEns                Tree ensemble
-##@retval      predY                  predicted Y
-def training_BAT(XY_train, logfile, loocv, alg="bt"):
-    XX = XY_train.iloc[:, 1:].values
-    YY = XY_train.iloc[:, 0].values
-    # labels=np.zeros(shape=(len(YY),1))
+def training_BAT(XY_train, logfile, config, alg="gbm"):
+    """
+    Train a machine learning model using Balanced Augmentation Technique (BAT).
+
+    Args:
+        XY_train (pandas.DataFrame): Training dataset.
+        logfile (file): File object for logging.
+        config (module): module of config.
+        alg (str): ML algorithm to use (options: "nn", "bt", "rf", "gbm").
+        bat (bool): Whether to augment the dataset with SMOTE.
+
+    Returns:
+        tuple:
+            - model: Trained machine learning model.
+            - predY (numpy.ndarray): Predicted Y values.
+    """
+    Xtrain = XY_train.drop(columns="Y")
+    Ytrain = XY_train["Y"]
+    # labels=np.zeros(shape=(len(Ytrain),1))
+
+    loocv_R2 = np.nan
+    loocv_reMSE = np.nan
+    loocv_slope = np.nan
+    loocv_dNRMSE = np.nan
+    loocv_sNRMSE = np.nan
+    loocv_iNRMSE = np.nan
+    loocv_f_SB = np.nan
+    loocv_f_SDSD = np.nan
+    loocv_f_LSC = np.nan
 
     # if length of unique target is one
-    if len(np.unique(YY)) == 1:
+    if len(np.unique(Ytrain)) == 1:
         # return a set of default values and an empty TreeEnsemble
-        TreeEns = []
-        predY = YY
-        loocv_R2 = np.nan
-        loocv_reMSE = np.nan
-        loocv_slope = np.nan
-        loocv_dNRMSE = np.nan
-        loocv_sNRMSE = np.nan
-        loocv_iNRMSE = np.nan
-        loocv_f_SB = np.nan
-        loocv_f_SDSD = np.nan
-        loocv_f_LSC = np.nan
+        TreeEns = None
+        predY = Ytrain
         return (
             TreeEns,
             predY,
@@ -57,52 +68,54 @@ def training_BAT(XY_train, logfile, loocv, alg="bt"):
 
     # If the length of unique target variable is not 1,
     # run the KMeans algorithm to find the cluster centers, and resample the data
-    try:
-        mod = KMeans(n_clusters=3)
-        lab = mod.fit_predict(np.reshape(YY, (-1, 1)))
-        count = Counter(lab)
-        check.display("Counter(lab):" + str(count), logfile)
-        over_samples = SMOTE()
-        over_samples_X, over_samples_y = over_samples.fit_resample(XY_train, lab)
-        check.display(
-            "Counter(over_samples_y):" + str(Counter(over_samples_y)), logfile
-        )
-        Xtrain = over_samples_X.iloc[:, 1:]
-        Ytrain = over_samples_X.iloc[:, 0]
-    #  else:
-    except:
-        mod = KMeans(n_clusters=2)
-        lab = mod.fit_predict(np.reshape(YY, (-1, 1)))
-        count = Counter(lab)
-        check.display("Counter(lab):" + str(Counter(lab)), logfile)
-        # resample requires minimum number of a cluster >=6, if not, then repeat current samples
-        for label, number in count.items():
-            if number < 6:
-                XY_train = pd.concat(
-                    (XY_train,)
-                    + (XY_train[lab == label],) * int(np.ceil(6 / number) - 1)
-                )
-                lab = np.hstack(
-                    (lab, np.repeat(lab[lab == label], int(np.ceil(6 / number) - 1)))
-                )
-        #        print(len(lab),number,int(np.ceil(6/number)))
-        check.display("Counter(lab):" + str(Counter(lab)), logfile)
-        over_samples = SMOTE()
-        over_samples_X, over_samples_y = over_samples.fit_resample(XY_train, lab)
-        check.display(
-            "Counter(over_samples_y):" + str(Counter(over_samples_y)), logfile
-        )
-        Xtrain = over_samples_X.iloc[:, 1:]
-        Ytrain = over_samples_X.iloc[:, 0]
+    if config.smote_bat:
+        try:
+            mod = KMeans(n_clusters=3)
+            lab = mod.fit_predict(np.reshape(Ytrain.values, (-1, 1)))
+            count = Counter(lab)
+            check.display("Counter(lab):" + str(count), logfile)
+            over_samples = SMOTE()
+            over_samples_X, over_samples_y = over_samples.fit_resample(XY_train, lab)
+            check.display(
+                "Counter(over_samples_y):" + str(Counter(over_samples_y)), logfile
+            )
+            Xtrain = over_samples_X.iloc[:, 1:]
+            Ytrain = over_samples_X.iloc[:, 0]
+        except:
+            mod = KMeans(n_clusters=2)
+            lab = mod.fit_predict(np.reshape(Ytrain.values, (-1, 1)))
+            count = Counter(lab)
+            check.display("Counter(lab):" + str(Counter(lab)), logfile)
+            # resample requires minimum number of a cluster >=6, if not, then repeat current samples
+            for label, number in count.items():
+                if number < 6:
+                    XY_train = pd.concat(
+                        (XY_train,)
+                        + (XY_train[lab == label],) * int(np.ceil(6 / number) - 1)
+                    )
+                    lab = np.hstack(
+                        (
+                            lab,
+                            np.repeat(lab[lab == label], int(np.ceil(6 / number) - 1)),
+                        )
+                    )
+            check.display("Counter(lab):" + str(Counter(lab)), logfile)
+            over_samples = SMOTE()
+            over_samples_X, over_samples_y = over_samples.fit_resample(XY_train, lab)
+            check.display(
+                "Counter(over_samples_y):" + str(Counter(over_samples_y)), logfile
+            )
+            Xtrain = over_samples_X.iloc[:, 1:]
+            Ytrain = over_samples_X.iloc[:, 0]
 
     SW = np.ones(shape=(len(Ytrain),))
-    sq1 = np.percentile(YY, 2)
-    sq2 = np.percentile(YY, 10)
-    sq5 = np.percentile(YY, 75)
-    sq6 = np.percentile(YY, 95)
-    sq7 = np.percentile(YY, 99)
-    sq8 = np.percentile(YY, 99.5)
-    sq9 = np.percentile(YY, 99.9)
+    sq1 = np.percentile(Ytrain, 2)
+    sq2 = np.percentile(Ytrain, 10)
+    sq5 = np.percentile(Ytrain, 75)
+    sq6 = np.percentile(Ytrain, 95)
+    sq7 = np.percentile(Ytrain, 99)
+    sq8 = np.percentile(Ytrain, 99.5)
+    sq9 = np.percentile(Ytrain, 99.9)
     SW[Ytrain < sq2] = 2
     SW[Ytrain < sq1] = 4
     SW[Ytrain > sq5] = 1.5
@@ -113,12 +126,14 @@ def training_BAT(XY_train, logfile, loocv, alg="bt"):
 
     if alg == "nn":
         model = MLPRegressor(
-            hidden_layer_sizes=(64, 64),
-            max_iter=100,
+            hidden_layer_sizes=(32, 32),
+            activation="tanh",
+            solver="lbfgs" if len(Ytrain) < 800 else "adam",
             learning_rate="invscaling",
-            learning_rate_init=0.5,
+            learning_rate_init=0.6,
+            power_t=0.99,
+            max_iter=int(1e6 / len(Ytrain)),
             random_state=1000,
-            verbose=True,
         )
     elif alg == "bt":
         model = BaggingRegressor(
@@ -129,40 +144,95 @@ def training_BAT(XY_train, logfile, loocv, alg="bt"):
         )
     elif alg == "rf":
         model = RandomForestRegressor(
+            n_estimators=500,
             max_samples=0.8,
-            n_estimators=300,
+            max_depth=30,
             random_state=1000,
         )
     elif alg == "gbm":
         model = XGBRegressor(
-            n_estimators=300,
-            max_samples=0.8,
+            n_estimators=500,
+            max_depth=16,
+            # learning_rate=0.001,
             random_state=1000,
-            verbose=3,
+        )
+    elif alg == "ridge":
+        model = RidgeCV()
+    elif alg == "best":
+        model = select_best_model(
+            Xtrain,
+            Ytrain,
+            [
+                (
+                    "bt",
+                    BaggingRegressor(
+                        DecisionTreeRegressor(random_state=1000),
+                        max_samples=0.8,
+                        n_estimators=300,
+                        random_state=1000,
+                    ),
+                ),
+                (
+                    "nn",
+                    MLPRegressor(
+                        hidden_layer_sizes=(32, 32),
+                        activation="tanh",
+                        solver="lbfgs" if len(Ytrain) < 800 else "adam",
+                        learning_rate="invscaling",
+                        learning_rate_init=0.6,
+                        power_t=0.99,
+                        max_iter=int(1e6 / len(Ytrain)),
+                        random_state=1000,
+                    ),
+                ),
+                # (
+                #     "rf",
+                #     RandomForestRegressor(
+                #         n_estimators=500,
+                #         max_samples=0.8,
+                #         max_depth=30,
+                #         random_state=1000,
+                #     ),
+                # ),
+                (
+                    "gbm",
+                    XGBRegressor(
+                        n_estimators=500,
+                        # max_depth=16,
+                        # learning_rate=0.001,
+                        random_state=1000,
+                    ),
+                ),
+                (
+                    "ridge",
+                    RidgeCV(),
+                ),
+            ],
+            SW=SW,
         )
     else:
         raise ValueError("invalid ML algorithm name")
 
-    model.fit(Xtrain, Ytrain, sample_weight=SW)  # sample_weight=SW
+    model = Pipeline(
+        [
+            ("scaler", StandardScaler()),
+            ("estimator", model),
+        ]
+    )
+
+    try:
+        model.fit(Xtrain, Ytrain, estimator__sample_weight=SW)
+    except TypeError:
+        model.fit(Xtrain, Ytrain)
 
     # predict
-    predY = model.predict(XX)
+    predY = model.predict(Xtrain)
 
     # leave one out cross validations
     loo = LeaveOneOut()
     ytests = []
     ypreds = []
-    loocv_R2 = np.nan
-    loocv_reMSE = np.nan
-    loocv_slope = np.nan
-    loocv_RMSE = np.nan
-    loocv_dNRMSE = np.nan
-    loocv_sNRMSE = np.nan
-    loocv_iNRMSE = np.nan
-    loocv_f_SB = np.nan
-    loocv_f_SDSD = np.nan
-    loocv_f_LSC = np.nan
-    if loocv == 1:
+    if config.leave_one_out_cv:
         XM = Xtrain.values
         YM = Ytrain.values
         # check.display('weidu'+str(np.shape(Xtrain)),logfile)
@@ -210,3 +280,24 @@ def training_BAT(XY_train, logfile, loocv, alg="bt"):
         loocv_f_SDSD,
         loocv_f_LSC,
     )
+
+
+def select_best_model(X, Y, estimators, SW):
+    Xtrain, Xval, Ytrain, Yval, SWtrain, _ = train_test_split(X, Y, SW, test_size=0.25)
+    scores = []
+    for name, estimator in estimators:
+        model = deepcopy(
+            Pipeline(
+                [
+                    ("scaler", StandardScaler()),
+                    ("estimator", estimator),
+                ]
+            )
+        )
+        try:
+            model.fit(Xtrain, Ytrain, estimator__sample_weight=SWtrain)
+        except TypeError:
+            model.fit(Xtrain, Ytrain)
+        Ypred = model.predict(Xval)
+        scores.append(r2_score(Yval, Ypred))
+    return estimators[np.argmax(scores)][1]
diff --git a/main.py b/main.py
index 814e942..59870fd 100644
--- a/main.py
+++ b/main.py
@@ -3,6 +3,14 @@
 """
 MLacc - Machine-Learning-based acceleration of spin-up
 
+This script orchestrates the entire MLacc workflow, including:
+- Data preparation and clustering
+- Machine learning model training and prediction
+- Result evaluation and visualization
+
+The workflow is controlled by configuration settings and can be run in different modes
+depending on the specified tasks.
+
 Copyright Laboratoire des Sciences du Climat et de l'Environnement (LSCE)
           Unite mixte CEA-CNRS-UVSQ
 
@@ -64,6 +72,9 @@
     check.display("MLacc start from scratch...", logfile)
     # initialize packaged data
     packdata = readvar(varlist, config, logfile)
+    if os.path.exists(resultpath + "packdata.nc"):
+        refdata = xarray.load_dataset(resultpath + "packdata.nc")
+        assert (refdata == packdata).all()
     packdata.to_netcdf(resultpath + "packdata.nc")
 
 # Define random seed
@@ -108,7 +119,7 @@
     )
     # Run test of reproducibility for the task if yes
     if config.repro_test_task_1:
-        subprocess.run(["python", "tests/task1_log.py"])
+        subprocess.run(["python", "-m", "pytest", "tests/test_task1.py"])
         check.display(
             "Task 1 reproducibility test results have been stored in tests_results.txt",
             logfile,
@@ -126,7 +137,6 @@
     IDsel.dump(resultpath + "IDsel.npy")
     check.display("clustering done!\nResults have been stored as IDx.npy", logfile)
 
-    #
     # plot clustering results
     kpfts = varlist["clustering"]["pfts"]
     for ipft in range(len(kpfts)):
@@ -143,7 +153,7 @@
     )
     # Run test of reproducibility for the task if yes
     if config.repro_test_task_2:
-        subprocess.run(["python", "tests/task2_log.py"])
+        subprocess.run(["python", "-m", "pytest", "tests/test_task2.py"])
         check.display(
             "Task 2 reproducibility test results have been stored in tests_results.txt",
             logfile,
@@ -157,22 +167,17 @@
     forcing.write(varlist, resultpath, IDx)
     # Run test of reproducibility for the task if yes
     if config.repro_test_task_3:
-        subprocess.run(["python", "tests/task3_log.py"])
-        subprocess.run(["python", "tests/task3_2_log.py"])
+        subprocess.run(["python", "-m", "pytest", "tests/test_task3.py"])
+        subprocess.run(["python", "-m", "pytest", "tests/test_task3_2.py"])
         check.display(
             "Task 3 reproducibility test results have been stored in tests_results.txt",
             logfile,
         )
     check.display("task 3: done", logfile)
 if "4" in itask:
-    # ML extrapolation
-
     var_pred_name1 = varlist["pred"]["allname"]
     var_pred_name2 = varlist["pred"]["allname_pft"]
     var_pred_name = var_pred_name1 + var_pred_name2
-    # packdata.Nv_nopft = len(var_pred_name1)
-    # packdata.Nv_total = len(var_pred_name)
-    # packdata.var_pred_name = var_pred_name
 
     # Response variables
     Yvar = varlist["resp"]["variables"]
@@ -185,8 +190,6 @@
         packdata, varlist, varlist["PFTmask"]["pred_thres"]
     )
 
-    # packdata.attrs['Nlat'] = np.trunc((90 - IDx[:, 0]) / packdata.lat_reso).astype(int)
-    # packdata.attrs['Nlon'] = np.trunc((180 + IDx[:, 1]) / packdata.lon_reso).astype(int)
     packdata.attrs.update(
         Nv_nopft=len(var_pred_name1),
         Nv_total=len(var_pred_name),
@@ -194,7 +197,7 @@
         Nlat=np.trunc((90 - IDx[:, 0]) / packdata.lat_reso).astype(int),
         Nlon=np.trunc((180 + IDx[:, 1]) / packdata.lon_reso).astype(int),
     )
-    labx = ["Y"] + var_pred_name + ["pft"]
+    labx = ["Y"] + list(packdata.data_vars) + ["pft"]
 
     # copy the restart file to be modified
     targetfile = (
@@ -207,22 +210,39 @@
     # add rights to manipulate file:
     os.chmod(restfile, 0o644)
 
-    result = []
-    for ipool in Yvar.keys():
-        check.display("processing %s..." % ipool, logfile)
-        res_df = ML.MLloop(
-            packdata,
-            ipool,
-            logfile,
-            varlist,
-            labx,
-            resultpath,
-            loocv,
-            restfile,
-        )
-        result.append(res_df)
-    result_df = pd.concat(result, keys=Yvar.keys(), names=["comp"])
-    result_df.to_csv(resultpath + "MLacc_results.csv")
+    for alg in config.algorithms:
+        result = []
+        for ipool in Yvar.keys():
+            check.display("processing %s..." % ipool, logfile)
+            res_df = ML.MLloop(
+                packdata,
+                ipool,
+                logfile,
+                varlist,
+                labx,
+                config,
+                restfile,
+                alg,
+            )
+            result.append(res_df)
+        res_df = pd.concat(result, keys=Yvar.keys(), names=["comp"])
+        print(res_df)
+
+        scores = res_df.mean()[["R2", "slope"]].to_frame().T
+        scores = scores.assign(alg=alg).set_index("alg")
+        path = Path(resultpath + "ML_log.csv")
+        scores.to_csv(path, mode="a", header=not path.exists())
+
+        res_path = Path(resultpath + "MLacc_results.csv")
+        if res_path.exists():
+            ref_df = pd.read_csv(res_path, index_col=[0, 1, 2])
+            perf_diff = res_df["slope"] - ref_df["slope"]
+            if perf_diff.mean() > 0 and (perf_diff > 0).mean() > 0.5:
+                res_df.to_csv(res_path)
+            else:
+                print("Degraded performance:", perf_diff.mean(), (perf_diff > 0).mean())
+        else:
+            res_df.to_csv(res_path)
 
     # we need to handle additional variables in the restart files but are not state variables of ORCHIDEE
 
@@ -248,8 +268,8 @@
             restnc.close()
         # Run test of reproducibility for the task if yes
     if config.repro_test_task_4:
-        subprocess.run(["python", "tests/task4_log.py"])
-        subprocess.run(["python", "tests/task4_2_log.py"])
+        subprocess.run(["python", "-m", "pytest", "tests/test_task4.py"])
+        subprocess.run(["python", "-m", "pytest", "tests/test_task4_2.py"])
         check.display(
             "Task 4 reproducibility test results have been stored in tests_results.txt",
             logfile,
@@ -262,7 +282,6 @@
         subpool_name = varlist["resp"]["pool_name_" + ipool]
         npfts = varlist["resp"]["npfts"]
         subLabel = varlist["resp"]["sub_item"]
-        print(subLabel)
         pp = varlist["resp"]["dim"][ipool]
         sect_n = varlist["resp"]["sect_n"][ipool]
         if pp[0] == "pft":
diff --git a/requirements.txt b/requirements.txt
index 212b8d7..210e78b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,3 +7,6 @@ pandas==1.5.2
 scikit_learn==1.1.3
 scipy==1.9.3
 xarray==2022.11.0
+xgboost
+pytest
+ruff
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..bade176
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,21 @@
+import numpy as np
+import pytest
+
+
+config_path = "DEF_Trunk/MLacc.def"
+
+
+@pytest.fixture
+def reference_path():  # reference EXE_DIR in zenodo
+    with open(config_path) as f:
+        return f.readlines()[25].strip()
+
+
+@pytest.fixture
+def test_path():  # test EXE_DIR
+    return "./EXE_DIR/"
+
+
+@pytest.fixture
+def output_file(test_path):  # logging file for reproducibility tests results
+    return test_path + "tests_results.txt"
diff --git a/tests/task1_log.py b/tests/task1_log.py
deleted file mode 100644
index 2ad46d0..0000000
--- a/tests/task1_log.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import os
-
-import numpy as np
-
-# -------------------------------variables----------------------------------------------------
-from config import output_file, reference_path, test_path
-
-# reference_path = '/home/surface10/mrasolon/SPINacc_12_7/EXE_DIR/'
-# test_path = '/home/surface10/mrasolon/SPINacc_12_7/EXE_DIR_3/'  # Enter the path from your EXE_DIR
-
-# output_file = test_path + 'tests_results.txt'  # Define the output file
-# ------------------------end of variables----------------------------------------------------
-
-
-# -------------------------------functions----------------------------------------------------
-def compare_npy_files(file1, file2, output_file):
-    # Read binary data from the files
-    with open(file1, "rb") as f1, open(file2, "rb") as f2:
-        data1 = f1.read()
-        data2 = f2.read()
-
-    # Compare the binary data
-    with open(output_file, "a") as out_file:
-        if data1 == data2:
-            out_file.write(f"The contents of {file1} and {file2} are identical.\n")
-        else:
-            out_file.write(f"The contents of {file1} and {file2} are different.\n")
-
-
-# -------------------------------end of functions----------------------------------------------
-
-# ------------------------------ Comparison of dist_all as .txt files--------------------------
-# Load the first .npy file
-array1 = np.load(reference_path + "dist_all.npy", allow_pickle=True)
-
-# Save the first array as a text file
-np.savetxt("dist_all_ref.txt", array1)
-
-# Load the second .npy file
-array2 = np.load(test_path + "dist_all.npy", allow_pickle=True)
-
-# Save the second array as a text file
-np.savetxt("dist_all_test.txt", array2)
-
-# Load the text files
-text_array1 = np.loadtxt("dist_all_ref.txt")
-text_array2 = np.loadtxt("dist_all_test.txt")
-
-# Calculate the element-wise absolute error
-error_array = np.abs(text_array1 - text_array2)
-
-
-# Print or use the error array as needed
-with open(output_file, "a") as out_file:
-    # WRITE TASK
-    out_file.write(" \n")
-    out_file.write(
-        f"################################### REPRODUCIBILITY TEST FOR TASK 1 ################################### \n"
-    )
-    out_file.write(" \n")
-    out_file.write("Element-wise absolute error array for dist_all.npy:\n")
-    out_file.write(str(error_array) + "\n")
-    out_file.write("error_array_size: " + str(error_array.shape) + "\n")
-
-# Delete the text files
-os.remove("dist_all_ref.txt")
-os.remove("dist_all_test.txt")
-# -------------------------End of comparison of dist_all as .txt files--------------------------
-
-# -----------------------Compare packdata and auxil files as binary files-----------------------
-file_paths = [
-    (reference_path + "auxil.npy", test_path + "auxil.npy"),
-    (reference_path + "packdata.npy", test_path + "packdata.npy"),
-    (reference_path + "dist_all.npy", test_path + "dist_all.npy"),
-]
-
-for file1, file2 in file_paths:
-    compare_npy_files(file1, file2, output_file)
-# --------------------------end of comparison test code----------------------------------------------------
diff --git a/tests/task2_log.py b/tests/task2_log.py
deleted file mode 100644
index 092c0d3..0000000
--- a/tests/task2_log.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# -------------------------------variables----------------------------------------------------
-from config import output_file, reference_path, test_path
-
-# reference_path = '/home/surface10/mrasolon/SPINacc_12_7/EXE_DIR_2/'
-# test_path = '/home/surface10/mrasolon/SPINacc_12_7/EXE_DIR/'
-
-# output_file = '/home/surface10/mrasolon/tests/tests_results.txt'  # Define the output file
-# ------------------------end of variables----------------------------------------------------
-
-
-# -------------------------------functions----------------------------------------------------
-def compare_npy_files(file1, file2, output_file):
-    # Read binary data from the files
-    with open(file1, "rb") as f1, open(file2, "rb") as f2:
-        data1 = f1.read()
-        data2 = f2.read()
-
-    # Compare the binary data
-    with open(output_file, "a") as out_file:
-        if data1 == data2:
-            out_file.write(f"The contents of {file1} and {file2} are identical.\n")
-        else:
-            out_file.write(f"The contents of {file1} and {file2} are different.\n")
-
-
-# -------------------------------end of functions----------------------------------------------
-
-# ------------------------------ Comparison of dist_all as .txt files--------------------------
-# -------------------------End of comparison of dist_all as .txt files--------------------------
-
-# -----------------------Compare packdata and auxil files as binary files-----------------------
-file_paths = [
-    (reference_path + "IDloc.npy", test_path + "IDloc.npy"),
-    (reference_path + "IDsel.npy", test_path + "IDsel.npy"),
-    (reference_path + "IDx.npy", test_path + "IDx.npy"),
-]
-# WRITE TASK
-with open(output_file, "a") as out_file:
-    out_file.write(" \n")
-    out_file.write(
-        f"################################### REPRODUCIBILITY TEST FOR TASK 2: ################################### \n"
-    )
-    out_file.write(" \n")
-for file1, file2 in file_paths:
-    compare_npy_files(file1, file2, output_file)
-# --------------------------end of comparison test code----------------------------------------------------
diff --git a/tests/task3_2_log.py b/tests/task3_2_log.py
deleted file mode 100644
index 071d921..0000000
--- a/tests/task3_2_log.py
+++ /dev/null
@@ -1,75 +0,0 @@
-import netCDF4 as nc
-
-# -------------------------------variables----------------------------------------------------
-from config import output_file, reference_path, test_path
-
-# reference_path = '/home/surface10/mrasolon/SPINacc_12_7/EXE_DIR_2/'
-# test_path = '/home/surface10/mrasolon/SPINacc_12_7/EXE_DIR/'  # Enter the path from your EXE_DIR
-
-# output_file = '/home/surface10/mrasolon/tests/tests_results.txt'  # Define the output file
-
-# ------------------------end of variables----------------------------------------------------
-
-
-def compare_nc_files(file1_path, file2_path, output_file, print_diff_flag=False):
-    # Open the netCDF files
-    with open(output_file, "a") as output:
-        output.write(" \n")
-        output.write("For " + file1_path + " and " + file2_path + "\n")
-        output.write(" \n")
-        nc1 = nc.Dataset(file1_path)
-        nc2 = nc.Dataset(file2_path)
-
-        # Compare dimensions
-        if set(nc1.dimensions.keys()) == set(nc2.dimensions.keys()):
-            output.write("Dimensions are identical.\n")
-        else:
-            output.write("Dimensions are different.\n")
-
-        # Compare variables
-        if set(nc1.variables.keys()) == set(nc2.variables.keys()):
-            output.write("Variables are identical.\n")
-        else:
-            output.write("Variables are different.\n")
-
-        # Compare variable values
-        variable_diff_flag = False  # Flag to track if any variable values are different
-        for variable in set(nc1.variables.keys()) & set(nc2.variables.keys()):
-            values1 = nc1[variable][:]
-            values2 = nc2[variable][:]
-
-            if (values1 == values2).all():
-                output.write(f"Variable '{variable}' values are identical.\n")
-            else:
-                output.write(f"Variable '{variable}' values are different.\n")
-                variable_diff_flag = True  # Set the flag to True
-
-                if print_diff_flag:
-                    output.write(f"Values in {file1_path}: {values1}\n")
-                    output.write(f"Values in {file2_path}: {values2}\n")
-
-        # Display a message if any variable values are different
-        if variable_diff_flag:
-            output.write("Some variable values are different.\n")
-
-        # Close the netCDF files
-        nc1.close()
-        nc2.close()
-
-
-# Example usage
-output_file_path = output_file
-
-file1_path = test_path + "SRF_FGSPIN.10Y.ORC22v8034_19101231_sechiba_rest.nc"
-file2_path = reference_path + "SRF_FGSPIN.10Y.ORC22v8034_19101231_sechiba_rest.nc"
-
-file3_path = test_path + "OOL_FGSPIN.10Y.ORC22v8034_19101231_driver_rest.nc"
-file4_path = reference_path + "OOL_FGSPIN.10Y.ORC22v8034_19101231_driver_rest.nc"
-
-file5_path = test_path + "SBG_FGSPIN.10Y.ORC22v8034_19101231_stomate_rest.nc"
-file6_path = reference_path + "SBG_FGSPIN.10Y.ORC22v8034_19101231_stomate_rest.nc"
-
-
-compare_nc_files(file1_path, file2_path, output_file_path, print_diff_flag=True)
-compare_nc_files(file3_path, file4_path, output_file_path, print_diff_flag=True)
-compare_nc_files(file5_path, file6_path, output_file_path, print_diff_flag=True)
diff --git a/tests/task3_log.py b/tests/task3_log.py
deleted file mode 100644
index 52482f9..0000000
--- a/tests/task3_log.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import netCDF4 as nc
-
-# -------------------------------variables----------------------------------------------------
-from config import output_file, reference_path, test_path
-
-# Define the base file path
-# reference_path='/home/surface10/mrasolon/SPINacc_12_7/EXE_DIR_2/'
-# test_path='/home/surface10/mrasolon/SPINacc_12_7/EXE_DIR/'
-base_file_path_1 = reference_path + "forcing_aligned_"
-base_file_path_2 = test_path + "forcing_aligned_"
-# List of years to compare (assuming from 1902 to 1910)
-years_to_compare = range(1901, 1911)
-
-# Open a file for writing
-# output_file_path = '/home/surface10/mrasolon/tests/tests_results.txt'
-output_file_path = output_file
-# -----------------------end of variables------------------------------------------------------
-
-with open(output_file_path, "a") as output_file:
-    output_file.write(" \n")
-    output_file.write(
-        f"################################### REPRODUCIBILITY TEST FOR TASK 3 ################################### \n"
-    )
-    output_file.write(" \n")
-    # Loop through each year
-    for year in years_to_compare:
-        # Construct the file paths for the current year
-        file_path_nc1 = f"{base_file_path_1}{year}.nc"
-        file_path_nc2 = f"{base_file_path_2}{year}.nc"
-
-        # Open the NetCDF files
-        dataset_nc1 = nc.Dataset(file_path_nc1)
-        dataset_nc2 = nc.Dataset(file_path_nc2)
-
-        # Compare dimensions
-        if set(dataset_nc1.dimensions.keys()) == set(dataset_nc2.dimensions.keys()):
-            output_file.write(f"DIMENSIONS are identical in both files for {year}.\n")
-        else:
-            output_file.write(f"DIMENSIONS differ between the two files for {year}.\n")
-
-        # Compare variables
-        variables_nc1 = set(dataset_nc1.variables.keys())
-        variables_nc2 = set(dataset_nc2.variables.keys())
-
-        if variables_nc1 == variables_nc2:
-            output_file.write(f"VARIABLES are identical in both files for {year}.\n")
-        else:
-            output_file.write(f"VARIABLES differ between the two files for {year}.\n")
-
-        # List of variables to compare values
-        variables_to_compare = ["Tair", "PSurf", "Qair", "Rainf", "Snowf"]
-
-        # Iterate through variables and compare values
-        for var_name in variables_to_compare:
-            # Get variable from each file
-            variable_nc1 = dataset_nc1.variables[var_name][:]
-            variable_nc2 = dataset_nc2.variables[var_name][:]
-
-            # Compare the values
-            if (variable_nc1 == variable_nc2).all():
-                output_file.write(
-                    f"{var_name} values are identical in both files for {year}.\n"
-                )
-            else:
-                output_file.write(
-                    f"{var_name} values differ between the two files for {year}.\n"
-                )
-
-        # Close the NetCDF files for the current year
-        dataset_nc1.close()
-        dataset_nc2.close()
diff --git a/tests/task4_2_log.py b/tests/task4_2_log.py
deleted file mode 100644
index 1fa8fd2..0000000
--- a/tests/task4_2_log.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import netCDF4 as nc
-
-# -------------------------------variables----------------------------------------------------
-from config import output_file, reference_path, test_path
-
-# reference_path = '/home/surface10/mrasolon/SPINacc_12_7/EXE_DIR_2/'
-# test_path = '/home/surface10/mrasolon/SPINacc_12_7/EXE_DIR/'  # Enter the path from your EXE_DIR
-
-# output_file = '/home/surface10/mrasolon/tests/tests_results.txt'  # Define the output file
-# ------------------------end of variables----------------------------------------------------
-
-
-def compare_nc_files(file1_path, file2_path, output_file, print_diff_flag=False):
-    # Open the netCDF files
-    with open(output_file, "a") as output:
-        output.write(" \n")
-        output.write("For " + file1_path + " and " + file2_path + "\n")
-        output.write(" \n")
-        nc1 = nc.Dataset(file1_path)
-        nc2 = nc.Dataset(file2_path)
-
-        # Compare dimensions
-        if set(nc1.dimensions.keys()) == set(nc2.dimensions.keys()):
-            output.write("Dimensions are identical.\n")
-        else:
-            output.write("Dimensions are different.\n")
-
-        # Compare variables
-        if set(nc1.variables.keys()) == set(nc2.variables.keys()):
-            output.write("Variables are identical.\n")
-        else:
-            output.write("Variables are different.\n")
-
-        # Compare variable values
-        variable_diff_flag = False  # Flag to track if any variable values are different
-        for variable in set(nc1.variables.keys()) & set(nc2.variables.keys()):
-            values1 = nc1[variable][:]
-            values2 = nc2[variable][:]
-
-            if (values1 == values2).all():
-                output.write(f"Variable '{variable}' values are identical.\n")
-            else:
-                output.write(f"Variable '{variable}' values are different.\n")
-                variable_diff_flag = True  # Set the flag to True
-
-                if print_diff_flag:
-                    output.write(f"Values in {file1_path}: {values1}\n")
-                    output.write(f"Values in {file2_path}: {values2}\n")
-
-        # Display a message if any variable values are different
-        if variable_diff_flag:
-            output.write("Some variable values are different.\n")
-
-        # Close the netCDF files
-        nc1.close()
-        nc2.close()
-
-
-# Example usage
-output_file_path = output_file
-
-file1_path = test_path + "SBG_FGSPIN.340Y.ORC22v8034_22501231_stomate_rest.nc"
-file2_path = reference_path + "SBG_FGSPIN.340Y.ORC22v8034_22501231_stomate_rest.nc"
-
-compare_nc_files(file1_path, file2_path, output_file_path, print_diff_flag=True)
diff --git a/tests/task4_log.py b/tests/task4_log.py
deleted file mode 100644
index 5d6b8c3..0000000
--- a/tests/task4_log.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import os
-
-# -------------------------------variables----------------------------------------------------
-from config import output_file, reference_path, test_path
-
-# reference_path = '/home/surface10/mrasolon/SPINacc_12_7/EXE_DIR_2/'
-# test_path = '/home/surface10/mrasolon/SPINacc_12_7/EXE_DIR/'  # Enter the path from your EXE_DIR
-
-# output_file = '/home/surface10/mrasolon/tests/tests_results.txt'  # Define the output file
-# ------------------------end of variables----------------------------------------------------
-
-
-def compare_files(dir1, dir2, output_file):
-    files1 = [f for f in os.listdir(dir1) if os.path.isfile(os.path.join(dir1, f))]
-    files2 = [f for f in os.listdir(dir2) if os.path.isfile(os.path.join(dir2, f))]
-
-    common_files = set(files1) & set(files2)
-
-    with open(output_file, "a") as output:
-        output.write(" \n")
-        output.write(
-            f"################################### REPRODUCIBILITY TEST FOR TASK 4 ################################### \n"
-        )
-        output.write(" \n")
-        for file in common_files:
-            file1_path = os.path.join(dir1, file)
-            file2_path = os.path.join(dir2, file)
-
-            with open(file1_path, "rb") as file1, open(file2_path, "rb") as file2:
-                content1 = file1.read()
-                content2 = file2.read()
-
-                if content1 == content2:
-                    output.write(f"File {file} in {dir1} and {dir2} is identical.\n")
-                else:
-                    output.write(f"File {file} in {dir1} and {dir2} is different.\n")
-
-
-if __name__ == "__main__":
-    directory1 = reference_path
-    directory2 = test_path
-    output_file_path = output_file
-
-    compare_files(directory1, directory2, output_file_path)
diff --git a/tests/test_task1.py b/tests/test_task1.py
new file mode 100644
index 0000000..2091797
--- /dev/null
+++ b/tests/test_task1.py
@@ -0,0 +1,12 @@
+import pytest
+from utils import compare_npy_files
+
+
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "dist_all.npy"
+    ],  # "auxil.npy" & "packdata.npy" have been replaced by "packdata.nc"
+)
+def test_compare_npy_files(reference_path, test_path, filename):
+    compare_npy_files(reference_path + filename, test_path + filename)
diff --git a/tests/test_task2.py b/tests/test_task2.py
new file mode 100644
index 0000000..e9ed9d4
--- /dev/null
+++ b/tests/test_task2.py
@@ -0,0 +1,7 @@
+import pytest
+from utils import compare_npy_files
+
+
+@pytest.mark.parametrize("filename", ["IDloc.npy", "IDsel.npy", "IDx.npy"])
+def test_compare_npy_files(reference_path, test_path, filename):
+    compare_npy_files(reference_path + filename, test_path + filename)
diff --git a/tests/test_task3.py b/tests/test_task3.py
new file mode 100644
index 0000000..6db7885
--- /dev/null
+++ b/tests/test_task3.py
@@ -0,0 +1,14 @@
+import pytest
+from utils import compare_nc_files
+
+
+@pytest.fixture
+def file_prefix():
+    return "forcing_aligned_"
+
+
+@pytest.mark.parametrize("year", range(1901, 1911))
+def test_compare_nc_files(reference_path, test_path, file_prefix, year):
+    file_path_nc1 = f"{reference_path}{file_prefix}{year}.nc"
+    file_path_nc2 = f"{test_path}{file_prefix}{year}.nc"
+    compare_nc_files(file_path_nc1, file_path_nc2)
diff --git a/tests/test_task3_2.py b/tests/test_task3_2.py
new file mode 100644
index 0000000..2c08f36
--- /dev/null
+++ b/tests/test_task3_2.py
@@ -0,0 +1,14 @@
+import pytest
+from utils import compare_nc_files
+
+
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "SRF_FGSPIN.10Y.ORC22v8034_19101231_sechiba_rest.nc",
+        "OOL_FGSPIN.10Y.ORC22v8034_19101231_driver_rest.nc",
+        "SBG_FGSPIN.10Y.ORC22v8034_19101231_stomate_rest.nc",
+    ],
+)
+def test_compare_nc_files(reference_path, test_path, filename):
+    compare_nc_files(reference_path + filename, test_path + filename)
diff --git a/tests/test_task4.py b/tests/test_task4.py
new file mode 100644
index 0000000..28fb328
--- /dev/null
+++ b/tests/test_task4.py
@@ -0,0 +1,28 @@
+import os
+
+import pytest
+
+
+@pytest.mark.skip("Skipped for redundancy of tests.")
+def test_compare_all_files(reference_path, test_path):
+    files1 = [
+        f
+        for f in os.listdir(reference_path)
+        if os.path.isfile(os.path.join(reference_path, f))
+    ]
+    files2 = [
+        f for f in os.listdir(test_path) if os.path.isfile(os.path.join(test_path, f))
+    ]
+
+    common_files = set(files1) & set(files2)
+
+    for file in common_files:
+        file1_path = os.path.join(reference_path, file)
+        file2_path = os.path.join(test_path, file)
+
+        with open(file1_path, "rb") as file1, open(file2_path, "rb") as file2:
+            content1 = file1.read()
+            content2 = file2.read()
+            assert (
+                content1 == content2
+            ), f"File {file} in {reference_path} and {test_path} are different."
diff --git a/tests/test_task4_2.py b/tests/test_task4_2.py
new file mode 100644
index 0000000..81f7b34
--- /dev/null
+++ b/tests/test_task4_2.py
@@ -0,0 +1,9 @@
+import pytest
+from utils import compare_nc_files
+
+
+@pytest.mark.parametrize(
+    "filename", ["SBG_FGSPIN.340Y.ORC22v8034_22501231_stomate_rest.nc"]
+)
+def test_compare_nc_files(reference_path, test_path, filename):
+    compare_nc_files(reference_path + filename, test_path + filename)
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 0000000..17dc269
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,59 @@
+import netCDF4 as nc
+import numpy as np
+
+
+def compare_npy_files(file1, file2):
+    a = np.load(file1, allow_pickle=True)
+    b = np.load(file2, allow_pickle=True)
+
+    result = recursive_compare(a, b)
+
+    assert result, f"The contents of {file1} and {file2} are different."
+
+
+def recursive_compare(arr1, arr2, tol=1e-9):
+    # Check if both inputs are ndarrays
+    if isinstance(arr1, np.ndarray) and isinstance(arr2, np.ndarray):
+        # Check if both ndarrays have the same shape
+        if arr1.shape != arr2.shape:
+            return False
+
+        # If they contain objects, iterate and compare recursively
+        if arr1.dtype == "object" and arr2.dtype == "object":
+            for sub_arr1, sub_arr2 in zip(arr1, arr2):
+                if not recursive_compare(sub_arr1, sub_arr2, tol):
+                    return False
+            return True
+        else:
+            # Compare the arrays element-wise for floating point numbers
+            return np.allclose(arr1, arr2, atol=tol)
+    else:
+        return False
+
+
+def compare_nc_files(file1_path, file2_path):
+    # Open the netCDF files
+    nc1 = nc.Dataset(file1_path)
+    nc2 = nc.Dataset(file2_path)
+
+    # Compare dimensions
+    assert set(nc1.dimensions.keys()) == set(
+        nc2.dimensions.keys()
+    ), "Dimensions are different."
+
+    # Compare variables
+    assert set(nc1.variables.keys()) == set(
+        nc2.variables.keys()
+    ), "Variables are different."
+
+    # Compare variable values
+    for variable in set(nc1.variables.keys()) & set(nc2.variables.keys()):
+        values1 = nc1[variable][:]
+        values2 = nc2[variable][:]
+        assert np.allclose(
+            values1, values2
+        ), f"Variable '{variable}' values are different."
+
+    # Close the netCDF files
+    nc1.close()
+    nc2.close()