# args.yaml (example configuration, forked from deepmodeling/deepks-kit)
# all arguments are flattened into this file
# they can also be split into separate files and referenced here
n_iter: 10
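# for example (illustrative only, the file name is hypothetical), the scf_input section below
# could instead reference its own file:
# scf_input: ./share/scf_input.yaml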
# training and testing systems
systems_train: # can also be files containing system paths
- ./systems/train.n[1-3]
systems_test: # if empty, use the last system of training set
- ./systems/valid.n4
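# illustrative alternative (the file name is hypothetical): a plain-text file listing
# one system path per line can be given instead of listing the paths themselves
# systems_train: ./systems/train_paths.raw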
# directory setting
workdir: "."
share_folder: "share" # folder that stores all other settings
# scf settings
scf_input: # can also be specified by a separate file
basis: ccpvdz
# this is for force training
# the following properties will be dumped into the data folder
# please refer to https://arxiv.org/abs/2012.14615 for a detailed explanation of each field
dump_fields: [e_base, e_tot, dm_eig, conv, f_base, f_tot, grad_vx, l_f_delta, l_e_delta]
verbose: 1
# parameters that will be passed directly to pyscf Mol class
mol_args:
incore_anyway: True
# parameters that will be passed directly to pyscf SCF class
scf_args:
conv_tol: 1e-6
conv_tol_grad: 3e-2
level_shift: 0.1
diis_space: 20
conv_check: false # pyscf conv_check has a bug
scf_machine:
# number of systems that will be in one task, default is 1
# a task corresponds to a set of commands and is the smallest unit to be tracked
sub_size: 1
# 2 tasks will be gathered into one group and submitted together
# a group corresponds to a job submitted to the scheduler system
group_size: 2
# if larger than 1, run n tasks in parallel in one group (one job)
ingroup_parallel: 1
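# putting the three settings above together (a derived reading, not part of the original file):
# each task holds 1 system, each submitted job bundles 2 such tasks,
# and with ingroup_parallel set to 1 the 2 tasks in a job run one at a time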
# the parameters determining the machine settings that the jobs run on
dispatcher:
# "local" to run on local machine, or "ssh" to run on a remote machine
context: local
# "slurm" to use slurm scheduler system, or "shell" to just use shell
batch: slurm
# only needed when context is set to "ssh"
# pass a dict like {username: USERNAME, password: PASSWORD, work_path: /path/to/tmp/folder}
remote_profile: null
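# an illustrative remote_profile for context "ssh", written out in yaml form
# (values are placeholders; only the fields named in the comment above are shown):
# remote_profile:
#   username: USERNAME
#   password: PASSWORD
#   work_path: /path/to/tmp/folder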
# the parameters determining the resources allocated for each job (group of tasks)
# only needed when batch is set to "slurm"
# for shell users, will automatically use all resources available
resources:
# only set this to larger than 1 if running in parallel on multiple nodes with `ingroup_parallel`
# otherwise please keep it at 1, since pyscf does not support MPI and can only run on a single node
numb_node: 1
time_limit: '24:00:00'
cpus_per_task: 8
mem_limit: 8 #GB
# environment variables
envs:
PYSCF_MAX_MEMORY: 8000 #MB, increase from default 4G to 8G to match the mem_limit above
# resources for each sub task in jobs (groups of tasks)
# only needed when ingroup_parallel is larger than 1
# the resources are reallocated between parallel tasks
sub_res:
cpus_per_task: 8
python: "python" # use python in path
# training settings
train_input:
# model_args is ignored, since this is used as a restart
# see init_train below for the possible model_args
data_args:
# training batch size, 16 is recommended
batch_size: 16
# if larger than 1, n batches will be grouped together to form a larger one
# the final batch size would be group_batch * batch_size
# only needed when many systems have only one datapoint, hence the batch size can only be 1
group_batch: 1
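# for example, batch_size: 1 with group_batch: 16 would give an effective batch size of 16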
# if set to true, will try to find force labels and use them in training
extra_label: true
# if set to true, will read the convergence data from conv_name
# and only use converged datapoints to train
conv_filter: true
conv_name: conv
# to speed up training, deepks supports first normalizing the data (preshift and prescale)
# and doing a linear regression on the whole training set as prefitting
preprocess_args:
preshift: false # restarting model already shifted. Will not recompute shift value
prescale: false # same as above
# prefitting is by default enabled
prefit_ridge: 1e1 # the ridge factor used in linear regression
prefit_trainable: false # make the linear regression fixed during the training
train_args:
# the starting learning rate, which will decay later
start_lr: 0.0001
# lr will decay by a factor of `decay_rate` every `decay_steps` epochs
decay_rate: 0.5
decay_steps: 1000
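# i.e. the learning rate after t epochs is roughly start_lr * decay_rate^(t / decay_steps);
# with the values here it drops from 1e-4 to about 3e-6 after 5000 epochs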
# show training results every n epochs
display_epoch: 100
# the prefactor multiplied in front of the force part of the loss
force_factor: 1
# total number of epochs needed in training
n_epoch: 5000
train_machine:
# for training, no tasks or groups are needed since there's only one task
# the dispatcher settings are the same as above
dispatcher:
context: local
batch: slurm
remote_profile: null # use lazy local
# the resources settings are also the same as above
resources:
time_limit: '24:00:00'
cpus_per_task: 4
# use GPU in training; currently only 1 is supported
numb_gpu: 1
mem_limit: 8 #GB
python: "python" # use python in path
# init settings
init_model: false # do not use existing model in share_folder/init/model.pth
# the first scf iteration, needed if init_model is false
# the possible settings are the same as scf_input
init_scf:
basis: ccpvdz
dump_fields: [e_base, e_tot, dm_eig, conv, f_base, f_tot, grad_vx, l_f_delta, l_e_delta]
verbose: 1
mol_args:
incore_anyway: True
scf_args:
conv_tol: 1e-8
conv_check: false # pyscf conv_check has a bug
# the first training iteration, needed if init_model is false
# most settings are the same as train_input, but model_args will be specified here
init_train:
# necessary as this is init training
model_args:
# the number of *hidden* neurons
# note the first (n_descriptor) and last (1) layers are not included here
hidden_sizes: [100, 100, 100]
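# i.e. the full network here is n_descriptor -> 100 -> 100 -> 100 -> 1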
# the output will be divided by 100 before being compared with labels, to improve training
output_scale: 100
# use skip connections between layers if the sizes are the same
use_resnet: true
# gelu generally performs better than other activation functions
actv_fn: gelu
# whether to use a predefined embedding function
# to further symmetrize the eigenvalues as descriptors
# adding an embedding can make the energy surface smoother, hence improving convergence,
# but may slightly reduce the accuracy (especially in generalization)
# for water we do not use it; if you encounter convergence problems, set it to
# embedding: thermal
embedding: null
# the rest are the same as above
data_args:
batch_size: 16
group_batch: 1
preprocess_args:
preshift: true # init model will shift the input descriptors to mean zero
prescale: false
prefit_ridge: 1e1
prefit_trainable: false
# the following are suggested parameters for initial training
# note that the training curve shown in the deepks-kit paper uses a different set of parameters
# the paper parameters take an unnecessarily long time and are no longer suggested
train_args:
decay_rate: 0.95 # 0.96 in paper example training curve
decay_steps: 300 # 500 in paper example training curve
display_epoch: 100
n_epoch: 15000 # 50000 in paper example training curve
start_lr: 0.0003
# other settings
cleanup: false
strict: true
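# a typical way to launch this workflow (the command is taken from the deepks-kit examples
# and is an assumption here, not part of this file; verify against your installation):
# python -u -m deepks iterate args.yaml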