# args.yaml (example configuration, forked from deepmodeling/deepks-kit)
# all arguments are flattened into this file
# they can also be split into separate files and referenced here
n_iter: 10
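# for example (illustrative only, the file name is hypothetical), the scf_input section below
# could instead reference its own file:
# scf_input: ./share/scf_input.yaml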
# training and testing systems
systems_train: # can also be files containing system paths
- ./systems/train.n[1-3]
systems_test: # if empty, use the last system of training set
- ./systems/valid.n4
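# illustrative alternative (the file name is hypothetical): a plain-text file listing
# one system path per line can be given instead of listing the paths themselves
# systems_train: ./systems/train_paths.raw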
# directory setting
workdir: "."
share_folder: "share" # folder that stores all other settings
# scf settings
scf_input: # can also be specified by a separate file
basis: ccpvdz
# this is for force training
# the following properties will be dumped into the data folder
# please refer to https://arxiv.org/abs/2012.14615 for a detailed explanation of each field
dump_fields: [e_base, e_tot, dm_eig, conv, f_base, f_tot, grad_vx, l_f_delta, l_e_delta]
verbose: 1
# parameters that will be passed directly to pyscf Mol class
mol_args:
incore_anyway: True
# parameters that will be passed directly to pyscf SCF class
scf_args:
conv_tol: 1e-6
conv_tol_grad: 3e-2
level_shift: 0.1
diis_space: 20
conv_check: false # pyscf conv_check has a bug
scf_machine:
# number of systems that will be in one task, default is 1
# a task corresponds to a set of commands and is the smallest unit to be tracked
sub_size: 1
# 2 tasks will be gathered into one group and submitted together
# a group corresponds to a job submitted to the scheduler system
group_size: 2
# if larger than 1, run n tasks in parallel in one group (one job)
ingroup_parallel: 1
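# putting the three settings above together (a derived reading, not part of the original file):
# each task holds 1 system, each submitted job bundles 2 such tasks,
# and with ingroup_parallel set to 1 the 2 tasks in a job run one at a time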
# the parameters determining the machine settings that the jobs run on
dispatcher:
# "local" to run on local machine, or "ssh" to run on a remote machine
context: local
# "slurm" to use slurm scheduler system, or "shell" to just use shell
batch: slurm
# only needed when context is set to "ssh"
# pass a dict like {username: USERNAME, password: PASSWORD, work_path: /path/to/tmp/folder}
remote_profile: null
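# an illustrative remote_profile for context "ssh", written out in yaml form
# (values are placeholders; only the fields named in the comment above are shown):
# remote_profile:
#   username: USERNAME
#   password: PASSWORD
#   work_path: /path/to/tmp/folder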
# the parameters determining the resources allocated for each job (group of tasks)
# only needed when batch is set to "slurm"
# for shell users, will automatically use all resources available
resources:
# only set this to larger than 1 if running in parallel on multiple nodes with `ingroup_parallel`
# otherwise please keep it at 1, since pyscf does not support MPI and can only run on a single node
numb_node: 1
time_limit: '24:00:00'
cpus_per_task: 8
mem_limit: 8 #GB
# environment variables
envs:
PYSCF_MAX_MEMORY: 8000 #MB, increase from default 4G to 8G to match the mem_limit above
# resources for each sub task in jobs (groups of tasks)
# only needed when ingroup_parallel is larger than 1
# the resources are reallocated between parallel tasks
sub_res:
cpus_per_task: 8
python: "python" # use python in path
# training settings
train_input:
# model_args is ignored, since this is used as a restart
# see init_train below for the possible model_args
data_args:
# training batch size, 16 is recommended
batch_size: 16
# if larger than 1, n batches will be grouped together to form a larger one
# the final batch size would be group_batch * batch_size
# only needed when many systems have only one datapoint, hence the batch size can only be 1
group_batch: 1
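# for example, batch_size: 1 with group_batch: 16 would give an effective batch size of 16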
# if set to true, will try to find force labels and use them in training
extra_label: true
# if set to true, will read the convergence data from conv_name
# and only use converged datapoints to train
conv_filter: true
conv_name: conv
# to speed up training, deepks supports first normalizing the data (preshift and prescale)
# and doing a linear regression on the whole training set as prefitting
preprocess_args:
preshift: false # restarting model already shifted. Will not recompute shift value
prescale: false # same as above
# prefitting is by default enabled
prefit_ridge: 1e1 # the ridge factor used in linear regression
prefit_trainable: false # make the linear regression fixed during the training
train_args:
# the starting learning rate, which will decay later
start_lr: 0.0001
# lr will decay by a factor of `decay_rate` every `decay_steps` epochs
decay_rate: 0.5
decay_steps: 1000
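# i.e. the learning rate after t epochs is roughly start_lr * decay_rate^(t / decay_steps);
# with the values here it drops from 1e-4 to about 3e-6 after 5000 epochs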
# show training results every n epochs
display_epoch: 100
# the prefactor multiplied in front of the force part of the loss
force_factor: 1
# total number of epochs needed in training
n_epoch: 5000
train_machine:
# for training, no tasks or groups are needed since there's only one task
# the dispatcher settings are the same as above
dispatcher:
context: local
batch: slurm
remote_profile: null # use lazy local
# the resources settings are also the same as above
resources:
time_limit: '24:00:00'
cpus_per_task: 4
# use GPU in training; currently only 1 is supported
numb_gpu: 1
mem_limit: 8 #GB
python: "python" # use python in path
# init settings
init_model: false # do not use existing model in share_folder/init/model.pth
# the first scf iteration, needed if init_model is false
# the possible settings are the same as scf_input
init_scf:
basis: ccpvdz
dump_fields: [e_base, e_tot, dm_eig, conv, f_base, f_tot, grad_vx, l_f_delta, l_e_delta]
verbose: 1
mol_args:
incore_anyway: True
scf_args:
conv_tol: 1e-8
conv_check: false # pyscf conv_check has a bug
# the first training iteration, needed if init_model is false
# most settings are the same as train_input, but model_args will be specified here
init_train:
# necessary as this is init training
model_args:
# the number of *hidden* neurons
# note the first (n_descriptor) and last (1) layers are not included here
hidden_sizes: [100, 100, 100]
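# i.e. the full network here is n_descriptor -> 100 -> 100 -> 100 -> 1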
# the output will be divided by 100 before being compared with labels, to improve training
output_scale: 100
# use skip connections between layers if the sizes are the same
use_resnet: true
# gelu generally performs better than other activation functions
actv_fn: gelu
# whether to use a predefined embedding function
# to further symmetrize the eigenvalues as descriptors
# adding an embedding can make the energy surface smoother, hence improving convergence,
# but may slightly reduce the accuracy (especially in generalization)
# for water we do not use it; if you encounter convergence problems, set it to
# embedding: thermal
embedding: null
# the rest are the same as above
data_args:
batch_size: 16
group_batch: 1
preprocess_args:
preshift: true # init model will shift the input descriptors to mean zero
prescale: false
prefit_ridge: 1e1
prefit_trainable: false
# the following are suggested parameters for initial training
# note that the training curve shown in the deepks-kit paper uses a different set of parameters
# the paper parameters take an unnecessarily long time and are no longer suggested
train_args:
decay_rate: 0.95 # 0.96 in paper example training curve
decay_steps: 300 # 500 in paper example training curve
display_epoch: 100
n_epoch: 15000 # 50000 in paper example training curve
start_lr: 0.0003
# other settings
cleanup: false
strict: true
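# a typical way to launch this workflow (the command is taken from the deepks-kit examples
# and is an assumption here, not part of this file; verify against your installation):
# python -u -m deepks iterate args.yaml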