Skip to content

Commit

Permalink
PhBaseWorkChain: add handler for diagonalization errors (aiidateam#757)
Browse files Browse the repository at this point in the history

The `PhBaseWorkChain` now has a handler for `ERROR_COMPUTING_CHOLESKY`,
an exit code that was added to `PhCalculation` and is returned if the
calculation failed due to problems with the Cholesky factorization. The
handler will switch to `cg` diagonalization and restart the calculation.
If the diagonalization was already `cg`, it will abort.

The `PhParser` was also fixed to return `ERROR_OUTPUT_STDOUT_INCOMPLETE`
in case the stdout was incomplete and no other more specific errors were
detected.
  • Loading branch information
unkcpz authored and bastonero committed Dec 20, 2021
1 parent a88c67a commit 9c3c599
Show file tree
Hide file tree
Showing 8 changed files with 162 additions and 1 deletion.
4 changes: 3 additions & 1 deletion aiida_quantumespresso/calculations/ph.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ def define(cls, spec):
message='The calculation stopped prematurely because it ran out of walltime.')
spec.exit_code(410, 'ERROR_CONVERGENCE_NOT_REACHED',
message='The minimization cycle did not reach self-consistency.')
spec.exit_code(462, 'ERROR_COMPUTING_CHOLESKY',
message='The code failed during the cholesky factorization.')
# yapf: enable

def prepare_for_submission(self, folder):
Expand Down Expand Up @@ -170,7 +172,7 @@ def prepare_for_submission(self, folder):
try:
mesh, offset = self.inputs.qpoints.get_kpoints_mesh()

if any([i != 0. for i in offset]):
if any(i != 0. for i in offset):
raise NotImplementedError(
'Computation of phonons on a mesh with non zero offset is not implemented, at the level of ph.x'
)
Expand Down
1 change: 1 addition & 0 deletions aiida_quantumespresso/parsers/parse_raw/ph.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def detect_important_message(logs, line):
'error': {
'Maximum CPU time exceeded': 'ERROR_OUT_OF_WALLTIME',
'No convergence has been achieved': 'ERROR_CONVERGENCE_NOT_REACHED',
'problems computing cholesky': 'ERROR_COMPUTING_CHOLESKY',
},
'warning': {
'Warning:': None,
Expand Down
6 changes: 6 additions & 0 deletions aiida_quantumespresso/parsers/ph.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,9 @@ def parse(self, **kwargs):

if 'ERROR_CONVERGENCE_NOT_REACHED' in logs['error']:
return self.exit_codes.ERROR_CONVERGENCE_NOT_REACHED

if 'ERROR_COMPUTING_CHOLESKY' in logs['error']:
return self.exit_codes.ERROR_COMPUTING_CHOLESKY

if 'ERROR_OUTPUT_STDOUT_INCOMPLETE' in logs['error']:
return self.exit_codes.ERROR_OUTPUT_STDOUT_INCOMPLETE
17 changes: 17 additions & 0 deletions aiida_quantumespresso/workflows/ph/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,23 @@ def handle_scheduler_out_of_walltime(self, node):
self.report_error_handled(node, action)
return ProcessHandlerReport(True)

@process_handler(priority=585, exit_codes=PhCalculation.exit_codes.ERROR_COMPUTING_CHOLESKY)
def handle_diagonalization_errors(self, calculation):
    """Handle `ERROR_COMPUTING_CHOLESKY`: the calculation failed during the diagonalization.

    If the failed calculation was not already running with ``diagonalization = 'cg'``, set that value in the
    ``INPUTPH`` namelist of the inputs stored in the context, so that the next attempt uses conjugate gradient
    diagonalization. If ``cg`` was already in use there is no further fallback, and the handler reports
    ``ERROR_UNRECOVERABLE_FAILURE`` to abort the work chain.

    :param calculation: the failed calculation node; it is only passed on to the error report.
    :return: a `ProcessHandlerReport` with ``do_break`` set to ``True``. It carries the
        ``ERROR_UNRECOVERABLE_FAILURE`` exit code only when ``cg`` diagonalization was already being used.
    """
    # Fetch the namelist once: it is the same mutable mapping that is read and, possibly, updated below.
    inputph = self.ctx.inputs.parameters['INPUTPH']

    if inputph.get('diagonalization') == 'cg':
        action = 'found diagonalization issues but already running with conjugate gradient algorithm, aborting...'
        self.report_error_handled(calculation, action)
        return ProcessHandlerReport(True, self.exit_codes.ERROR_UNRECOVERABLE_FAILURE)

    inputph['diagonalization'] = 'cg'
    action = 'found diagonalization issues, switching to conjugate gradient diagonalization.'
    self.report_error_handled(calculation, action)
    return ProcessHandlerReport(True)

@process_handler(priority=580, exit_codes=PhCalculation.exit_codes.ERROR_OUT_OF_WALLTIME)
def handle_out_of_walltime(self, node):
"""Handle `ERROR_OUT_OF_WALLTIME` exit code: calculation shut down neatly and we can simply restart."""
Expand Down
Empty file.
95 changes: 95 additions & 0 deletions tests/parsers/fixtures/ph/failed_computing_cholesky/aiida.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@

Program PHONON v.6.3MaX starts on 9Aug2019 at 12:13:51

This program is part of the open-source Quantum ESPRESSO suite
for quantum simulation of materials; please cite
"P. Giannozzi et al., J. Phys.:Condens. Matter 21 395502 (2009);
"P. Giannozzi et al., J. Phys.:Condens. Matter 29 465901 (2017);
URL http://www.quantum-espresso.org",
in publications or presentations arising from this work. More details at
http://www.quantum-espresso.org/quote

*** WARNING: using old-style file format, will disappear from next version ***

Serial version
Title line not specified: using 'default'.
Message from routine phq_readin:
iverbosity is obsolete, use "verbosity" instead

Reading data from directory:
./out/aiida.save
Message from routine volume:
axis vectors are left-handed

IMPORTANT: XC functional enforced from input :
Exchange-correlation = PBE ( 1 4 3 4 0 0)
Any further DFT definition will be discarded
Please, verify this is what you really want


G-vector sticks info
--------------------
sticks: dense smooth PW G-vecs: dense smooth PW
Sum 859 433 127 16889 5985 965

3 / 3 q-points for this run, from 1 to 3:
N xq(1) xq(2) xq(3)
1 0.000000000 0.000000000 0.000000000
2 0.353553391 -0.353553391 -0.353553391
3 0.000000000 0.000000000 -0.707106781


Calculation of q = 0.0000000 0.0000000 0.0000000

Restart in Phonon calculation



bravais-lattice index = 0
lattice parameter (alat) = 7.2558 a.u.
unit-cell volume = 270.1072 (a.u.)^3
number of atoms/cell = 2
number of atomic types = 1
kinetic-energy cut-off = 30.0000 Ry
charge density cut-off = 240.0000 Ry
convergence threshold = 1.0E-12
beta = 0.7000
number of iterations used = 4
Exchange-correlation = PBE ( 1 4 3 4 0 0)


celldm(1)= 7.25577 celldm(2)= 0.00000 celldm(3)= 0.00000
celldm(4)= 0.00000 celldm(5)= 0.00000 celldm(6)= 0.00000

crystal axes: (cart. coord. in units of alat)
a(1) = ( 0.7071 0.7071 0.0000 )
a(2) = ( 0.7071 0.0000 0.7071 )
a(3) = ( 0.0000 0.7071 0.7071 )

reciprocal axes: (cart. coord. in units 2 pi/alat)
b(1) = ( 0.7071 0.7071 -0.7071 )
b(2) = ( 0.7071 -0.7071 0.7071 )
b(3) = ( -0.7071 0.7071 0.7071 )


Atoms inside the unit cell:

Cartesian axes

site n. atom mass positions (alat units)
1 Si 28.0855 tau( 1) = ( 0.00000 0.00000 0.00000 )
2 Si 28.0855 tau( 2) = ( 0.35355 0.35355 0.35355 )

Computing dynamical matrix for
q = ( 0.0000000 0.0000000 0.0000000 )

49 Sym.Ops. (with q -> -q+G )

s frac. trans.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Error in routine cdiaghg (25):
problems computing cholesky
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

stopping ...
19 changes: 19 additions & 0 deletions tests/parsers/test_ph.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,22 @@ def test_ph_out_of_walltime(fixture_localhost, generate_calc_job_node, generate_
assert calcfunction.exit_status == node.process_class.exit_codes.ERROR_OUT_OF_WALLTIME.status
assert 'output_parameters' in results
data_regression.check(results['output_parameters'].get_dict())


def test_pw_failed_computing_cholesky(fixture_localhost, generate_calc_job_node, generate_parser):
    """Test parsing a `ph.x` run that stopped with the `problems computing cholesky` error.

    The stdout of the `failed_computing_cholesky` fixture ends right after the error message, so the parser is
    expected to fail with the dedicated `ERROR_COMPUTING_CHOLESKY` exit code.
    """
    # NOTE(review): despite the `pw` in its name, this test uses the `quantumespresso.ph` entry points.
    fixture_name = 'failed_computing_cholesky'
    calc_job_entry_point = 'quantumespresso.ph'
    parser_entry_point = 'quantumespresso.ph'

    inputs = generate_inputs()
    node = generate_calc_job_node(calc_job_entry_point, fixture_localhost, fixture_name, inputs)
    _, calcfunction = generate_parser(parser_entry_point).parse_from_node(node, store_provenance=False)

    assert calcfunction.is_finished, calcfunction.exception
    assert calcfunction.is_failed, calcfunction.exit_status

    expected_status = node.process_class.exit_codes.ERROR_COMPUTING_CHOLESKY.status
    assert calcfunction.exit_status == expected_status
21 changes: 21 additions & 0 deletions tests/workflows/ph/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,27 @@ def test_handle_convergence_not_reached(generate_workchain_ph):
assert result.status == 0


def test_handle_diagonalization_errors(generate_workchain_ph):
    """Test `PhBaseWorkChain.handle_diagonalization_errors`.

    The first invocation has to switch the diagonalization to `cg`; a second invocation, now already running
    with `cg`, has to lead to the `ERROR_UNRECOVERABLE_FAILURE` exit code.
    """
    workchain = generate_workchain_ph(exit_code=PhCalculation.exit_codes.ERROR_COMPUTING_CHOLESKY)
    for step in (workchain.setup, workchain.validate_parameters, workchain.prepare_process):
        step()

    inputph = workchain.ctx.inputs.parameters['INPUTPH']
    inputph['diagonalization'] = 'david'

    # First failure: the handler falls back to conjugate gradient diagonalization.
    first_report = workchain.handle_diagonalization_errors(workchain.ctx.children[-1])
    assert isinstance(first_report, ProcessHandlerReport)
    assert workchain.ctx.inputs.parameters['INPUTPH']['diagonalization'] == 'cg'
    assert first_report.do_break

    # Second failure: `cg` is already in use, so there is nothing left to try.
    second_report = workchain.handle_diagonalization_errors(workchain.ctx.children[-1])
    assert second_report.do_break

    exit_code = workchain.inspect_process()
    assert exit_code == PhBaseWorkChain.exit_codes.ERROR_UNRECOVERABLE_FAILURE


def test_set_max_seconds(generate_workchain_ph):
"""Test that `max_seconds` gets set in the parameters based on `max_wallclock_seconds` unless already set."""
inputs = generate_workchain_ph(return_inputs=True)
Expand Down

0 comments on commit 9c3c599

Please sign in to comment.