From 9c3c599ead6153402a701ab03e51f3f0fba8c42c Mon Sep 17 00:00:00 2001 From: "Jason.Yu" Date: Wed, 24 Nov 2021 10:45:10 +0100 Subject: [PATCH] `PhBaseWorkChain`: add handler for diagonalization errors (#757) The `PhBaseWorkChain` now has a handler for `ERROR_COMPUTING_CHOLESKY`, an exit code that was added to `PhCalculation` which is returned if the calculation failed due to problems with the Cholesky factorization. The handler will switch to `cg` diagonalization and restart the calculation. If the diagonalization was already `cg`, it will abort. The `PhParser` was also fixed to return `ERROR_OUTPUT_STDOUT_INCOMPLETE` in case the stdout was incomplete and no other more specific errors were detected. --- aiida_quantumespresso/calculations/ph.py | 4 +- aiida_quantumespresso/parsers/parse_raw/ph.py | 1 + aiida_quantumespresso/parsers/ph.py | 6 ++ aiida_quantumespresso/workflows/ph/base.py | 17 ++++ .../DYN_MAT/.gitignore | 0 .../ph/failed_computing_cholesky/aiida.out | 95 +++++++++++++++++++ tests/parsers/test_ph.py | 19 ++++ tests/workflows/ph/test_base.py | 21 ++++ 8 files changed, 162 insertions(+), 1 deletion(-) create mode 100644 tests/parsers/fixtures/ph/failed_computing_cholesky/DYN_MAT/.gitignore create mode 100644 tests/parsers/fixtures/ph/failed_computing_cholesky/aiida.out diff --git a/aiida_quantumespresso/calculations/ph.py b/aiida_quantumespresso/calculations/ph.py index 49e7853ef..5affd7a5f 100644 --- a/aiida_quantumespresso/calculations/ph.py +++ b/aiida_quantumespresso/calculations/ph.py @@ -78,6 +78,8 @@ def define(cls, spec): message='The calculation stopped prematurely because it ran out of walltime.') spec.exit_code(410, 'ERROR_CONVERGENCE_NOT_REACHED', message='The minimization cycle did not reach self-consistency.') + spec.exit_code(462, 'ERROR_COMPUTING_CHOLESKY', + message='The code failed during the cholesky factorization.') # yapf: enable def prepare_for_submission(self, folder): @@ -170,7 +172,7 @@ def prepare_for_submission(self, folder): try: 
mesh, offset = self.inputs.qpoints.get_kpoints_mesh() - if any([i != 0. for i in offset]): + if any(i != 0. for i in offset): raise NotImplementedError( 'Computation of phonons on a mesh with non zero offset is not implemented, at the level of ph.x' ) diff --git a/aiida_quantumespresso/parsers/parse_raw/ph.py b/aiida_quantumespresso/parsers/parse_raw/ph.py index 91e26d853..5127b7373 100644 --- a/aiida_quantumespresso/parsers/parse_raw/ph.py +++ b/aiida_quantumespresso/parsers/parse_raw/ph.py @@ -152,6 +152,7 @@ def detect_important_message(logs, line): 'error': { 'Maximum CPU time exceeded': 'ERROR_OUT_OF_WALLTIME', 'No convergence has been achieved': 'ERROR_CONVERGENCE_NOT_REACHED', + 'problems computing cholesky': 'ERROR_COMPUTING_CHOLESKY', }, 'warning': { 'Warning:': None, diff --git a/aiida_quantumespresso/parsers/ph.py b/aiida_quantumespresso/parsers/ph.py index d6859aba6..0d19cb72d 100644 --- a/aiida_quantumespresso/parsers/ph.py +++ b/aiida_quantumespresso/parsers/ph.py @@ -65,3 +65,9 @@ def parse(self, **kwargs): if 'ERROR_CONVERGENCE_NOT_REACHED' in logs['error']: return self.exit_codes.ERROR_CONVERGENCE_NOT_REACHED + + if 'ERROR_COMPUTING_CHOLESKY' in logs['error']: + return self.exit_codes.ERROR_COMPUTING_CHOLESKY + + if 'ERROR_OUTPUT_STDOUT_INCOMPLETE' in logs['error']: + return self.exit_codes.ERROR_OUTPUT_STDOUT_INCOMPLETE diff --git a/aiida_quantumespresso/workflows/ph/base.py b/aiida_quantumespresso/workflows/ph/base.py index 1dc91759c..3a2526253 100644 --- a/aiida_quantumespresso/workflows/ph/base.py +++ b/aiida_quantumespresso/workflows/ph/base.py @@ -131,6 +131,23 @@ def handle_scheduler_out_of_walltime(self, node): self.report_error_handled(node, action) return ProcessHandlerReport(True) + @process_handler(priority=585, exit_codes=PhCalculation.exit_codes.ERROR_COMPUTING_CHOLESKY) + def handle_diagonalization_errors(self, calculation): + """Handle known issues related to the diagonalization. 
+ + Switch to ``diagonalization = 'cg'`` if not already running with this setting, and restart from the charge + density. In case the run already used conjugate gradient diagonalization, abort. + """ + if self.ctx.inputs.parameters['INPUTPH'].get('diagonalization', None) == 'cg': + action = 'found diagonalization issues but already running with conjugate gradient algorithm, aborting...' + self.report_error_handled(calculation, action) + return ProcessHandlerReport(True, self.exit_codes.ERROR_UNRECOVERABLE_FAILURE) + + self.ctx.inputs.parameters['INPUTPH']['diagonalization'] = 'cg' + action = 'found diagonalization issues, switching to conjugate gradient diagonalization.' + self.report_error_handled(calculation, action) + return ProcessHandlerReport(True) + @process_handler(priority=580, exit_codes=PhCalculation.exit_codes.ERROR_OUT_OF_WALLTIME) def handle_out_of_walltime(self, node): """Handle `ERROR_OUT_OF_WALLTIME` exit code: calculation shut down neatly and we can simply restart.""" diff --git a/tests/parsers/fixtures/ph/failed_computing_cholesky/DYN_MAT/.gitignore b/tests/parsers/fixtures/ph/failed_computing_cholesky/DYN_MAT/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/tests/parsers/fixtures/ph/failed_computing_cholesky/aiida.out b/tests/parsers/fixtures/ph/failed_computing_cholesky/aiida.out new file mode 100644 index 000000000..9fa8e8341 --- /dev/null +++ b/tests/parsers/fixtures/ph/failed_computing_cholesky/aiida.out @@ -0,0 +1,95 @@ + + Program PHONON v.6.3MaX starts on 9Aug2019 at 12:13:51 + + This program is part of the open-source Quantum ESPRESSO suite + for quantum simulation of materials; please cite + "P. Giannozzi et al., J. Phys.:Condens. Matter 21 395502 (2009); + "P. Giannozzi et al., J. Phys.:Condens. Matter 29 465901 (2017); + URL http://www.quantum-espresso.org", + in publications or presentations arising from this work. 
More details at + http://www.quantum-espresso.org/quote + + *** WARNING: using old-style file format, will disappear from next version *** + + Serial version + Title line not specified: using 'default'. + Message from routine phq_readin: + iverbosity is obsolete, use "verbosity" instead + + Reading data from directory: + ./out/aiida.save + Message from routine volume: + axis vectors are left-handed + + IMPORTANT: XC functional enforced from input : + Exchange-correlation = PBE ( 1 4 3 4 0 0) + Any further DFT definition will be discarded + Please, verify this is what you really want + + + G-vector sticks info + -------------------- + sticks: dense smooth PW G-vecs: dense smooth PW + Sum 859 433 127 16889 5985 965 + + 3 / 3 q-points for this run, from 1 to 3: + N xq(1) xq(2) xq(3) + 1 0.000000000 0.000000000 0.000000000 + 2 0.353553391 -0.353553391 -0.353553391 + 3 0.000000000 0.000000000 -0.707106781 + + + Calculation of q = 0.0000000 0.0000000 0.0000000 + + Restart in Phonon calculation + + + + bravais-lattice index = 0 + lattice parameter (alat) = 7.2558 a.u. + unit-cell volume = 270.1072 (a.u.)^3 + number of atoms/cell = 2 + number of atomic types = 1 + kinetic-energy cut-off = 30.0000 Ry + charge density cut-off = 240.0000 Ry + convergence threshold = 1.0E-12 + beta = 0.7000 + number of iterations used = 4 + Exchange-correlation = PBE ( 1 4 3 4 0 0) + + + celldm(1)= 7.25577 celldm(2)= 0.00000 celldm(3)= 0.00000 + celldm(4)= 0.00000 celldm(5)= 0.00000 celldm(6)= 0.00000 + + crystal axes: (cart. coord. in units of alat) + a(1) = ( 0.7071 0.7071 0.0000 ) + a(2) = ( 0.7071 0.0000 0.7071 ) + a(3) = ( 0.0000 0.7071 0.7071 ) + + reciprocal axes: (cart. coord. in units 2 pi/alat) + b(1) = ( 0.7071 0.7071 -0.7071 ) + b(2) = ( 0.7071 -0.7071 0.7071 ) + b(3) = ( -0.7071 0.7071 0.7071 ) + + + Atoms inside the unit cell: + + Cartesian axes + + site n. 
atom mass positions (alat units) + 1 Si 28.0855 tau( 1) = ( 0.00000 0.00000 0.00000 ) + 2 Si 28.0855 tau( 2) = ( 0.35355 0.35355 0.35355 ) + + Computing dynamical matrix for + q = ( 0.0000000 0.0000000 0.0000000 ) + + 49 Sym.Ops. (with q -> -q+G ) + + s frac. trans. + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + Error in routine cdiaghg (25): + problems computing cholesky + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + stopping ... diff --git a/tests/parsers/test_ph.py b/tests/parsers/test_ph.py index d7e9f04e9..2d687d9a1 100644 --- a/tests/parsers/test_ph.py +++ b/tests/parsers/test_ph.py @@ -57,3 +57,22 @@ def test_ph_out_of_walltime(fixture_localhost, generate_calc_job_node, generate_ assert calcfunction.exit_status == node.process_class.exit_codes.ERROR_OUT_OF_WALLTIME.status assert 'output_parameters' in results data_regression.check(results['output_parameters'].get_dict()) + + +def test_pw_failed_computing_cholesky(fixture_localhost, generate_calc_job_node, generate_parser): + """Test the parsing of a calculation that failed during cholesky factorization. + + In this test the stdout is incomplete, and the XML is missing completely. The stdout contains + the relevant error message. 
+ """ + name = 'failed_computing_cholesky' + entry_point_calc_job = 'quantumespresso.ph' + entry_point_parser = 'quantumespresso.ph' + + node = generate_calc_job_node(entry_point_calc_job, fixture_localhost, name, generate_inputs()) + parser = generate_parser(entry_point_parser) + _, calcfunction = parser.parse_from_node(node, store_provenance=False) + + assert calcfunction.is_finished, calcfunction.exception + assert calcfunction.is_failed, calcfunction.exit_status + assert calcfunction.exit_status == node.process_class.exit_codes.ERROR_COMPUTING_CHOLESKY.status diff --git a/tests/workflows/ph/test_base.py b/tests/workflows/ph/test_base.py index 33e406bc1..6d857fdbc 100644 --- a/tests/workflows/ph/test_base.py +++ b/tests/workflows/ph/test_base.py @@ -111,6 +111,27 @@ def test_handle_convergence_not_reached(generate_workchain_ph): assert result.status == 0 +def test_handle_diagonalization_errors(generate_workchain_ph): + """Test `PhBaseWorkChain.handle_diagonalization_errors`.""" + process = generate_workchain_ph(exit_code=PhCalculation.exit_codes.ERROR_COMPUTING_CHOLESKY) + process.setup() + process.validate_parameters() + process.prepare_process() + + process.ctx.inputs.parameters['INPUTPH']['diagonalization'] = 'david' + + result = process.handle_diagonalization_errors(process.ctx.children[-1]) + assert isinstance(result, ProcessHandlerReport) + assert process.ctx.inputs.parameters['INPUTPH']['diagonalization'] == 'cg' + assert result.do_break + + result = process.handle_diagonalization_errors(process.ctx.children[-1]) + assert result.do_break + + result = process.inspect_process() + assert result == PhBaseWorkChain.exit_codes.ERROR_UNRECOVERABLE_FAILURE + + def test_set_max_seconds(generate_workchain_ph): """Test that `max_seconds` gets set in the parameters based on `max_wallclock_seconds` unless already set.""" inputs = generate_workchain_ph(return_inputs=True)