From 4040d49840a03b5b5d41887e7b63ad552b461fe1 Mon Sep 17 00:00:00 2001
From: Marnik Bercx <mbercx@gmail.com>
Date: Wed, 29 Sep 2021 13:55:34 +0200
Subject: [PATCH] Refactor ProjwfcParser code

---
 aiida_quantumespresso/parsers/projwfc.py | 710 +++++++++++------------
 1 file changed, 326 insertions(+), 384 deletions(-)

diff --git a/aiida_quantumespresso/parsers/projwfc.py b/aiida_quantumespresso/parsers/projwfc.py
index d2dfb05d9..670a00d03 100644
--- a/aiida_quantumespresso/parsers/projwfc.py
+++ b/aiida_quantumespresso/parsers/projwfc.py
@@ -2,288 +2,39 @@
 from pathlib import Path
 import re
 import fnmatch
+from typing import Tuple
+from aiida.common.extendeddicts import AttributeDict
+from numpy.typing import ArrayLike
 
 import numpy as np
 
-from aiida.common import LinkType
-from aiida.orm import Dict, ProjectionData, BandsData, XyData, CalcJobNode
-from aiida.plugins import OrbitalFactory
+from aiida.plugins import OrbitalFactory, DataFactory
 
 from aiida_quantumespresso.parsers import QEOutputParsingError
 from aiida_quantumespresso.parsers.parse_raw.base import (
-    parse_output_base, convert_qe2aiida_structure, convert_qe_to_kpoints
+    convert_qe2aiida_structure, convert_qe_to_kpoints, convert_qe_time_to_sec
 )
 from .base import Parser
 
-
-def find_orbitals_from_statelines(out_info_dict):
-    """This function reads in all the state_lines, that is, the lines describing which atomic states, taken from the
-    pseudopotential, are used for the projection. Then it converts these state_lines into a set of orbitals.
-
-    :param out_info_dict: contains various technical internals useful in parsing
-    :return: orbitals, a list of orbitals suitable for setting ProjectionData
-    """
-
-    # Format of statelines
-    # From PP/src/projwfc.f90: (since Oct. 8 2019)
-    #
-    # 1000 FORMAT (5x,"state #",i4,": atom ",i3," (",a3,"), wfc ",i2," (l=",i1)
-    # IF (lspinorb) THEN
-    # 1001 FORMAT (" j=",f3.1," m_j=",f4.1,")")
-    # ELSE IF (noncolin) THEN
-    # 1002 FORMAT (" m=",i2," s_z=",f4.1,")")
-    # ELSE
-    # 1003 FORMAT (" m=",i2,")")
-    # ENDIF
-    #
-    # Before:
-    # IF (lspinorb) THEN
-    # ...
-    # 1000    FORMAT (5x,"state #",i4,": atom ",i3," (",a3,"), wfc ",i2, &
-    #               " (j=",f3.1," l=",i1," m_j=",f4.1,")")
-    # ELSE
-    # ...
-    # 1500    FORMAT (5x,"state #",i4,": atom ",i3," (",a3,"), wfc ",i2, &
-    #               " (l=",i1," m=",i2," s_z=",f4.1,")")
-    # ENDIF
-
-    out_file = out_info_dict['out_file']
-    atomnum_re = re.compile(r'atom\s*([0-9]+?)[^0-9]')
-    element_re = re.compile(r'atom\s*[0-9]+\s*\(\s*([A-Za-z0-9-_]+?)\s*\)')
-    if out_info_dict['spinorbit']:
-        # spinorbit
-        lnum_re = re.compile(r'l=\s*([0-9]+?)[^0-9]')
-        jnum_re = re.compile(r'j=\s*([0-9.]+?)[^0-9.]')
-        mjnum_re = re.compile(r'm_j=\s*([-0-9.]+?)[^-0-9.]')
-    elif not out_info_dict['collinear']:
-        # non-collinear
-        lnum_re = re.compile(r'l=\s*([0-9]+?)[^0-9]')
-        mnum_re = re.compile(r'm=\s*([-0-9]+?)[^-0-9]')
-        sznum_re = re.compile(r's_z=\s*([-0-9.]*?)[^-0-9.]')
-    else:
-        # collinear / no spin
-        lnum_re = re.compile(r'l=\s*([0-9]+?)[^0-9]')
-        mnum_re = re.compile(r'm=\s*([-0-9]+?)[^-0-9]')
-    wfc_lines = out_info_dict['wfc_lines']
-    state_lines = [out_file[wfc_line] for wfc_line in wfc_lines]
-    state_dicts = []
-    for state_line in state_lines:
-        try:
-            state_dict = {}
-            state_dict['atomnum'] = int(atomnum_re.findall(state_line)[0])
-            state_dict['atomnum'] -= 1  # to keep with orbital indexing
-            state_dict['kind_name'] = element_re.findall(state_line)[0].strip()
-            state_dict['angular_momentum'] = int(lnum_re.findall(state_line)[0])
-            if out_info_dict['spinorbit']:
-                state_dict['total_angular_momentum'] = float(jnum_re.findall(state_line)[0])
-                state_dict['magnetic_number'] = float(mjnum_re.findall(state_line)[0])
-            else:
-                if not out_info_dict['collinear']:
-                    state_dict['spin'] = float(sznum_re.findall(state_line)[0])
-                state_dict['magnetic_number'] = int(mnum_re.findall(state_line)[0])
-                state_dict['magnetic_number'] -= 1  # to keep with orbital indexing
-        except ValueError:
-            raise QEOutputParsingError('State lines are not formatted in a standard way.')
-        state_dicts.append(state_dict)
-
-    # here is some logic to figure out the value of radial_nodes to use
-    new_state_dicts = []
-    for i in range(len(state_dicts)):
-        radial_nodes = 0
-        state_dict = state_dicts[i].copy()
-        for j in range(i - 1, -1, -1):
-            if state_dict == state_dicts[j]:
-                radial_nodes += 1
-        state_dict['radial_nodes'] = radial_nodes
-        new_state_dicts.append(state_dict)
-    state_dicts = new_state_dicts
-
-    # here is some logic to assign positions based on the atom_index
-    structure = out_info_dict['structure']
-    for state_dict in state_dicts:
-        site_index = state_dict.pop('atomnum')
-        state_dict['position'] = structure.sites[site_index].position
-
-    # here we set the resulting state_dicts to a new set of orbitals
-    orbitals = []
-    if out_info_dict['spinorbit']:
-        OrbitalCls = OrbitalFactory('spinorbithydrogen')
-    elif not out_info_dict['collinear']:
-        OrbitalCls = OrbitalFactory('noncollinearhydrogen')
-    else:
-        OrbitalCls = OrbitalFactory('realhydrogen')
-    for state_dict in state_dicts:
-        orbitals.append(OrbitalCls(**state_dict))
-
-    return orbitals
-
-
-def spin_dependent_subparser(out_info_dict):
-    """This find the projection and bands arrays from the out_file and out_info_dict. Used to handle the different
-    possible spin-cases in a convenient manner.
-
-    :param out_info_dict: contains various technical internals useful in parsing
-    :return: ProjectionData, BandsData parsed from out_file
-    """
-    out_file = out_info_dict['out_file']
-    spin_down = out_info_dict['spin_down']
-    od = out_info_dict  # using a shorter name for convenience
-    #   regular expressions needed for later parsing
-    WaveFraction1_re = re.compile(r'\=(.*?)\*')  # state composition 1
-    WaveFractionremain_re = re.compile(r'\+(.*?)\*')  # state comp 2
-    FunctionId_re = re.compile(r'\#(.*?)\]')  # state identity
-    # primes arrays for the later parsing
-    num_wfc = len(od['wfc_lines'])
-    bands = np.zeros([od['k_states'], od['num_bands']])
-    projection_arrays = np.zeros([od['k_states'], od['num_bands'], num_wfc])
-
-    try:
-        for i in range(od['k_states']):
-            if spin_down:
-                i += od['k_states']
-            # grabs band energy
-            for j in range(i * od['num_bands'], (i + 1) * od['num_bands'], 1):
-                out_ind = od['e_lines'][j]
-                try:
-                    # post ~6.3 <6.5 output format "e ="
-                    val = out_file[out_ind].split()[2]
-                    float(val)
-                except ValueError:
-                    # pre ~6.3 and 6.5+ output format "==== e("
-                    val = out_file[out_ind].split(' = ')[1].split()[0]
-                bands[i % od['k_states']][j % od['num_bands']] = val
-                #subloop grabs pdos
-                wave_fraction = []
-                wave_id = []
-                for k in range(od['e_lines'][j] + 1, od['psi_lines'][j], 1):
-                    out_line = out_file[k]
-                    wave_fraction += WaveFraction1_re.findall(out_line)
-                    wave_fraction += WaveFractionremain_re.findall(out_line)
-                    wave_id += FunctionId_re.findall(out_line)
-                if len(wave_id) != len(wave_fraction):
-                    raise IndexError
-                for l in range(len(wave_id)):
-                    wave_id[l] = int(wave_id[l])
-                    wave_fraction[l] = float(wave_fraction[l])
-                    #sets relevant values in pdos_array
-                    projection_arrays[i % od['k_states']][j % od['num_bands']][wave_id[l] - 1] = wave_fraction[l]
-    except IndexError:
-        raise QEOutputParsingError('the standard out file does not comply with the official documentation.')
-
-    bands_data = BandsData()
-    kpoints = od['kpoints']
-    try:
-        if len(od['k_vect']) != len(kpoints.get_kpoints()):
-            raise AttributeError
-        bands_data.set_kpointsdata(kpoints)
-    except AttributeError:
-        bands_data.set_kpoints(od['k_vect'].astype(float))
-
-    bands_data.set_bands(bands, units='eV')
-
-    orbitals = out_info_dict['orbitals']
-    if len(orbitals) != np.shape(projection_arrays[0, 0, :])[0]:
-        raise QEOutputParsingError(
-            'There was an internal parsing error, '
-            ' the projection array shape does not agree'
-            ' with the number of orbitals'
-        )
-    projection_data = ProjectionData()
-    projection_data.set_reference_bandsdata(bands_data)
-    projections = [projection_arrays[:, :, i] for i in range(len(orbitals))]
-
-    # Do the bands_check manually here
-    for projection in projections:
-        if np.shape(projection) != np.shape(bands):
-            raise AttributeError('Projections not the same shape as the bands')
-
-    #insert here some logic to assign pdos to the orbitals
-    pdos_arrays = spin_dependent_pdos_subparser(out_info_dict)
-    energy_arrays = [out_info_dict['energy']] * len(orbitals)
-    projection_data.set_projectiondata(
-        orbitals,
-        list_of_projections=projections,
-        list_of_energy=energy_arrays,
-        list_of_pdos=pdos_arrays,
-        bands_check=False
-    )
-    # pdos=pdos_arrays
-    return bands_data, projection_data
-
-
-def natural_sort_key(sort_key, _nsre=re.compile('([0-9]+)')):
-    """Pass to ``key`` for ``str.sort`` to achieve natural sorting. For example, ``["2", "11", "1"]`` will be sorted to
-    ``["1", "2", "11"]`` instead of ``["1", "11", "2"]``
-
-    :param sort_key: Original key to be processed
-    :return: A list of string and integers.
-    """
-    keys = []
-    for text in _nsre.split(sort_key):
-        if text.isdigit():
-            keys.append(int(text))
-        else:
-            keys.append(text)
-    return keys
-
-
-def spin_dependent_pdos_subparser(out_info_dict):
-    """Finds and labels the pdos arrays associated with the out_info_dict.
-
-    :param out_info_dict: contains various technical internals useful in parsing
-    :return: (pdos_name, pdos_array) tuples for all the specific pdos
-    """
-    spin = out_info_dict['spin']
-    collinear = out_info_dict.get('collinear', True)
-    spinorbit = out_info_dict.get('spinorbit', False)
-    spin_down = out_info_dict['spin_down']
-    pdos_atm_array_dict = out_info_dict['pdos_atm_array_dict']
-    if spin:
-        mult_factor = 2
-        if spin_down:
-            first_array = 4
-        else:
-            first_array = 3
-    else:
-        mult_factor = 1
-        first_array = 2
-    mf = mult_factor
-    fa = first_array
-    pdos_file_names = [k for k in pdos_atm_array_dict]
-    pdos_file_names.sort(key=natural_sort_key)
-    out_arrays = []
-    # we can keep the pdos in synch with the projections by relying on the fact
-    # both are produced in the same order (thus the sorted file_names)
-    for name in pdos_file_names:
-        this_array = pdos_atm_array_dict[name]
-        if not collinear and not spinorbit:
-            # In the non-collinear, non-spinorbit case, the "up"-spin orbitals
-            # come first, followed by all "down" orbitals
-            for i in range(3, np.shape(this_array)[1], 2):
-                out_arrays.append(this_array[:, i])
-            for i in range(4, np.shape(this_array)[1], 2):
-                out_arrays.append(this_array[:, i])
-        else:
-            for i in range(fa, np.shape(this_array)[1], mf):
-                out_arrays.append(this_array[:, i])
-
-    return out_arrays
+Dict = DataFactory('dict')
+XyData = DataFactory('array.xy')
+BandsData = DataFactory('array.bands')
+ProjectionData = DataFactory('array.projection')
+StructureData = DataFactory('structure')
 
 
 class ProjwfcParser(Parser):
-    """This class is the implementation of the Parser class for projwfc.x in Quantum Espresso.
+    """This class is the implementation of the Parser class for the ``projwfc.x`` code  in Quantum ESPRESSO.
 
     Parses projection arrays that map the projection onto each point in the bands structure, as well as pdos arrays,
     which map the projected density of states onto an energy axis.
     """
 
     def parse(self, **kwargs):
-        """Parses the datafolder, stores results.
+        """Parses the retrieved files of the ``ProjwfcCalculation`` and converts them into output nodes.
 
         Retrieves projwfc output, and some basic information from the out_file, such as warnings and wall_time
         """
-        # Check that the retrieved folder is there
-        retrieved = self.retrieved
         retrieve_temporary_list = self.node.get_attribute('retrieve_temporary_list', None)
 
         # If temporary files were specified, check that we have them
@@ -293,79 +44,74 @@ def parse(self, **kwargs):
             except KeyError:
                 return self.exit(self.exit_codes.ERROR_NO_RETRIEVED_TEMPORARY_FOLDER)
 
-        # Read standard out
-        try:
-            filename_stdout = self.node.get_option('output_filename')  # or get_attribute(), but this is clearer
-            with retrieved.open(filename_stdout, 'r') as fil:
-                out_file = fil.readlines()
-        except OSError:
-            return self.exit(self.exit_codes.ERROR_OUTPUT_STDOUT_READ)
+        header, kpoint_blocks, lowdin_block = self.read_stdout()
 
-        job_done = False
-        for i in range(len(out_file)):
-            line = out_file[-i]
-            if 'JOB DONE' in line:
-                job_done = True
-                break
-        if not job_done:
-            return self.exit(self.exit_codes.ERROR_OUTPUT_STDOUT_INCOMPLETE)
-
-        # Parse basic info and warnings, and output them as output_parmeters
-        parsed_data, logs = parse_output_base(out_file, 'PROJWFC')
-        self.emit_logs(logs)
-        self.out('output_parameters', Dict(dict=parsed_data))
+        output_parameters = {
+            'code_version': re.search(r'Program\sPROJWFC\sv\.([\.\d]+)\sstarts', header).groups()[0],
+            'wall_time_seconds': convert_qe_time_to_sec(re.search(r'CPU(.+)WALL', lowdin_block).groups()[0]),
+        }
+        self.out('output_parameters', Dict(dict=output_parameters))
 
         # Parse the XML to obtain the `structure`, `kpoints` and spin-related settings from the parent calculation
         parsed_xml, logs_xml = self._parse_xml(retrieved_temporary_folder)
         self.emit_logs(logs_xml)
 
-        # we create a dictionary the progressively accumulates more info
-        out_info_dict = {}
+        structure = convert_qe2aiida_structure(parsed_xml['structure'])
+        kpoints = convert_qe_to_kpoints(parsed_xml, structure)
 
-        out_info_dict['structure'] = convert_qe2aiida_structure(parsed_xml['structure'])
-        out_info_dict['kpoints'] = convert_qe_to_kpoints(parsed_xml, out_info_dict['structure'])
-        out_info_dict['nspin'] = parsed_xml.get('number_of_spin_components')
-        out_info_dict['collinear'] = not parsed_xml.get('non_colinear_calculation')
-        out_info_dict['spinorbit'] = parsed_xml.get('spin_orbit_calculation')
-        out_info_dict['spin'] = out_info_dict['nspin'] == 2
+        nspin = parsed_xml.get('number_of_spin_components')
+        spinorbit = parsed_xml.get('spin_orbit_calculation')
+        non_collinear = parsed_xml.get('non_colinear_calculation')
 
-        # check and read pdos_tot file
-        out_filenames = retrieved.list_object_names()
-        try:
-            pdostot_filename = fnmatch.filter(out_filenames, '*pdos_tot*')[0]
-            with retrieved.open(pdostot_filename, 'r') as pdostot_file:
-                # Columns: Energy(eV), Ldos, Pdos
-                pdostot_array = np.atleast_2d(np.genfromtxt(pdostot_file))
-                energy = pdostot_array[:, 0]
-                dos = pdostot_array[:, 1]
-        except (OSError, KeyError):
-            return self.exit(self.exit_codes.ERROR_READING_PDOSTOT_FILE)
+        orbitals = self._parse_orbitals(header, structure, non_collinear, spinorbit)
+        bands, projections = self._parse_bands_and_projections(kpoint_blocks, len(orbitals))
+        energy, dos, pdos_array = self._parse_pdos_files(nspin, spinorbit)
 
-        # check and read all of the individual pdos_atm files
-        pdos_atm_filenames = fnmatch.filter(out_filenames, '*pdos_atm*')
-        pdos_atm_array_dict = {}
-        for name in pdos_atm_filenames:
-            with retrieved.open(name, 'r') as pdosatm_file:
-                pdos_atm_array_dict[name] = np.atleast_2d(np.genfromtxt(pdosatm_file))
-
-        # finding the bands and projections
-        out_info_dict['out_file'] = out_file
-        out_info_dict['energy'] = energy
-        out_info_dict['pdos_atm_array_dict'] = pdos_atm_array_dict
-        try:
-            new_nodes_list = self._parse_bands_and_projections(out_info_dict)
-        except QEOutputParsingError as err:
-            self.logger.error(f'Error parsing bands and projections: {err}')
-            return self.exit(self.exit_codes.ERROR_PARSING_PROJECTIONS)
-        for linkname, node in new_nodes_list:
+        output_node_dict = self._build_bands_and_projections(
+            kpoints, bands, energy, orbitals, projections, pdos_array, nspin
+        )
+        for linkname, node in output_node_dict.items():
             self.out(linkname, node)
 
-        Dos_out = XyData()
-        Dos_out.set_x(energy, 'Energy', 'eV')
-        Dos_out.set_y(dos, 'Dos', 'states/eV')
-        self.out('Dos', Dos_out)
+        dos_out = XyData()
+        dos_out.set_x(energy, 'Energy', 'eV')
+        dos_out.set_y(dos, 'Dos', 'states/eV')
+        self.out('Dos', dos_out)
+
+    def read_stdout(self) -> Tuple[str, list, str]:
+        """Read the standard output.
+
+        The method will make sure that the standard output file is present and that the job has completed, and exit
+        with the corresponding exit code if not.
 
-    def _parse_xml(self, retrieved_temporary_folder):
+        :return: tuple with three outputs:
+
+            * ``header``: all output up to the first block of k-point outputs.
+            * ``kpoint_blocks``: a list of text blocks with energies/projections for each k-point
+            * ``lowdin_block``: all output after the k-point blocks, i.e. after "Lowdin Charges:" has been printed
+        """
+        filename_stdout = self.node.get_attribute('output_filename')
+
+        try:
+            with self.retrieved.open(filename_stdout, 'r') as handle:
+                stdout = handle.read()
+        except OSError:
+            return self.exit(self.exit_codes.ERROR_OUTPUT_STDOUT_READ)
+
+        # Split the stdout into header, k-point blocks and the (final) Lowdin block
+        stdout_blocks = stdout.split('Lowdin Charges:')
+        lowdin_block = stdout_blocks[-1]
+        stdout_blocks = stdout_blocks[0].split('k = ')
+        header = stdout_blocks[0]
+        kpoint_blocks = stdout_blocks[1:]
+
+        # The Lowdin block also contains the footer, so check for 'JOB DONE' there to see if the stdout is complete
+        if not 'JOB DONE' in lowdin_block:
+            return self.exit(self.exit_codes.ERROR_OUTPUT_STDOUT_INCOMPLETE)
+
+        return header, kpoint_blocks, lowdin_block
+
+    def _parse_xml(self, retrieved_temporary_folder) -> Tuple[dict, AttributeDict]:
         """Parse the XML file.
 
         The XML must be parsed in order to obtain the required information for the orbital parsing.
@@ -392,7 +138,218 @@ def _parse_xml(self, retrieved_temporary_folder):
 
         return parsed_xml, logs
 
-    def _parse_bands_and_projections(self, out_info_dict):
+    @staticmethod
+    def _parse_orbitals(header: str, structure: StructureData, non_collinear: bool, spinorbit: bool) -> list:
+        """Parse the orbitals from the stdout header.
+
+        This method reads in all the state lines - that is, the lines describing which atomic states taken from the
+        pseudopotential are used for the projections. Then it converts these state lines into a set of orbitals.
+
+        :param header: the header block of text before the first k-point is printed.
+        :param structure: the input structure.
+        :param non_collinear: True if the calculation is non-collinear
+        :param spinorbit: True if the calculation used spin-orbit coupling
+
+        :return: orbitals, a list of orbitals suitable for setting ProjectionData
+        """
+        # Format of statelines
+        # From PP/src/projwfc.f90: (since Oct. 8 2019)
+        #
+        # 1000 FORMAT (5x,"state #",i4,": atom ",i3," (",a3,"), wfc ",i2," (l=",i1)
+        # IF (lspinorb) THEN
+        # 1001 FORMAT (" j=",f3.1," m_j=",f4.1,")")
+        # ELSE IF (noncolin) THEN
+        # 1002 FORMAT (" m=",i2," s_z=",f4.1,")")
+        # ELSE
+        # 1003 FORMAT (" m=",i2,")")
+        # ENDIF
+        #
+        # Before:
+        # IF (lspinorb) THEN
+        # ...
+        # 1000    FORMAT (5x,"state #",i4,": atom ",i3," (",a3,"), wfc ",i2, &
+        #               " (j=",f3.1," l=",i1," m_j=",f4.1,")")
+        # ELSE
+        # ...
+        # 1500    FORMAT (5x,"state #",i4,": atom ",i3," (",a3,"), wfc ",i2, &
+        #               " (l=",i1," m=",i2," s_z=",f4.1,")")
+        # ENDIF
+
+        # Based on the formats above, set up the orbital regex patterns, with the corresponding keys and types
+        if spinorbit:
+            # FORMAT (5x,"state #",i4,": atom ",i3," (",a3,"), wfc ",i2," (l=",i1," j=",f3.1," m_j=",f4.1,")")
+            orbital_pattern = re.compile(
+                r'state\s#\s*\d+:\satom\s+(\d+)\s\((\S+)\s*\),\swfc\s+\d+\s\(l=(\d)\sj=\s*(.*)\sm_j=(.*)\)'
+            )
+            orbital_keys = ('atomnum', 'kind_name', 'angular_momentum', 'total_angular_momentum', 'magnetic_number')
+            orbital_types = (int, str, int, float, float)
+        elif non_collinear:
+            # FORMAT (5x,"state #",i4,": atom ",i3," (",a3,"), wfc ",i2," (l=",i1," m=",i2," s_z=",f4.1,")")
+            orbital_pattern = re.compile(
+                r'state\s#\s*\d+:\satom\s+(\d+)\s\((\S+)\s*\),\swfc\s+\d+\s\(l=(\d)\sm=\s?(\d+)\ss_z=(.*)\)'
+            )
+            orbital_keys = ('atomnum', 'kind_name', 'angular_momentum', 'magnetic_number', 'spin')
+            orbital_types = (int, str, int, int, float)
+        else:
+            # This works for both collinear / no spin
+            # FORMAT (5x,"state #",i4,": atom ",i3," (",a3,"), wfc ",i2," (l=",i1," m=",i2,")")
+            orbital_pattern = re.compile(
+                r'state\s#\s*\d+:\satom\s+(\d+)\s\((\S+)\s*\),\swfc\s+\d+\s\(l=(\d)\sm=\s?(\d+)\)'
+            )
+            orbital_keys = ('atomnum', 'kind_name', 'angular_momentum', 'magnetic_number')
+            orbital_types = (int, str, int, int)
+
+        orbital_dicts = []
+
+        for orbital_data in orbital_pattern.findall(header):
+            orbital_dict = {
+                key: data_type(data) for key, data_type, data in zip(orbital_keys, orbital_types, orbital_data)
+            }
+            orbital_dict['atomnum'] -= 1  # convert to zero indexing
+            orbital_dict['magnetic_number'] -= int(not spinorbit)  # convert to zero indexing, except for spinorbit
+            orbital_dicts.append(orbital_dict)
+
+        # Figure out the value of radial_nodes
+        new_orbital_dicts = []
+        for i, orbital_dict in enumerate(orbital_dicts):
+            radial_nodes = 0
+            new_orbital_dict = orbital_dict.copy()
+            for j in range(i - 1, -1, -1):
+                if new_orbital_dict == orbital_dicts[j]:
+                    radial_nodes += 1
+            new_orbital_dict['radial_nodes'] = radial_nodes
+            new_orbital_dicts.append(new_orbital_dict)
+        orbital_dicts = new_orbital_dicts
+
+        # Assign positions based on the atom_index
+        for new_orbital_dict in orbital_dicts:
+            site_index = new_orbital_dict.pop('atomnum')
+            new_orbital_dict['position'] = structure.sites[site_index].position
+
+        # Convert the resulting orbital_dicts into the list of orbitals
+        orbitals = []
+        if spinorbit:
+            orbital_class = OrbitalFactory('spinorbithydrogen')
+        elif non_collinear:
+            orbital_class = OrbitalFactory('noncollinearhydrogen')
+        else:
+            orbital_class = OrbitalFactory('realhydrogen')
+        for new_orbital_dict in orbital_dicts:
+            orbitals.append(orbital_class(**new_orbital_dict))
+
+        return orbitals
+
+    @staticmethod
+    def _parse_bands_and_projections(kpoint_blocks: list, num_orbitals: int) -> Tuple[ArrayLike, ArrayLike]:
+        """Parse the bands energies and orbital projections from the kpoint blocks in the stdout.
+
+        :param kpoint_blocks: list of blocks for each k-point that contain the energies and projections in the stdout.
+        :param num_orbitals: number of orbitals used for the projections.
+
+        :return:
+        """
+
+        # Parse the bands energies
+        energy_pattern = re.compile(r'====\se\(\s*\d+\)\s=\s*(\S+)\seV\s====')
+        bands = np.array([energy_pattern.findall(block) for block in kpoint_blocks])
+
+        # Parse the projection arrays
+        energy_pattern = re.compile(r'\n====.+==== \n')
+        psi_pattern = re.compile(r'([.\d]+)\*\[#\s*(\d+)\]')
+
+        projections = []
+
+        for block in kpoint_blocks:
+
+            kpoint_projections = []
+
+            for band_projections in re.split(energy_pattern, block)[1:]:
+
+                proj_array = np.zeros(num_orbitals)
+
+                for [projection_value, orbital_index] in psi_pattern.findall(band_projections):
+                    proj_array[int(orbital_index) - 1] = projection_value
+
+                kpoint_projections.append(proj_array)
+
+            projections.append(kpoint_projections)
+
+        projections = np.array(projections)
+
+        return bands, projections
+
+    def _parse_pdos_files(self, nspin: int, spinorbit: bool) -> Tuple[ArrayLike, ArrayLike, ArrayLike]:
+        """Finds and labels the pdos arrays associated with the out_info_dict.
+
+        :param out_info_dict: contains various technical internals useful in parsing
+        :return: (pdos_name, pdos_array) tuples for all the specific pdos
+        """
+
+        def natural_sort_key(sort_key, _nsre=re.compile('([0-9]+)')):
+            """Pass to ``key`` for ``str.sort`` to achieve natural sorting. For example, ``["2", "11", "1"]`` will be sorted to
+            ``["1", "2", "11"]`` instead of ``["1", "11", "2"]``
+
+            :param sort_key: Original key to be processed
+            :return: A list of string and integers.
+            """
+            keys = []
+            for text in _nsre.split(sort_key):
+                if text.isdigit():
+                    keys.append(int(text))
+                else:
+                    keys.append(text)
+            return keys
+
+        # Read the `pdos_tot` file
+        out_filenames = self.retrieved.list_object_names()
+        try:
+            pdostot_filename = fnmatch.filter(out_filenames, '*pdos_tot*')[0]
+            with self.retrieved.open(pdostot_filename, 'r') as pdostot_file:
+                # Columns: Energy(eV), Ldos, Pdos
+                pdostot_array = np.atleast_2d(np.genfromtxt(pdostot_file))
+                energy = pdostot_array[:, 0]
+                dos = pdostot_array[:, 1]
+        except (OSError, KeyError):
+            return self.exit(self.exit_codes.ERROR_READING_PDOSTOT_FILE)
+
+        # We're only interested in the PDOS, so we skip the first columns corresponding to the energy and LDOS
+        if nspin == 1 or spinorbit:
+            first_pdos_column = 2
+        else:
+            first_pdos_column = 3
+
+        # Read the `pdos_atm` files
+        pdos_atm_array_dict = {}
+        for filename in fnmatch.filter(out_filenames, '*pdos_atm*'):
+            with self.retrieved.open(filename, 'r') as pdosatm_file:
+                pdos_atm_array_dict[filename] = np.atleast_2d(np.genfromtxt(pdosatm_file))[:, first_pdos_column:]
+
+        # Keep the pdos in sync with the orbitals by properly sorting the filenames
+        pdos_file_names = [k for k in pdos_atm_array_dict]
+        pdos_file_names.sort(key=natural_sort_key)
+
+        # Make sure the order of the PDOS columns matches with the orbitals
+        if nspin != 4 or spinorbit:
+            # Simply concatenate the PDOS columns as they are ordered by the filenames
+            pdos_array = np.concatenate([pdos_atm_array_dict[name] for name in pdos_file_names], axis=1)
+            if nspin == 2:
+                # Reshuffle the order of the columns so the 'up' spin columns are first
+                pdos_array = np.concatenate([pdos_array[:, 0::2], pdos_array[:, 1::2]], axis=1)
+            if nspin == 4:
+                # Reshuffle the order of the columns in accordance with order of orbitals for spin-orbit
+                pdos_array = np.concatenate([pdos_array[:, 1::2], pdos_array[:, 0::2]], axis=1)
+        else:
+            # Here all the 'up' orbitals for each l number come first, so the PDOS columns must be sorted accordingly
+            pdos_list = []
+            for wfc_pdos_array in [pdos_atm_array_dict[name] for name in pdos_file_names]:
+                # Reshuffle the PDOS columns so the 'up' spin columns _for this l number_ come first
+                pdos_list.append(np.concatenate([wfc_pdos_array[:, 0::2], wfc_pdos_array[:, 1::2]], axis=1))
+            pdos_array = np.concatenate(pdos_list, axis=1)
+
+        return energy, dos, pdos_array
+
+    @classmethod
+    def _build_bands_and_projections(cls, kpoints, bands, energy, orbitals, projections, pdos_array, nspin):
         """Function that parses the standard output into bands and projection data.
 
         :param out_info_dict: used to pass technical internal variables
@@ -400,70 +357,55 @@ def _parse_bands_and_projections(self, out_info_dict):
         :return: append_nodes_list a list containing BandsData and
                  ProjectionData parsed from standard_out
         """
-        out_file = out_info_dict['out_file']  # Note: we expect a list of lines
-        out_info_dict['k_lines'] = []
-        out_info_dict['e_lines'] = []
-        out_info_dict['psi_lines'] = []
-        out_info_dict['wfc_lines'] = []
-        append_nodes_list = []
-
-        for i, line in enumerate(out_file):
-            if 'k =' in line:
-                out_info_dict['k_lines'].append(i)
-            if '==== e(' in line or line.strip().startswith('e ='):
-                # The energy format in output was changed in QE6.3
-                # this check supports old and new format
-                out_info_dict['e_lines'].append(i)
-            if '|psi|^2' in line:
-                out_info_dict['psi_lines'].append(i)
-            if 'state #' in line:
-                out_info_dict['wfc_lines'].append(i)
-
-        # Basic check
-        if len(out_info_dict['e_lines']) != len(out_info_dict['psi_lines']):
-            raise QEOutputParsingError('e-lines and psi-lines are in different number')
-        if len(out_info_dict['psi_lines']) % len(out_info_dict['k_lines']) != 0:
-            raise QEOutputParsingError('Band Energy Points is not a multiple of kpoints')
-        # calculates the number of bands
-        out_info_dict['num_bands'] = len(out_info_dict['psi_lines']) // len(out_info_dict['k_lines'])
-
-        # changes k-numbers to match spin
-        # because if spin is on, k points double for up and down
-        out_info_dict['k_states'] = len(out_info_dict['k_lines'])
-        if out_info_dict['spin']:
-            if out_info_dict['k_states'] % 2 != 0:
-                raise QEOutputParsingError('Internal formatting error regarding spin')
-            out_info_dict['k_states'] = out_info_dict['k_states'] // 2
-
-        # adds in the k-vector for each kpoint
-        k_vect = [out_file[out_info_dict['k_lines'][i]].split()[2:] for i in range(out_info_dict['k_states'])]
-        out_info_dict['k_vect'] = np.array(k_vect)
-        out_info_dict['orbitals'] = find_orbitals_from_statelines(out_info_dict)
-
-        spin = out_info_dict['spin']
-
-        if spin:
-            # I had to guess what the ordering of the spin is, because
-            # the projwfc.x documentation doesn't say, but looking at the
-            # source code I found:
+        num_kpoints = len(kpoints.get_kpoints())
+        num_orbitals = len(orbitals)
+
+        bands_data, projection_data = cls._intialize_bands_projection_data(
+            kpoints, bands[:num_kpoints, :], energy, orbitals, projections[:num_kpoints, :, :],
+            pdos_array[:, :num_orbitals]
+        )
+        if nspin == 2:
+            # For collinear spin-polarised calculations, each orbital has an 'up' and 'down' projection.
+            # Based on the Quantum ESPRESSO source code:
             #
             # DO is=1,nspin
             #   IF (nspin==2) THEN
             #       IF (is==1) filename=trim(filproj)//'.up'
             #       IF (is==2) filename=trim(filproj)//'.down'
             #
-            # Which would say that it is reasonable to assume that the
-            # spin up states are written first, then spin down
-            #
-            out_info_dict['spin_down'] = False
-            bands_data1, projection_data1 = spin_dependent_subparser(out_info_dict)
-            append_nodes_list += [('projections_up', projection_data1), ('bands_up', bands_data1)]
-            out_info_dict['spin_down'] = True
-            bands_data2, projection_data2 = spin_dependent_subparser(out_info_dict)
-            append_nodes_list += [('projections_down', projection_data2), ('bands_down', bands_data2)]
-        else:
-            out_info_dict['spin_down'] = False
-            bands_data, projection_data = spin_dependent_subparser(out_info_dict)
-            append_nodes_list += [('projections', projection_data), ('bands', bands_data)]
-
-        return append_nodes_list
+            # It is reasonable to assume that the spin up states are written first, then spin down in the stdout.
+            # So the first/second half of the `bands` and `projections` correspond to spin up/down.
+            # The `pdos_arrays` are constructed to match this order, i.e. the first len(orbitals) correspond to 'up'.
+            # The 'up' bands and projections have already been initialized above, the 'down' we do now.
+            bands_data_down, projection_data_down = cls._intialize_bands_projection_data(
+                kpoints, bands[num_kpoints:, :], energy, orbitals, projections[num_kpoints:, :, :],
+                pdos_array[:, num_orbitals:]
+            )
+            return {
+                'bands_up': bands_data,
+                'bands_down': bands_data_down,
+                'projections_up': projection_data,
+                'projections_down': projection_data_down,
+            }
+
+        return {'bands': bands_data, 'projections': projection_data}
+
+    @staticmethod
+    def _intialize_bands_projection_data(kpoints, bands, energy, orbitals, projections, pdos_array):
+        """Initialize an instance of ``BandsData`` and correpsonding ``ProjectionData``."""
+        bands_data = BandsData()
+        bands_data.set_kpointsdata(kpoints)
+        bands_data.set_bands(bands, units='eV')
+
+        projection_data = ProjectionData()
+        projection_data.set_reference_bandsdata(bands_data)
+
+        energy_arrays = [energy] * len(orbitals)
+        projection_data.set_projectiondata(
+            orbitals,
+            list_of_projections=[projections[:, :, i] for i in range(len(orbitals))],
+            list_of_energy=energy_arrays,
+            list_of_pdos=[pdos_array[:, i] for i in range(len(orbitals))],
+            bands_check=False
+        )
+        return bands_data, projection_data