Skip to content

Commit

Permalink
Uploaded MPtrj filtering criterions for better documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
bowen-bd committed Dec 9, 2023
1 parent db40e05 commit c2ef048
Showing 1 changed file with 262 additions and 0 deletions.
262 changes: 262 additions & 0 deletions examples/QueryMPtrj.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
# MPtrj Dataset Query and Parsing criterion

This file documents the major filters that were used when parsing
Materials Project trajectory dataset in Sept.2022.
Exact reproducibility of MPtrj is challenging since the evolution of MP dataset and `MPRester` syntax

### Define your MPRester

```python
from mp_api.client import MPRester
mpr = MPRester(MP_API_KEY)
```

### Get the material_ids for all MP entries

```python
material_ids = mpr.summary.search(
energy_above_hull = [0, 100],
fields = ['material_id']
)
```

### Get GGA and GGA+U task_ids for all the materials in MP

```python
opt_task_types = [
'GGA Static', 'GGA Structure Optimization',
'GGA+U Static', 'GGA+U Structure Optimization'
]

optimization_task_ids = {}
for doc in material_ids:
material_id = doc.material_id
tmp = mpr.materials.get_data_by_id(material_id)

for task_id, task_type in tmp.calc_types.items():
if task_type in opt_task_types:
optimization_task_ids[material_id.string].append(task_id)
```

### Query Materials Project Thermodoc entry and the relaxation tasks

The thermodoc entry is the entry you normally see on the MP website

```python
# ThermoDoc: Query MP main entries
main_entry = mpr.get_entry_by_material_id(material_id=material_id)[0]
# Query one relaxation task
taskdoc = mpr.tasks.get_data_by_id(task_id, fields=["input", "output", "calcs_reversed", 'task_id', "run_type"])
```

## Filtering the data

This is done in two steps:

1. Loop through queried relaxation tasks and check whether the task are compatible
2. Loop through compatible tasks and check whether the frames are valid and unique

### Task Check

Check whether a task is compatible to Materials Project main entry, by comparing its DFT settings
and converged results with MP main entry.

- Note this step can no longer work for the current MP data, since a lot of `thermodoc` entry (main entry) have changed to `r2SCAN`

```python
def calc_type_equal(
taskdoc,
main_entry,
trjdata
) -> bool:
# Check the LDAU of task
try:
is_hubbard = taskdoc.calcs_reversed[0].input['parameters']['LDAU']
except:
is_hubbard = taskdoc.calcs_reversed[0].input['incar']['LDAU']

# Make sure we don't include both GGA and GGA+U for the same mp_id
if main_entry.parameters['is_hubbard'] != is_hubbard:
print(f'{main_entry.entry_id}, {taskdoc.task_id} is_hubbard= {is_hubbard}')
trjdata.exception[
taskdoc.task_id] = f'is_hubbard inconsistent task is_hubbard={is_hubbard}'
return False
elif is_hubbard == True:
# If the task is calculated with GGA+U
# Make sure the +U values are the same for each element
composition = taskdoc.output.structure.composition
hubbards = {element.symbol: U for element, U in
zip(composition.elements,
taskdoc.calcs_reversed[0].input['incar']['LDAUU'])}
if main_entry.parameters['hubbards'] != hubbards:
thermo_hubbards = main_entry.parameters['hubbards']
trjdata.exception[
taskdoc.task_id] = f'hubbards inconsistent task hubbards={hubbards}, thermo hubbards={thermo_hubbards}'
return False
else:
# Check the energy convergence of the task wrt. the main entry
return check_energy_convergence(
taskdoc,
main_entry.uncorrected_energy_per_atom,
trjdata=trjdata
)
else:
# Check energy convergence for pure GGA tasks
check_energy_convergence(
taskdoc,
main_entry.uncorrected_energy_per_atom,
trjdata=trjdata
)

def check_energy_convergence(
taskdoc,
relaxed_entry_uncorrected_energy_per_atom,
trjdata
) -> bool:
task_energy = taskdoc.calcs_reversed[0].output['ionic_steps'][-1]['e_fr_energy']
n_atom = taskdoc.calcs_reversed[0].output['ionic_steps'][-1][
'structure'].composition.num_atoms
e_per_atom = task_energy / n_atom
# This is the energy difference of the last frame of the task vs main_entry energy
e_diff = abs(e_per_atom - relaxed_entry_uncorrected_energy_per_atom)

if e_diff < 0.02:
# The task is properly relaxed if the final frame has less than 20 meV/atom difference
return True
else:
# The task is falsely relaxed, we will discard the whole task
# This step will filter out tasks that relaxed into different spin states
# that caused large energy discrepancies
trjdata.exception[taskdoc.task_id] =
f'e_diff is too large, '
f'task last step energy_per_atom = {e_per_atom}, '
f'relaxed_entry_uncorrected_e_per_atom = {relaxed_entry_uncorrected_energy_per_atom}'
return False
```

### Frame Check

For trajectories(tasks) that are compatible, we further check each of its frames

```python
from pymatgen.analysis.structure_matcher import StructureMatcher

class Uniqueness_check(object):
'''
check whether a frame in trajectory is valid and unique
'''

def __init__(
self,
main_entry_uncorrected_energy_per_atom,
trj_data,
ltol=0.002,
stol=0.001,
angle_tol=0.05
):
self.unique_struct_list = []
self.relaxed_entry_uncorrected_energy_per_atom = main_entry_uncorrected_energy_per_atom
self.added_relaxed = False
self.matcher = StructureMatcher(
ltol=ltol,
stol=stol,
angle_tol=angle_tol,
scale=False
)
self.energy_threshold = 0.002
self.trj_data = trj_data

def is_unique(self, step, struct_id, NELM, mag=None):
self.adjust_matcher(mag=mag)
struct = step['structure']
energy = step['e_fr_energy'] / struct.composition.num_atoms

# Check whether a frame is valid
# Discard frame with no energy
if energy == None:
self.trj_data.exception[struct_id] = 'no energy'
return False

# Always accept the relaxed frame on Materials Project website
if abs(energy - self.relaxed_entry_uncorrected_energy_per_atom) < 1e-5
and self.added_relaxed == False:
# we prioritize to add the relaxed entry
self.unique_struct_list.append((energy, struct))
self.added_relaxed = True
return True

# Discard frame with electronic step not converged
if len(step['electronic_steps']) == NELM:
self.trj_data.exception[struct_id] = 'electronic step not converged'
return False

e_diff = energy - self.relaxed_entry_uncorrected_energy_per_atom
if e_diff < -0.01:
# Discard frame that is more stable than the Materials Project website frame
self.trj_data.exception[
struct_id] = f'ediff_per_atom = {e_diff}, energy is too low compared to relaxed entry'
return False
elif e_diff > 1:
# Discard frame that is too unstable than the Materials Project website frame
self.trj_data.exception[
struct_id] = f'ediff_per_atom = {e_diff}, energy is too high compared to relaxed entry'
return False

# Now we're in uniqueness check
# Accept the frame if we still have no frame parsed
if len(self.unique_struct_list) == 0:
self.unique_struct_list.append((energy, struct))
return True

n_atom = struct.composition.num_atoms
min_e_diff = min([abs(energy - tmp[0]) / n_atom
for tmp in self.unique_struct_list])

if min_e_diff > self.energy_threshold:
# Accept the frame if its energy is different from all parsed frames
self.unique_struct_list.append((energy, struct))
return True

# Discard the frame if it's structural similar to another frame in the unique_struc_list
for (unique_energy, unique_struct) in self.unique_struct_list:
if self.matcher.fit(struct, unique_struct):
return False

# Accept the frame
self.unique_struct_list.append((energy, struct))
return True

def adjust_matcher(self, mag):
if mag != None:
# prioritize frame with mag
self.matcher = StructureMatcher(ltol=0.002,
stol=0.001,
angle_tol=0.05,
scale=False)
self.energy_threshold = 0.002

if len(self.unique_struct_list) > 50:
self.matcher = StructureMatcher(ltol=0.3,
stol=0.2,
angle_tol=3,
scale=False)
self.energy_threshold = 0.03
elif len(self.unique_struc_list) > 30:
self.matcher = StructureMatcher(ltol=0.1,
stol=0.1,
angle_tol=1,
scale=False)
self.energy_threshold = 0.01
elif len(self.unique_struct_list) > 10:
self.matcher = StructureMatcher(ltol=0.01,
stol=0.01,
angle_tol=0.1,
scale=False)
self.energy_threshold = 0.005
else:
self.matcher = StructureMatcher(ltol=0.002,
stol=0.001,
angle_tol=0.05,
scale=False)
self.energy_threshold = 0.002
```

0 comments on commit c2ef048

Please sign in to comment.