"""Utility functions for acquiring, processing, or modifiying entries."""
from __future__ import annotations
import itertools
import re
import warnings
from copy import deepcopy
from typing import TYPE_CHECKING
from pymatgen.core.composition import Element
from pymatgen.core.structure import Structure
from pymatgen.entries.compatibility import MaterialsProject2020Compatibility
from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
from rxn_network.core import Composition
from rxn_network.entries.entry_set import GibbsEntrySet
from rxn_network.utils.funcs import get_logger
if TYPE_CHECKING:
from collections.abc import Iterable
from maggma.stores import MongoStore
logger = get_logger(__name__)
[docs]
def process_entries(
entries: Iterable[ComputedStructureEntry],
temperature: float,
e_above_hull: float,
filter_at_temperature: int | None = None,
include_nist_data: bool = True,
include_freed_data: bool = False,
include_polymorphs: bool = False,
formulas_to_include: Iterable[str] | None = None,
calculate_e_above_hulls: bool = False,
ignore_nist_solids: bool = True,
) -> GibbsEntrySet:
"""Convenience function for processing a set of ComputedStructureEntry objects into a
GibbsEntrySet with specified parameters. This is used when building entries in most
of the jobs/flows.
Args:
entries: Iterable of ComputedStructureEntry objects. These can be downloaded
from The Materials Project API or created manually with pymatgen.
temperature: Temperature [K] for determining Gibbs Free Energy of
formation, dGf(T).
e_above_hull: Energy above hull (eV/atom) for thermodynamic stability threshold;
i.e., include all entries with energies below this value.
filter_at_temperature: Temperature (in Kelvin) at which entries are filtered for
thermodynamic stability (e.g., room temperature). Generally, this often
differs from the synthesis temperature.
include_nist_data: Whether to include NIST-JANAF data in the entry set.
Defaults to True.
include_freed_data: Whether to include FREED data in the entry set. Defaults
to False. WARNING: This dataset has not been thoroughly tested. Use at
your own risk!
include_polymorphs: Whether to include non-ground state polymorphs in the entry
set. Defaults to False.
formulas_to_include: An iterable of compositional formulas to ensure are
included in the processed dataset. Sometimes, entries are filtered out that
one would like to include, or entries don't exist for those compositions.
calculate_e_above_hulls: Whether to calculate e_above_hull and store as an
attribute in the data dictionary for each entry.
ignore_nist_solids: Whether to ignore NIST data for solids with high melting
points (Tm >= 1500 ÂșC). Defaults to True.
Returns:
A GibbsEntrySet object containing entry objects with the user-specified
constraints.
"""
temp = temperature
if filter_at_temperature:
temp = filter_at_temperature
entry_set = GibbsEntrySet.from_computed_entries(
entries=entries,
temperature=temp,
include_nist_data=include_nist_data,
include_freed_data=include_freed_data,
ignore_nist_solids=ignore_nist_solids,
)
included_entries = [initialize_entry(f, entry_set) for f in formulas_to_include] if formulas_to_include else []
entry_set = entry_set.filter_by_stability(e_above_hull=e_above_hull, include_polymorphs=include_polymorphs)
entry_set.update(included_entries) # make sure these aren't filtered out
if filter_at_temperature and (filter_at_temperature != temperature):
entry_set = entry_set.get_entries_with_new_temperature(temperature)
if calculate_e_above_hulls:
entry_set = GibbsEntrySet(deepcopy(entry_set), calculate_e_above_hulls=True)
return entry_set
[docs]
def initialize_entry(formula: str, entry_set: GibbsEntrySet, stabilize: bool = False):
"""Acquire an entry by user-specified formula. This method attemps to first
get the entry; if it is not included in the set, it will create an interpolated
entry. Finally, if stabilize=True, the energy will be lowered until it appears on
teh hull.
Args:
formula: Chemical formula
entry_set: Set of entries
stabilize: Whether or not to stabilize the entry by decreasing its energy
such that it is 'on the hull'.
"""
try:
entry = entry_set.get_min_entry_by_formula(formula)
except KeyError:
entry = entry_set.get_interpolated_entry(formula)
warnings.warn(f"Using interpolated entry for {entry.composition.reduced_formula}")
if stabilize:
entry = entry_set.get_stabilized_entry(entry)
return entry
[docs]
def get_entries(
db: MongoStore,
chemsys_formula_id_criteria: str | dict,
compatible_only: bool = True,
inc_structure: str | None = None,
property_data: list[str] | None = None,
use_premade_entries: bool = False,
conventional_unit_cell: bool = False,
sort_by_e_above_hull: bool = False,
): # pragma: no cover
"""Warning:
This function is legacy code directly adapted from pymatgen.ext.matproj. It is
not broadly useful or applicable to other databases. It is only used in jobs
interfaced directly with internal databases at the Materials Project. This code
is not adequately tested and may not work as expected.
Get a list of ComputedEntries or ComputedStructureEntries corresponding to a
chemical system, formula, or materials_id or full criteria.
Args:
db: MongoStore object with database connection
chemsys_formula_id_criteria: A chemical system
(e.g., Li-Fe-O), or formula (e.g., Fe2O3) or materials_id (e.g., mp-1234) or
full Mongo-style dict criteria.
compatible_only: Whether to return only "compatible"
entries. Compatible entries are entries that have been processed using the
MaterialsProjectCompatibility class, which performs adjustments to allow
mixing of GGA and GGA+U calculations for more accurate phase diagrams and
reaction energies.
inc_structure: If None, entries returned are
ComputedEntries. If inc_structure="initial", ComputedStructureEntries with
initial structures are returned. Otherwise, ComputedStructureEntries with
final structures are returned.
property_data: Specify additional properties to include in
entry.data. If None, no data. Should be a subset of supported_properties.
use_premade_entries: Whether to use entry objects that have already been
constructed. Defaults to False.
conventional_unit_cell: Whether to get the standard
conventional unit cell
sort_by_e_above_hull: Whether to sort the list of entries by
e_above_hull (will query e_above_hull as a property_data if True).
Returns:
List of ComputedEntry or ComputedStructureEntry objects.
"""
params = [
"deprecated",
"run_type",
"is_hubbard",
"pseudo_potential",
"hubbards",
"potcar_symbols",
"oxide_type",
]
props = ["final_energy", "unit_cell_formula", "task_id", *params]
if sort_by_e_above_hull:
if property_data and "e_above_hull" not in property_data:
property_data.append("e_above_hull")
elif not property_data:
property_data = ["e_above_hull"]
if property_data:
props += property_data
if inc_structure:
if inc_structure == "initial":
props.append("initial_structure")
else:
props.append("structure")
if not isinstance(chemsys_formula_id_criteria, dict):
criteria = parse_criteria(chemsys_formula_id_criteria)
else:
criteria = chemsys_formula_id_criteria
if use_premade_entries:
props = ["entries", "deprecated"]
entries = []
for d in db.query(criteria, props):
if d.get("deprecated"):
continue
if use_premade_entries:
ent = d["entries"]
if ent.get("GGA"):
e = ComputedStructureEntry.from_dict(ent["GGA"])
elif ent.get("GGA+U"):
e = ComputedStructureEntry.from_dict(ent["GGA+U"])
else:
print(f"Missing entry for {d['_id']}")
continue
else:
d["potcar_symbols"] = [
f"{d['pseudo_potential']['functional']} {label}" for label in d["pseudo_potential"].get("labels", [])
]
data = {"oxide_type": d["oxide_type"]}
if property_data:
data.update({k: d[k] for k in property_data})
if not inc_structure:
e = ComputedEntry(
d["unit_cell_formula"],
d["final_energy"],
parameters={k: d[k] for k in params},
data=data,
entry_id=d["task_id"],
)
else:
prim = Structure.from_dict(d["initial_structure"] if inc_structure == "initial" else d["structure"])
if conventional_unit_cell:
s = SpacegroupAnalyzer(prim).get_conventional_standard_structure()
energy = d["final_energy"] * (len(s) / len(prim))
else:
s = prim.copy()
energy = d["final_energy"]
e = ComputedStructureEntry(
s,
energy,
parameters={k: d[k] for k in params},
data=data,
entry_id=d["task_id"],
)
entries.append(e)
if compatible_only:
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="Failed to guess oxidation states.*")
entries = MaterialsProject2020Compatibility().process_entries(entries, clean=True)
if sort_by_e_above_hull:
entries = sorted(entries, key=lambda entry: entry.data["e_above_hull"])
return entries
[docs]
def get_all_entries_in_chemsys(
db: MongoStore,
elements: str | list[str],
compatible_only: bool = True,
inc_structure: str | None = "final",
property_data: list | None = None,
use_premade_entries: bool = False,
conventional_unit_cell: bool = False,
n: int = 1000,
) -> list[ComputedEntry]: # pragma: no cover
"""Warning:
This function is legacy code directly adapted from pymatgen.ext.matproj. It is
not broadly useful or applicable to other databases. It is only used in jobs
interfaced directly with internal databases at the Materials Project. This code
is not adequately tested and may not work as expected.
Helper method for getting all entries in a total chemical system by querying
database for all sub-chemical systems. Code adadpted from pymatgen.ext.matproj and
modified to support very large chemical systems.
Args:
db: MongoStore object with database connection
elements (str or [str]): Chemical system string comprising element
symbols separated by dashes, e.g., "Li-Fe-O" or List of element symbols,
e.g., ["Li", "Fe", "O"].
compatible_only (bool): Whether to return only "compatible"
entries. Compatible entries are entries that have been processed using the
MaterialsProjectCompatibility class, which performs adjustments to allow
mixing of GGA and GGA+U calculations for more accurate phase diagrams and
reaction energies.
inc_structure (str): If None, entries returned are
ComputedEntries. If inc_structure="final", ComputedStructureEntries with
final structures are returned. Otherwise, ComputedStructureEntries with
initial structures are returned.
property_data (list): Specify additional properties to include in
entry.data. If None, no data. Should be a subset of
supported_properties.
use_premade_entries: Whether to use entry objects that have already been
constructed. Defaults to False.
conventional_unit_cell (bool): Whether to get the standard
conventional unit cell
n (int): Chunk size, i.e., number of sub-chemical systems to consider
Returns:
List of ComputedEntries.
"""
def divide_chunks(my_list, n):
for i in range(0, len(my_list), n):
yield my_list[i : i + n]
if isinstance(elements, str):
elements = elements.split("-")
if len(elements) < 13:
all_chemsyses = []
for i in range(len(elements)):
for els in itertools.combinations(elements, i + 1):
all_chemsyses.append("-".join(sorted(els)))
all_chemsyses = list(divide_chunks(all_chemsyses, n))
entries = []
for chemsys_group in all_chemsyses:
entries.extend(
get_entries(
db,
{"chemsys": {"$in": chemsys_group}},
compatible_only=compatible_only,
inc_structure=inc_structure,
property_data=property_data,
use_premade_entries=use_premade_entries,
conventional_unit_cell=conventional_unit_cell,
)
)
else: # for very large chemical systems, use a different approach
entries = get_entries(
db,
{"elements": {"$not": {"$elemMatch": {"$nin": elements}}}},
compatible_only=compatible_only,
inc_structure=inc_structure,
property_data=property_data,
use_premade_entries=use_premade_entries,
conventional_unit_cell=conventional_unit_cell,
)
return entries
[docs]
def parse_criteria(criteria_string): # pragma: no cover
"""Parses a powerful and simple string criteria and generates a proper
mongo syntax criteria.
Args:
criteria_string (str): A string representing a search criteria.
Also supports wild cards. E.g.,
something like "*2O" gets converted to
{'pretty_formula': {'$in': [u'B2O', u'Xe2O', u"Li2O", ...]}}
Other syntax examples:
mp-1234: Interpreted as a Materials ID.
Fe2O3 or *2O3: Interpreted as reduced formulas.
Li-Fe-O or *-Fe-O: Interpreted as chemical systems.
You can mix and match with spaces, which are interpreted as
"OR". E.g., "mp-1234 FeO" means query for all compounds with
reduced formula FeO or with materials_id mp-1234.
Returns:
A mongo query dict.
"""
toks = criteria_string.split()
def parse_sym(sym):
if sym == "*":
return [el.symbol for el in Element]
m = re.match(r"\{(.*)\}", sym)
if m:
return [s.strip() for s in m.group(1).split(",")]
return [sym]
def parse_tok(t):
if re.match(r"\w+-\d+", t):
return {"task_id": t}
if "-" in t:
elements = [parse_sym(sym) for sym in t.split("-")]
chemsyss = []
for cs in itertools.product(*elements):
if len(set(cs)) == len(cs):
# Check for valid symbols
cs = [Element(s).symbol for s in cs]
chemsyss.append("-".join(sorted(cs)))
return {"chemsys": {"$in": chemsyss}}
all_formulas = set()
explicit_els = []
wild_card_els = []
for sym in re.findall(r"(\*[\.\d]*|\{.*\}[\.\d]*|[A-Z][a-z]*)[\.\d]*", t):
if ("*" in sym) or ("{" in sym):
wild_card_els.append(sym)
else:
m = re.match(r"([A-Z][a-z]*)[\.\d]*", sym)
explicit_els.append(m.group(1))
nelements = len(wild_card_els) + len(set(explicit_els))
parts = re.split(r"(\*|\{.*\})", t)
parts = [parse_sym(s) for s in parts if s != ""]
for f in itertools.product(*parts):
c = Composition("".join(f))
if len(c) == nelements:
# Check for valid Elements in keys.
for e in c:
Element(e.symbol)
all_formulas.add(c.reduced_formula)
return {"pretty_formula": {"$in": list(all_formulas)}}
if len(toks) == 1:
return parse_tok(toks[0])
return {"$or": list(map(parse_tok, toks))}