Source code for rxn_network.entries.utils

"""Utility functions for acquiring, processing, or modifiying entries."""

from __future__ import annotations

import itertools
import re
import warnings
from copy import deepcopy
from typing import TYPE_CHECKING

from pymatgen.core.composition import Element
from pymatgen.core.structure import Structure
from pymatgen.entries.compatibility import MaterialsProject2020Compatibility
from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer

from rxn_network.core import Composition
from rxn_network.entries.entry_set import GibbsEntrySet
from rxn_network.utils.funcs import get_logger

if TYPE_CHECKING:
    from collections.abc import Iterable

    from maggma.stores import MongoStore

logger = get_logger(__name__)


[docs] def process_entries( entries: Iterable[ComputedStructureEntry], temperature: float, e_above_hull: float, filter_at_temperature: int | None = None, include_nist_data: bool = True, include_freed_data: bool = False, include_polymorphs: bool = False, formulas_to_include: Iterable[str] | None = None, calculate_e_above_hulls: bool = False, ignore_nist_solids: bool = True, ) -> GibbsEntrySet: """Convenience function for processing a set of ComputedStructureEntry objects into a GibbsEntrySet with specified parameters. This is used when building entries in most of the jobs/flows. Args: entries: Iterable of ComputedStructureEntry objects. These can be downloaded from The Materials Project API or created manually with pymatgen. temperature: Temperature [K] for determining Gibbs Free Energy of formation, dGf(T). e_above_hull: Energy above hull (eV/atom) for thermodynamic stability threshold; i.e., include all entries with energies below this value. filter_at_temperature: Temperature (in Kelvin) at which entries are filtered for thermodynamic stability (e.g., room temperature). Generally, this often differs from the synthesis temperature. include_nist_data: Whether to include NIST-JANAF data in the entry set. Defaults to True. include_freed_data: Whether to include FREED data in the entry set. Defaults to False. WARNING: This dataset has not been thoroughly tested. Use at your own risk! include_polymorphs: Whether to include non-ground state polymorphs in the entry set. Defaults to False. formulas_to_include: An iterable of compositional formulas to ensure are included in the processed dataset. Sometimes, entries are filtered out that one would like to include, or entries don't exist for those compositions. calculate_e_above_hulls: Whether to calculate e_above_hull and store as an attribute in the data dictionary for each entry. ignore_nist_solids: Whether to ignore NIST data for solids with high melting points (Tm >= 1500 ÂșC). Defaults to True. Returns: A GibbsEntrySet object containing entry objects with the user-specified constraints. """ temp = temperature if filter_at_temperature: temp = filter_at_temperature entry_set = GibbsEntrySet.from_computed_entries( entries=entries, temperature=temp, include_nist_data=include_nist_data, include_freed_data=include_freed_data, ignore_nist_solids=ignore_nist_solids, ) included_entries = [initialize_entry(f, entry_set) for f in formulas_to_include] if formulas_to_include else [] entry_set = entry_set.filter_by_stability(e_above_hull=e_above_hull, include_polymorphs=include_polymorphs) entry_set.update(included_entries) # make sure these aren't filtered out if filter_at_temperature and (filter_at_temperature != temperature): entry_set = entry_set.get_entries_with_new_temperature(temperature) if calculate_e_above_hulls: entry_set = GibbsEntrySet(deepcopy(entry_set), calculate_e_above_hulls=True) return entry_set
[docs] def initialize_entry(formula: str, entry_set: GibbsEntrySet, stabilize: bool = False): """Acquire an entry by user-specified formula. This method attemps to first get the entry; if it is not included in the set, it will create an interpolated entry. Finally, if stabilize=True, the energy will be lowered until it appears on teh hull. Args: formula: Chemical formula entry_set: Set of entries stabilize: Whether or not to stabilize the entry by decreasing its energy such that it is 'on the hull'. """ try: entry = entry_set.get_min_entry_by_formula(formula) except KeyError: entry = entry_set.get_interpolated_entry(formula) warnings.warn(f"Using interpolated entry for {entry.composition.reduced_formula}") if stabilize: entry = entry_set.get_stabilized_entry(entry) return entry
[docs] def get_entries( db: MongoStore, chemsys_formula_id_criteria: str | dict, compatible_only: bool = True, inc_structure: str | None = None, property_data: list[str] | None = None, use_premade_entries: bool = False, conventional_unit_cell: bool = False, sort_by_e_above_hull: bool = False, ): # pragma: no cover """Warning: This function is legacy code directly adapted from pymatgen.ext.matproj. It is not broadly useful or applicable to other databases. It is only used in jobs interfaced directly with internal databases at the Materials Project. This code is not adequately tested and may not work as expected. Get a list of ComputedEntries or ComputedStructureEntries corresponding to a chemical system, formula, or materials_id or full criteria. Args: db: MongoStore object with database connection chemsys_formula_id_criteria: A chemical system (e.g., Li-Fe-O), or formula (e.g., Fe2O3) or materials_id (e.g., mp-1234) or full Mongo-style dict criteria. compatible_only: Whether to return only "compatible" entries. Compatible entries are entries that have been processed using the MaterialsProjectCompatibility class, which performs adjustments to allow mixing of GGA and GGA+U calculations for more accurate phase diagrams and reaction energies. inc_structure: If None, entries returned are ComputedEntries. If inc_structure="initial", ComputedStructureEntries with initial structures are returned. Otherwise, ComputedStructureEntries with final structures are returned. property_data: Specify additional properties to include in entry.data. If None, no data. Should be a subset of supported_properties. use_premade_entries: Whether to use entry objects that have already been constructed. Defaults to False. conventional_unit_cell: Whether to get the standard conventional unit cell sort_by_e_above_hull: Whether to sort the list of entries by e_above_hull (will query e_above_hull as a property_data if True). Returns: List of ComputedEntry or ComputedStructureEntry objects. """ params = [ "deprecated", "run_type", "is_hubbard", "pseudo_potential", "hubbards", "potcar_symbols", "oxide_type", ] props = ["final_energy", "unit_cell_formula", "task_id", *params] if sort_by_e_above_hull: if property_data and "e_above_hull" not in property_data: property_data.append("e_above_hull") elif not property_data: property_data = ["e_above_hull"] if property_data: props += property_data if inc_structure: if inc_structure == "initial": props.append("initial_structure") else: props.append("structure") if not isinstance(chemsys_formula_id_criteria, dict): criteria = parse_criteria(chemsys_formula_id_criteria) else: criteria = chemsys_formula_id_criteria if use_premade_entries: props = ["entries", "deprecated"] entries = [] for d in db.query(criteria, props): if d.get("deprecated"): continue if use_premade_entries: ent = d["entries"] if ent.get("GGA"): e = ComputedStructureEntry.from_dict(ent["GGA"]) elif ent.get("GGA+U"): e = ComputedStructureEntry.from_dict(ent["GGA+U"]) else: print(f"Missing entry for {d['_id']}") continue else: d["potcar_symbols"] = [ f"{d['pseudo_potential']['functional']} {label}" for label in d["pseudo_potential"].get("labels", []) ] data = {"oxide_type": d["oxide_type"]} if property_data: data.update({k: d[k] for k in property_data}) if not inc_structure: e = ComputedEntry( d["unit_cell_formula"], d["final_energy"], parameters={k: d[k] for k in params}, data=data, entry_id=d["task_id"], ) else: prim = Structure.from_dict(d["initial_structure"] if inc_structure == "initial" else d["structure"]) if conventional_unit_cell: s = SpacegroupAnalyzer(prim).get_conventional_standard_structure() energy = d["final_energy"] * (len(s) / len(prim)) else: s = prim.copy() energy = d["final_energy"] e = ComputedStructureEntry( s, energy, parameters={k: d[k] for k in params}, data=data, entry_id=d["task_id"], ) entries.append(e) if compatible_only: with warnings.catch_warnings(): warnings.filterwarnings("ignore", message="Failed to guess oxidation states.*") entries = MaterialsProject2020Compatibility().process_entries(entries, clean=True) if sort_by_e_above_hull: entries = sorted(entries, key=lambda entry: entry.data["e_above_hull"]) return entries
[docs] def get_all_entries_in_chemsys( db: MongoStore, elements: str | list[str], compatible_only: bool = True, inc_structure: str | None = "final", property_data: list | None = None, use_premade_entries: bool = False, conventional_unit_cell: bool = False, n: int = 1000, ) -> list[ComputedEntry]: # pragma: no cover """Warning: This function is legacy code directly adapted from pymatgen.ext.matproj. It is not broadly useful or applicable to other databases. It is only used in jobs interfaced directly with internal databases at the Materials Project. This code is not adequately tested and may not work as expected. Helper method for getting all entries in a total chemical system by querying database for all sub-chemical systems. Code adadpted from pymatgen.ext.matproj and modified to support very large chemical systems. Args: db: MongoStore object with database connection elements (str or [str]): Chemical system string comprising element symbols separated by dashes, e.g., "Li-Fe-O" or List of element symbols, e.g., ["Li", "Fe", "O"]. compatible_only (bool): Whether to return only "compatible" entries. Compatible entries are entries that have been processed using the MaterialsProjectCompatibility class, which performs adjustments to allow mixing of GGA and GGA+U calculations for more accurate phase diagrams and reaction energies. inc_structure (str): If None, entries returned are ComputedEntries. If inc_structure="final", ComputedStructureEntries with final structures are returned. Otherwise, ComputedStructureEntries with initial structures are returned. property_data (list): Specify additional properties to include in entry.data. If None, no data. Should be a subset of supported_properties. use_premade_entries: Whether to use entry objects that have already been constructed. Defaults to False. conventional_unit_cell (bool): Whether to get the standard conventional unit cell n (int): Chunk size, i.e., number of sub-chemical systems to consider Returns: List of ComputedEntries. """ def divide_chunks(my_list, n): for i in range(0, len(my_list), n): yield my_list[i : i + n] if isinstance(elements, str): elements = elements.split("-") if len(elements) < 13: all_chemsyses = [] for i in range(len(elements)): for els in itertools.combinations(elements, i + 1): all_chemsyses.append("-".join(sorted(els))) all_chemsyses = list(divide_chunks(all_chemsyses, n)) entries = [] for chemsys_group in all_chemsyses: entries.extend( get_entries( db, {"chemsys": {"$in": chemsys_group}}, compatible_only=compatible_only, inc_structure=inc_structure, property_data=property_data, use_premade_entries=use_premade_entries, conventional_unit_cell=conventional_unit_cell, ) ) else: # for very large chemical systems, use a different approach entries = get_entries( db, {"elements": {"$not": {"$elemMatch": {"$nin": elements}}}}, compatible_only=compatible_only, inc_structure=inc_structure, property_data=property_data, use_premade_entries=use_premade_entries, conventional_unit_cell=conventional_unit_cell, ) return entries
[docs] def parse_criteria(criteria_string): # pragma: no cover """Parses a powerful and simple string criteria and generates a proper mongo syntax criteria. Args: criteria_string (str): A string representing a search criteria. Also supports wild cards. E.g., something like "*2O" gets converted to {'pretty_formula': {'$in': [u'B2O', u'Xe2O', u"Li2O", ...]}} Other syntax examples: mp-1234: Interpreted as a Materials ID. Fe2O3 or *2O3: Interpreted as reduced formulas. Li-Fe-O or *-Fe-O: Interpreted as chemical systems. You can mix and match with spaces, which are interpreted as "OR". E.g., "mp-1234 FeO" means query for all compounds with reduced formula FeO or with materials_id mp-1234. Returns: A mongo query dict. """ toks = criteria_string.split() def parse_sym(sym): if sym == "*": return [el.symbol for el in Element] m = re.match(r"\{(.*)\}", sym) if m: return [s.strip() for s in m.group(1).split(",")] return [sym] def parse_tok(t): if re.match(r"\w+-\d+", t): return {"task_id": t} if "-" in t: elements = [parse_sym(sym) for sym in t.split("-")] chemsyss = [] for cs in itertools.product(*elements): if len(set(cs)) == len(cs): # Check for valid symbols cs = [Element(s).symbol for s in cs] chemsyss.append("-".join(sorted(cs))) return {"chemsys": {"$in": chemsyss}} all_formulas = set() explicit_els = [] wild_card_els = [] for sym in re.findall(r"(\*[\.\d]*|\{.*\}[\.\d]*|[A-Z][a-z]*)[\.\d]*", t): if ("*" in sym) or ("{" in sym): wild_card_els.append(sym) else: m = re.match(r"([A-Z][a-z]*)[\.\d]*", sym) explicit_els.append(m.group(1)) nelements = len(wild_card_els) + len(set(explicit_els)) parts = re.split(r"(\*|\{.*\})", t) parts = [parse_sym(s) for s in parts if s != ""] for f in itertools.product(*parts): c = Composition("".join(f)) if len(c) == nelements: # Check for valid Elements in keys. for e in c: Element(e.symbol) all_formulas.add(c.reduced_formula) return {"pretty_formula": {"$in": list(all_formulas)}} if len(toks) == 1: return parse_tok(toks[0]) return {"$or": list(map(parse_tok, toks))}