Source code for atomate2.common.jobs.mpmorph

"""Define utility functions for amorphous structure equilibration.

This file generalizes the MPMorph workflows of
https://github.com/materialsproject/mpmorph
originally written in atomate for VASP only to a more general
code agnostic form.

For information about the current flows, contact:
- Bryant Li (@BryantLi-BLI)
- Aaron Kaplan (@esoteric-ephemera)
- Max Gallant (@mcgalcode)
"""

from __future__ import annotations

import os
from pathlib import Path
from tempfile import TemporaryDirectory

import numpy as np
import pandas as pd
from jobflow import Job
from pymatgen.core import Composition, Molecule, Structure
from pymatgen.io.packmol import PackmolBoxGen

_DEFAULT_AVG_VOL_FILE = Path("~/.cache/atomate2").expanduser() / "db_avg_vols.json.gz"
if not _DEFAULT_AVG_VOL_FILE.parents[0].exists():
    os.makedirs(_DEFAULT_AVG_VOL_FILE.parents[0], exist_ok=True)
_DEFAULT_AVG_VOL_URL = "https://figshare.com/ndownloader/files/49704288"


def _get_average_volumes_file(
    chunk_size: int = 2048, timeout: float = 60
) -> pd.DataFrame:
    """
    Retrieve stored average volume data from figshare if needed.

    Parameters
    ----------
    chunk_size : int = 2048
        Chunk size for downloading from figshare
    timeout : float = 60
        Timeout time in seconds to wait for the request to resolve
    """
    if not _DEFAULT_AVG_VOL_FILE.exists():
        import requests  # type: ignore[import-untyped]

        stream_data = requests.get(_DEFAULT_AVG_VOL_URL, stream=True, timeout=timeout)
        with open(str(_DEFAULT_AVG_VOL_FILE), "wb") as file:
            for chunk in stream_data.iter_content(chunk_size=chunk_size):
                file.write(chunk)

    return pd.read_json(_DEFAULT_AVG_VOL_FILE, orient="split")



[docs]
def get_average_volume_from_mp_api(
    composition: Composition, mp_api_key: str | None = None
) -> float:
    """
    Get the average volume per atom for a given composition from the Materials Project.

    This function will make API calls to the Materials Project.
    Check Materials Project API documentation for more
    information https://next-gen.materialsproject.org/api.

    Parameters
    ----------
    composition : Composition
        The target composition.
    mp_api_key : str or None
        The user's MP API key.

    Returns
    -------
    float
        The average volume per atom for the composition in Angstrom^3.
    """
    from mp_api.client import MPRester

    with MPRester(api_key=mp_api_key) as mpr:
        comp_entries = mpr.get_entries(composition.reduced_formula, inc_structure=True)

    vols = None

    if len(comp_entries) > 0:
        vols = [
            entry.structure.volume / entry.structure.num_sites for entry in comp_entries
        ]

    else:
        # Find all Materials project entries containing the elements in the
        # desired composition to estimate starting volume.
        with MPRester() as mpr:
            _entries = mpr.get_entries_in_chemsys(
                [str(el) for el in composition.elements], inc_structure=True
            )

        # Only take entries with at least two elements in common with target composition
        entries = [
            entry
            for entry in _entries
            if len(set(composition).intersection(set(entry.structure.composition))) > 1
        ]

        vols = [entry.structure.volume / entry.structure.num_sites for entry in entries]

    return np.mean(vols)




[docs]
def get_average_volume_from_db_cached(
    composition: Composition,
    db_name: str,
    cache_file: pd.DataFrame | None = None,
    ignore_oxi_states: bool = True,
) -> float:
    """
    Get the average volume per atom for a given composition from cached data.

    This function uses cached data to accelerate the volume/atom search.

    Parameters
    ----------
    composition : Composition
        The target composition.
    db_name : str
        Name of the database to pull data from.
    cache_file : pandas DataFrame or None (default)
        DataFrame containing cached volumes.
        Should match the format of the data in _DEFAULT_AVG_VOL_FILE,
        and have the following columns:
            "chem_env", "avg_vol", "count", "with_oxi", "source"
    ignore_oxi_states : bool = True
        Whether to ignore oxidation state data.

    Returns
    -------
    float
        The average volume per atom for the composition.
    """
    avg_vols = cache_file or _get_average_volumes_file()

    avg_vols = avg_vols[avg_vols["source"] == db_name]
    return get_average_volume_from_database(
        composition,
        avg_vols=avg_vols,
        ignore_oxi_states=ignore_oxi_states,
    )




[docs]
def get_average_volume_from_mp(
    composition: Composition, use_cached: bool = True, **kwargs
) -> float:
    """
    Get the average volume per atom for a given composition from MP data.

    This function will either make MP API calls or used cached data for
    the search.

    Parameters
    ----------
    composition : Composition
        The target composition.
    use_cached : bool = True
        Whether to use cached MP data (True) or make calls to the MP API (False)
    **kwargs : kwargs to pass to the volume/atom search functions, see
        `get_average_volume_from_db_cached`,
        `get_average_volume_from_mp_api`
        for specific kwargs.

    Returns
    -------
    float
        The average volume per atom for the composition.
    """
    if use_cached:
        return get_average_volume_from_db_cached(composition, db_name="mp", **kwargs)
    return get_average_volume_from_mp_api(composition, **kwargs)



def _get_chem_env_key_from_composition(
    composition: Composition, ignore_oxi_states: bool = True
) -> str:
    """
    Get chemical environment as a string for ICSD avg volume determination.

    Parameters
    ----------
    composition : .Composition
        Structure composition
    ignore_oxi_states : bool = True
        Whether to ignore oxidation states assigned to sites in the structure,
        both in the input composition and ICSD structures.

        Note that 0+ / 0- oxidation states are treated identically even
        when ignore_oxi_states = False.

    Returns
    -------
    Chemical environment returned as a dunder-separated string,
    such as "Ag+__Cu2+__N5+__O2-"
    """
    comp = composition
    if ignore_oxi_states:
        comp = comp.remove_charges()
    chem_env = "__".join(sorted(set(comp.as_dict())))
    for char in ["+", "-"]:
        chem_env = chem_env.replace(f"0{char}", "")
    return chem_env



[docs]
def get_average_volume_from_database(
    composition: Composition,
    avg_vols: pd.DataFrame,
    ignore_oxi_states: bool = True,
) -> float:
    """
    Get average volume for a chemical environment from ICSD data.

    The ICSD data is for "reasonable", ordered, experimental inorganic solids.

    Parameters
    ----------
    composition : .Composition
        Structure composition
    avg_vols : pandas .DataFrame
        Chemical environment data for a given database.
        Should have the following columns:
            "chem_env", "avg_vol", "count", "with_oxi"
    ignore_oxi_states : bool = True
        Whether to ignore oxidation states assigned to sites in the structure,
        both in the input composition and ICSD structures.

        Note that 0+ / 0- oxidation states are treated identically even
        when ignore_oxi_states = False.

    Returns
    -------
    Average volume as a float
    """
    from itertools import combinations

    def get_entry_from_dict(chem_env: str) -> dict | None:
        data = avg_vols[avg_vols["chem_env"] == chem_env]
        data = data[
            data["with_oxi"]
            if (not ignore_oxi_states and len(data[data["with_oxi"]]) > 0)
            else ~data["with_oxi"]
        ]
        if len(data) > 0:
            return {k: data[k].squeeze() for k in ("avg_vol", "count")}
        return None

    chem_env_key = _get_chem_env_key_from_composition(
        composition, ignore_oxi_states=ignore_oxi_states
    )
    if (avg_vol := get_entry_from_dict(chem_env_key)) is not None:
        return avg_vol["avg_vol"]

    vols = []
    counts = 0
    for ielt in range(2, len(composition)):
        for combo in combinations(composition, ielt):
            chem_env_key = _get_chem_env_key_from_composition(
                Composition({spec: 1 for spec in combo}),
                ignore_oxi_states=ignore_oxi_states,
            )

            if (avg_vol := get_entry_from_dict(chem_env_key)) is not None:
                vols.append(avg_vol["avg_vol"] * avg_vol["count"])
                counts += avg_vol["count"]

    return sum(vols) / counts




[docs]
def get_random_packed_structure(
    composition: Composition | str,
    target_atoms: int = 100,
    vol_multiply: float = 1.0,
    tol: float = 2.0,
    return_as_job: bool = False,
    vol_per_atom_source: float | str = "mp",
    db_kwargs: dict | None = None,
    packmol_seed: int = 1,
    packmol_output_dir: str | Path | None = None,
) -> Structure | Job:
    """
    Generate a random packed structure with a target number of atoms.

    Designed to make amorphous/glassy structures.
    Defaults to using cached MP data.

    Parameters
    ----------
    composition : Composition | str
        The composition of the target structure.
    target_atoms : int
        The target number of atoms in the structure.
    vol_multiply : float
        The factor to multiply the structure volume by.
    tol : float
        The tolerance to apply to the box size.
    return_as_job : bool
        Whether to return the structure as a jobflow job object.
    vol_per_atom_source : float | str
        If float - the volume per atom used to generate lattice size
        If str - "mp" to use the Materials Project API to estimate volume per atom.
        If str - "icsd" to use the ICSD database to estimate volume per atom.
    db_kwargs : dict | None = None
        kwargs to pass to the volume-determining function.
    packmol_seed : int
        The seed to use for the packmol random number generator.
    packmol_output_dir : str | Path | None
        The directory to output the packmol files to. If None, a
        temporary directory is used and will be removed after.

    Returns
    -------
    Structure | Job
        The random packed structure.
    """
    if return_as_job:
        return Job(
            get_random_packed_structure,
            function_kwargs={
                "composition": composition,
                "target_atoms": target_atoms,
                "vol_multiply": vol_multiply,
                "tol": tol,
                "return_as_job": False,
                "vol_per_atom_source": vol_per_atom_source,
                "packmol_seed": packmol_seed,
            },
        )
    if isinstance(composition, str | dict):
        composition = Composition(composition)

    struct_db = (
        vol_per_atom_source.lower() if isinstance(vol_per_atom_source, str) else None
    )
    db_kwargs = db_kwargs or ({"use_cached": True} if struct_db == "mp" else {})

    if isinstance(vol_per_atom_source, float | int):
        vol_per_atom = vol_per_atom_source

    elif struct_db == "mp":
        vol_per_atom = get_average_volume_from_mp(composition, **db_kwargs)

    elif struct_db == "icsd":
        vol_per_atom = get_average_volume_from_db_cached(
            composition, db_name="icsd", **db_kwargs
        )

    else:
        raise ValueError(f"Unknown volume per atom source: {vol_per_atom_source}.")

    formula, _ = composition.get_integer_formula_and_factor()
    integer_composition = Composition(formula)
    full_cell_composition = integer_composition * np.ceil(
        target_atoms / integer_composition.num_atoms
    )

    supercell_composition = {
        str(el): int(full_cell_composition.element_composition.get(el))
        for el in full_cell_composition
    }

    with TemporaryDirectory() as tmpdir:
        molecules = []
        for element, num_sites in supercell_composition.items():
            xyz_file = f"{tmpdir}/{element}.xyz"
            with open(xyz_file, "w+") as f:
                f.write("1\ncomment\n" + element + " 0.0 0.0 0.0\n")
            molecules.append({"name": element, "number": num_sites, "coords": xyz_file})

        box_scale = (vol_per_atom * full_cell_composition.num_atoms * vol_multiply) ** (
            1 / 3
        )
        box_lower_bound = tol / 2
        box_upper_bound = box_scale - tol / 2

        box_size = 3 * [box_lower_bound] + 3 * [box_upper_bound]

        packmol_set = PackmolBoxGen(seed=packmol_seed).get_input_set(
            molecules=molecules, box=box_size
        )
        packmol_output_dir = str(packmol_output_dir or tmpdir)
        packmol_set.write_input(directory=packmol_output_dir)
        packmol_set.run(path=packmol_output_dir)

        mol = Molecule.from_file(f"{packmol_output_dir}/packmol_out.xyz")

    return Structure(
        [[box_scale if i == j else 0.0 for j in range(3)] for i in range(3)],
        species=mol.species,
        coords=mol.cart_coords,
        coords_are_cartesian=True,
    )