"""Core definition of a cclib-generated task document."""
import logging
import os
from pathlib import Path
from typing import Any, Union
from emmet.core.structure import MoleculeMetadata
from monty.dev import requires
from monty.json import jsanitize
from pydantic import Field
from pymatgen.core import Molecule
from pymatgen.core.periodic_table import Element
from typing_extensions import Self
from atomate2 import __version__
from atomate2.utils.datetime import datetime_str
from atomate2.utils.path import find_recent_logfile, get_uri
try:
import cclib
except ImportError:
cclib = None
logger = logging.getLogger(__name__)
[docs]
class TaskDocument(MoleculeMetadata, extra="allow"): # type: ignore[call-arg]
"""
Definition of a cclib-generated task document.
This can be used as a general task document for molecular DFT codes.
For the list of supported packages, see https://cclib.github.io
"""
molecule: Molecule | None = Field(
None, description="Final output molecule from the task"
)
energy: float | None = Field(None, description="Final total energy")
dir_name: str | None = Field(
None, description="Directory where the output is parsed"
)
logfile: str | None = Field(
None, description="Path to the log file used in the post-processing analysis"
)
attributes: dict | None = Field(
None, description="Computed properties and calculation outputs"
)
metadata: dict | None = Field(
None,
description="Calculation metadata, including input parameters and runtime "
"statistics",
)
task_label: str | None = Field(None, description="A description of the task")
tags: list[str] | None = Field(
None, description="Optional tags for this task document"
)
last_updated: str = Field(
default_factory=datetime_str,
description="Timestamp for this task document was last updated",
)
schema: str = Field(
__version__, description="Version of atomate2 used to create the document"
)
[docs]
@classmethod
@requires(cclib, "The cclib TaskDocument requires cclib to be installed.")
def from_logfile(
cls,
dir_name: Union[str, Path],
logfile_extensions: Union[str, list[str]],
store_trajectory: bool = False,
additional_fields: dict[str, Any] | None = None,
analysis: Union[str, list[str]] | None = None,
proatom_dir: Union[Path, str] | None = None,
) -> Self:
"""Create a TaskDocument from a log file.
For a full description of each field, see https://cclib.github.io/data.html.
Parameters
----------
dir_name
The path to the folder containing the calculation outputs.
logfile_extensions
Possible extensions of the log file (e.g. ".log", ".out", ".txt", ".chk").
Note that only a partial match is needed. For instance, `.log` will match
`.log.gz` and `.log.1.gz`. If multiple files with this extension are found,
the one with the most recent change time will be used. For an exact match
only, put in the full file name.
store_trajectory
Whether to store the molecule objects along the course of the relaxation
trajectory.
additional_fields
Dictionary of additional fields to add to TaskDocument.
analysis
The name(s) of any cclib post-processing analysis to run. Note that for
bader, ddec6, and hirshfeld, a cube file (.cube, .cub) must be in dir_name.
Supports: cpsa, mpa, lpa, bickelhaupt, density, mbo, bader, ddec6,
hirshfeld.
proatom_dir
The path to the proatom directory if ddec6 or hirshfeld analysis are
requested. See https://cclib.github.io/methods.html for details. If None,
the PROATOM_DIR environment variable must point to the proatom directory.
Returns
-------
TaskDocument
A TaskDocument object summarizing the inputs/outputs of the log file.
"""
from cclib.io import ccread
logger.info(
f"Searching for most recent log file with extensions {logfile_extensions}"
)
# Find the most recent log file with the given extension in the
# specified directory.
logfile = find_recent_logfile(dir_name, logfile_extensions)
if not logfile:
raise FileNotFoundError(
f"Could not find file with extension {logfile_extensions} in {dir_name}"
)
logger.info(f"Getting task doc from {logfile}")
additional_fields = additional_fields or {}
# Let's parse the log file with cclib
cclib_obj = ccread(logfile, logging.ERROR)
if not cclib_obj:
raise ValueError(f"Could not parse {logfile}")
# Fetch all the attributes (i.e. all input/outputs from cclib)
attributes = jsanitize(cclib_obj.getattributes())
# Store charge and multiplicity since we use it frequently
charge = cclib_obj.charge
mult = cclib_obj.mult
# Let's move the metadata out of attributes for convenience and
# store it separately
attributes.pop("metadata")
metadata = jsanitize(cclib_obj.metadata)
# monty datetime bug workaround: github.com/materialsvirtuallab/monty/issues/275
if wall_time := metadata.get("wall_time"):
metadata["wall_time"] = [*map(str, wall_time)]
if cpu_time := metadata.get("cpu_time"):
metadata["cpu_time"] = [*map(str, cpu_time)]
# Get the final energy to store as its own key/value pair
energy = (
cclib_obj.scfenergies[-1] if cclib_obj.scfenergies is not None else None
)
# Now we construct the input molecule. Note that this is not necessarily
# the same as the initial molecule from the relaxation because the
# DFT package may have re-oriented the system. We only try to store
# the input if it is XYZ-formatted though since the Molecule object
# does not support internal coordinates or Gaussian Z-matrix.
if (
cclib_obj.metadata.get("coord_type") == "xyz"
and cclib_obj.metadata.get("coords") is not None
):
coords_obj = cclib_obj.metadata["coords"]
input_species = [Element(row[0]) for row in coords_obj]
input_coords = [row[1:] for row in coords_obj]
input_molecule = Molecule(
input_species,
input_coords,
charge=charge,
spin_multiplicity=mult,
)
attributes["molecule_unoriented"] = input_molecule
# These are duplicates of things made with MoleculeMetadata, so we
# can just remove them here
duplicates = ["atomnos", "atomcoords", "charge", "mult", "natom"]
for duplicate in duplicates:
attributes.pop(duplicate, None)
# We will remove duplicates in the metadata too
metadata_duplicates = ["coords", "coord_type"]
for metadata_duplicate in metadata_duplicates:
metadata.pop(metadata_duplicate, None)
# Construct the Molecule object(s) from the trajectory
species = [Element.from_Z(z) for z in cclib_obj.atomnos]
coords = cclib_obj.atomcoords
molecules = [
Molecule(
species,
coord,
charge=charge,
spin_multiplicity=mult,
)
for coord in coords
]
initial_molecule = molecules[0]
final_molecule = molecules[-1]
attributes["molecule_initial"] = initial_molecule
if store_trajectory:
attributes["trajectory"] = molecules
# Store the HOMO/LUMO energies for convenience
if cclib_obj.moenergies is not None and cclib_obj.homos is not None:
homo_energies, lumo_energies, homo_lumo_gaps = _get_homos_lumos(
cclib_obj.moenergies, cclib_obj.homos
)
attributes["homo_energies"] = homo_energies
if lumo_energies:
attributes["lumo_energies"] = lumo_energies
if homo_lumo_gaps:
attributes["homo_lumo_gaps"] = homo_lumo_gaps
# The HOMO-LUMO gap for a spin-polarized system is ill-defined.
# This is why we report both the alpha and beta channel gaps
# above. Here, we report min(LUMO_alpha-HOMO_alpha,LUMO_beta-HOMO_beta)
# in case the user wants to easily query by this too. For restricted
# systems, this will always be the same as above.
attributes["min_homo_lumo_gap"] = min(homo_lumo_gaps)
# Calculate any properties
if analysis:
if isinstance(analysis, str):
analysis = [analysis]
analysis = [a.lower() for a in analysis]
# Look for .cube or .cub files
cubefile_path = find_recent_logfile(dir_name, [".cube", ".cub"])
for analysis_name in analysis:
logger.info(f"Running {analysis_name}")
calc_attributes = cclib_calculate(
cclib_obj, analysis_name, cubefile_path, proatom_dir
)
if calc_attributes:
attributes[analysis_name] = calc_attributes
else:
attributes[analysis_name] = None
doc = cls.from_molecule(
final_molecule,
energy=energy,
dir_name=get_uri(dir_name),
logfile=get_uri(logfile),
attributes=attributes,
metadata=metadata,
)
doc.molecule = final_molecule
return doc.model_copy(update=additional_fields)
[docs]
@requires(cclib, "cclib_calculate requires cclib to be installed.")
def cclib_calculate(
cclib_obj: Any,
method: str,
cube_file: Union[Path, str],
proatom_dir: Union[Path, str],
) -> dict[str, Any] | None:
"""
Run a cclib population analysis.
Parameters
----------
cclib_obj
The cclib object to run the population analysis on.
method
The population analysis method to use.
cube_file
The path to the cube file to use for the population analysis.
Needed only for Bader, DDEC6, and Hirshfeld
proatom_dir
The path to the proatom directory to use for the population analysis.
Needed only for DDEC6 and Hirshfeld.
"""
from cclib.method import (
CSPA,
DDEC6,
LPA,
MBO,
MPA,
Bader,
Bickelhaupt,
Density,
Hirshfeld,
volume,
)
method = method.lower()
cube_methods = ("bader", "ddec6", "hirshfeld")
if method in cube_methods and not cube_file:
raise FileNotFoundError(f"A cube file must be provided for {method}.")
if method in ("ddec6", "hirshfeld") and not proatom_dir:
if os.getenv("PROATOM_DIR") is None:
raise OSError("PROATOM_DIR environment variable not set.")
proatom_dir = os.path.expandvars(os.environ["PROATOM_DIR"])
if proatom_dir and not os.path.isdir(proatom_dir):
raise FileNotFoundError(f"{proatom_dir=} does not exist.")
if cube_file and method in cube_methods:
vol = volume.read_from_cube(str(cube_file))
if method == "bader":
_method = Bader(cclib_obj, vol)
elif method == "bickelhaupt":
_method = Bickelhaupt(cclib_obj)
elif method == "cpsa":
_method = CSPA(cclib_obj)
elif method == "ddec6":
_method = DDEC6(cclib_obj, vol, str(proatom_dir))
elif method == "density":
_method = Density(cclib_obj)
elif method == "hirshfeld":
_method = Hirshfeld(cclib_obj, vol, str(proatom_dir))
elif method == "lpa":
_method = LPA(cclib_obj)
elif method == "mbo":
_method = MBO(cclib_obj)
elif method == "mpa":
_method = MPA(cclib_obj)
else:
raise ValueError(f"{method=} is not supported.")
try:
_method.calculate()
except AttributeError:
return None
# The list of available attributes after a calculation. This is hardcoded for now
# until https://github.com/cclib/cclib/issues/1097 is resolved. Once it is, we can
# delete this and just do `return calc_attributes.getattributes()`.
avail_attributes = [
"aoresults",
"fragresults",
"fragcharges",
"density",
"donations",
"bdonations",
"repulsions",
"matches",
"refcharges",
]
calc_attributes = {}
for attribute in avail_attributes:
if hasattr(_method, attribute):
calc_attributes[attribute] = getattr(_method, attribute)
return calc_attributes
def _get_homos_lumos(
mo_energies: list[list[float]], homo_indices: list[int]
) -> tuple[list[float], list[float] | None, list[float] | None]:
"""
Calculate the HOMO, LUMO, and HOMO-LUMO gap energies in eV.
Parameters
----------
mo_energies
List of MO energies. For restricted calculations, List[List[float]] is
length one. For unrestricted, it is length two.
homo_indices
Indices of the HOMOs.
Returns
-------
homo_energies
The HOMO energies (eV), split by alpha and beta
lumo_energies
The LUMO energies (eV), split by alpha and beta
homo_lumo_gaps
The HOMO-LUMO gaps (eV), calculated as LUMO_alpha-HOMO_alpha and
LUMO_beta-HOMO_beta
"""
homo_energies = [mo_energies[idx][homo] for idx, homo in enumerate(homo_indices)]
# Make sure that the HOMO+1 (i.e. LUMO) is in MO energies (sometimes virtual
# orbitals aren't printed in the output)
for idx, homo in enumerate(homo_indices):
if len(mo_energies[idx]) < homo + 2:
return homo_energies, None, None
lumo_energies = [
mo_energies[idx][homo + 1] for idx, homo in enumerate(homo_indices)
]
homo_lumo_gaps = [
lumo_energies[i] - homo_energies[i] for i in range(len(homo_energies))
]
return homo_energies, lumo_energies, homo_lumo_gaps