Stores

Note

Some Store classes require extra packages that are not installed by default. Run the following modified installation commands if you want to use these stores:

MongograntStore:

pip install maggma[mongogrant]

MontyStore:

pip install maggma[montydb]

VaultStore:

pip install maggma[vault]

Module containing various definitions of Stores. Stores are a default access pattern to data and provide various utilities.

`JSONStore` ¶

Bases: MemoryStore

A Store for access to a single or multiple JSON files.

Source code in src/maggma/stores/mongolike.py

class JSONStore(MemoryStore):
    """
    A Store for access to a single or multiple JSON files.
    """

    def __init__(
        self,
        paths: Union[str, list[str]],
        read_only: bool = True,
        serialization_option: Optional[int] = None,
        serialization_default: Optional[Callable[[Any], Any]] = None,
        encoding: Optional[str] = None,
        **kwargs,
    ):
        """
        Args:
            paths: paths for json files to turn into a Store
            read_only: whether this JSONStore is read only. When read_only=True,
                       the JSONStore can still apply MongoDB-like writable operations
                       (e.g. an update) because it behaves like a MemoryStore,
                       but it will not write those changes to the file. On the other hand,
                       if read_only=False (i.e., it is writeable), the JSON file
                       will be automatically updated every time a write-like operation is
                       performed.

                       Note that when read_only=False, JSONStore only supports a single JSON
                       file. If the file does not exist, it will be automatically created
                       when the JSONStore is initialized.
            serialization_option:
                option that will be passed to the orjson.dump when saving to the json the file.
            serialization_default:
                default that will be passed to the orjson.dump when saving to the json the file.
            encoding: Character encoding of files to be tracked by the store. The default
                (None) follows python's default behavior, which is to determine the character
                encoding from the platform. This should work in the great majority of cases.
                However, if you encounter a UnicodeDecodeError, consider setting the encoding
                explicitly to 'utf8' or another encoding as appropriate.
        """
        paths = paths if isinstance(paths, (list, tuple)) else [paths]
        self.paths = paths
        self.encoding = encoding

        # file_writable overrides read_only for compatibility reasons
        if "file_writable" in kwargs:
            file_writable = kwargs.pop("file_writable")
            warnings.warn(
                "file_writable is deprecated; use read only instead.",
                DeprecationWarning,
            )
            self.read_only = not file_writable
            if self.read_only != read_only:
                warnings.warn(
                    f"Received conflicting keyword arguments file_writable={file_writable}"
                    f" and read_only={read_only}. Setting read_only={file_writable}.",
                    UserWarning,
                )
        else:
            self.read_only = read_only
        self.kwargs = kwargs

        if not self.read_only and len(paths) > 1:
            raise RuntimeError("Cannot instantiate file-writable JSONStore with multiple JSON files.")

        self.default_sort = None
        self.serialization_option = serialization_option
        self.serialization_default = serialization_default

        super().__init__(**kwargs)

    def connect(self, force_reset: bool = False):
        """
        Loads the files into the collection in memory.

        Args:
            force_reset: whether to reset the connection or not. If False (default) and .connect()
            has been called previously, the .json file will not be read in again. This can improve performance
            on systems with slow storage when multiple connect / disconnects are performed.
        """
        if self._coll is None or force_reset:
            self._coll = mongomock.MongoClient().db[self.name]  # type: ignore

            # create the .json file if it does not exist
            if not self.read_only and not Path(self.paths[0]).exists():
                with zopen(self.paths[0], "w", encoding=self.encoding) as f:
                    data: list[dict] = []
                    bytesdata = orjson.dumps(data)
                    f.write(bytesdata.decode("utf-8"))

            for path in self.paths:
                objects = self.read_json_file(path)
                try:
                    self.update(objects)
                except KeyError:
                    raise KeyError(
                        f"""
                        Key field '{self.key}' not found in {path.name}. This
                        could mean that this JSONStore was initially created with a different key field.
                        The keys found in the .json file are {list(objects[0].keys())}. Try
                        re-initializing your JSONStore using one of these as the key arguments.
                        """
                    )

    def read_json_file(self, path) -> list:
        """
        Helper method to read the contents of a JSON file and generate
        a list of docs.

        Args:
            path: Path to the JSON file to be read
        """
        with zopen(path) as f:
            data = f.read()
            data = data.decode() if isinstance(data, bytes) else data
            objects = bson.json_util.loads(data) if "$oid" in data else orjson.loads(data)
            objects = [objects] if not isinstance(objects, list) else objects
            # datetime objects deserialize to str. Try to convert the last_updated
            # field back to datetime.
            # # TODO - there may still be problems caused if a JSONStore is init'ed from
            # documents that don't contain a last_updated field
            # See Store.last_updated in store.py.
            for obj in objects:
                if obj.get(self.last_updated_field):
                    obj[self.last_updated_field] = to_dt(obj[self.last_updated_field])

        return objects

    def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None):
        """
        Update documents into the Store.

        For a file-writable JSONStore, the json file is updated.

        Args:
            docs: the document or list of documents to update
            key: field name(s) to determine uniqueness for a
                 document, can be a list of multiple fields,
                 a single field, or None if the Store's key
                 field is to be used
        """
        super().update(docs=docs, key=key)
        if not self.read_only:
            self.update_json_file()

    def remove_docs(self, criteria: dict):
        """
        Remove docs matching the query dictionary.

        For a file-writable JSONStore, the json file is updated.

        Args:
            criteria: query dictionary to match
        """
        super().remove_docs(criteria=criteria)
        if not self.read_only:
            self.update_json_file()

    def update_json_file(self):
        """
        Updates the json file when a write-like operation is performed.
        """
        with zopen(self.paths[0], "w", encoding=self.encoding) as f:
            data = list(self.query())
            for d in data:
                d.pop("_id")
            bytesdata = orjson.dumps(
                data,
                option=self.serialization_option,
                default=self.serialization_default,
            )
            f.write(bytesdata.decode("utf-8"))

    def __hash__(self):
        return hash((*self.paths, self.last_updated_field))

    def __eq__(self, other: object) -> bool:
        """
        Check equality for JSONStore.

        Args:
            other: other JSONStore to compare with
        """
        if not isinstance(other, JSONStore):
            return False

        fields = ["paths", "last_updated_field"]
        return all(getattr(self, f) == getattr(other, f) for f in fields)

`eq(other)` ¶

Check equality for JSONStore.

Parameters:

Name	Type	Description	Default
`other`	`object`	other JSONStore to compare with	required

Source code in src/maggma/stores/mongolike.py

def __eq__(self, other: object) -> bool:
    """
    Check equality for JSONStore.

    Args:
        other: other JSONStore to compare with
    """
    if not isinstance(other, JSONStore):
        return False

    fields = ["paths", "last_updated_field"]
    return all(getattr(self, f) == getattr(other, f) for f in fields)

`init(paths, read_only=True, serialization_option=None, serialization_default=None, encoding=None, **kwargs)` ¶

Parameters:

Name	Type	Description	Default
`paths`	`Union[str, list[str]]`	paths for json files to turn into a Store	required
`read_only`	`bool`	whether this JSONStore is read only. When read_only=True, the JSONStore can still apply MongoDB-like writable operations (e.g. an update) because it behaves like a MemoryStore, but it will not write those changes to the file. On the other hand, if read_only=False (i.e., it is writeable), the JSON file will be automatically updated every time a write-like operation is performed. `Note that when read_only=False, JSONStore only supports a single JSON file. If the file does not exist, it will be automatically created when the JSONStore is initialized.`	`True`
`serialization_option`	`Optional[int]`	option that will be passed to the orjson.dump when saving to the json the file.	`None`
`serialization_default`	`Optional[Callable[[Any], Any]]`	default that will be passed to the orjson.dump when saving to the json the file.	`None`
`encoding`	`Optional[str]`	Character encoding of files to be tracked by the store. The default (None) follows python's default behavior, which is to determine the character encoding from the platform. This should work in the great majority of cases. However, if you encounter a UnicodeDecodeError, consider setting the encoding explicitly to 'utf8' or another encoding as appropriate.	`None`

Source code in src/maggma/stores/mongolike.py

def __init__(
    self,
    paths: Union[str, list[str]],
    read_only: bool = True,
    serialization_option: Optional[int] = None,
    serialization_default: Optional[Callable[[Any], Any]] = None,
    encoding: Optional[str] = None,
    **kwargs,
):
    """
    Args:
        paths: paths for json files to turn into a Store
        read_only: whether this JSONStore is read only. When read_only=True,
                   the JSONStore can still apply MongoDB-like writable operations
                   (e.g. an update) because it behaves like a MemoryStore,
                   but it will not write those changes to the file. On the other hand,
                   if read_only=False (i.e., it is writeable), the JSON file
                   will be automatically updated every time a write-like operation is
                   performed.

                   Note that when read_only=False, JSONStore only supports a single JSON
                   file. If the file does not exist, it will be automatically created
                   when the JSONStore is initialized.
        serialization_option:
            option that will be passed to the orjson.dump when saving to the json the file.
        serialization_default:
            default that will be passed to the orjson.dump when saving to the json the file.
        encoding: Character encoding of files to be tracked by the store. The default
            (None) follows python's default behavior, which is to determine the character
            encoding from the platform. This should work in the great majority of cases.
            However, if you encounter a UnicodeDecodeError, consider setting the encoding
            explicitly to 'utf8' or another encoding as appropriate.
    """
    paths = paths if isinstance(paths, (list, tuple)) else [paths]
    self.paths = paths
    self.encoding = encoding

    # file_writable overrides read_only for compatibility reasons
    if "file_writable" in kwargs:
        file_writable = kwargs.pop("file_writable")
        warnings.warn(
            "file_writable is deprecated; use read only instead.",
            DeprecationWarning,
        )
        self.read_only = not file_writable
        if self.read_only != read_only:
            warnings.warn(
                f"Received conflicting keyword arguments file_writable={file_writable}"
                f" and read_only={read_only}. Setting read_only={file_writable}.",
                UserWarning,
            )
    else:
        self.read_only = read_only
    self.kwargs = kwargs

    if not self.read_only and len(paths) > 1:
        raise RuntimeError("Cannot instantiate file-writable JSONStore with multiple JSON files.")

    self.default_sort = None
    self.serialization_option = serialization_option
    self.serialization_default = serialization_default

    super().__init__(**kwargs)

`connect(force_reset=False)` ¶

Loads the files into the collection in memory.

Parameters:

Name	Type	Description	Default
`force_reset`	`bool`	whether to reset the connection or not. If False (default) and .connect()	`False`

Source code in src/maggma/stores/mongolike.py

def connect(self, force_reset: bool = False):
    """
    Loads the files into the collection in memory.

    Args:
        force_reset: whether to reset the connection or not. If False (default) and .connect()
        has been called previously, the .json file will not be read in again. This can improve performance
        on systems with slow storage when multiple connect / disconnects are performed.
    """
    if self._coll is None or force_reset:
        self._coll = mongomock.MongoClient().db[self.name]  # type: ignore

        # create the .json file if it does not exist
        if not self.read_only and not Path(self.paths[0]).exists():
            with zopen(self.paths[0], "w", encoding=self.encoding) as f:
                data: list[dict] = []
                bytesdata = orjson.dumps(data)
                f.write(bytesdata.decode("utf-8"))

        for path in self.paths:
            objects = self.read_json_file(path)
            try:
                self.update(objects)
            except KeyError:
                raise KeyError(
                    f"""
                    Key field '{self.key}' not found in {path.name}. This
                    could mean that this JSONStore was initially created with a different key field.
                    The keys found in the .json file are {list(objects[0].keys())}. Try
                    re-initializing your JSONStore using one of these as the key arguments.
                    """
                )

`read_json_file(path)` ¶

Helper method to read the contents of a JSON file and generate a list of docs.

Parameters:

Name	Type	Description	Default
`path`		Path to the JSON file to be read	required

Source code in src/maggma/stores/mongolike.py

def read_json_file(self, path) -> list:
    """
    Helper method to read the contents of a JSON file and generate
    a list of docs.

    Args:
        path: Path to the JSON file to be read
    """
    with zopen(path) as f:
        data = f.read()
        data = data.decode() if isinstance(data, bytes) else data
        objects = bson.json_util.loads(data) if "$oid" in data else orjson.loads(data)
        objects = [objects] if not isinstance(objects, list) else objects
        # datetime objects deserialize to str. Try to convert the last_updated
        # field back to datetime.
        # # TODO - there may still be problems caused if a JSONStore is init'ed from
        # documents that don't contain a last_updated field
        # See Store.last_updated in store.py.
        for obj in objects:
            if obj.get(self.last_updated_field):
                obj[self.last_updated_field] = to_dt(obj[self.last_updated_field])

    return objects

`remove_docs(criteria)` ¶

Remove docs matching the query dictionary.

For a file-writable JSONStore, the json file is updated.

Parameters:

Name	Type	Description	Default
`criteria`	`dict`	query dictionary to match	required

Source code in src/maggma/stores/mongolike.py

def remove_docs(self, criteria: dict):
    """
    Remove docs matching the query dictionary.

    For a file-writable JSONStore, the json file is updated.

    Args:
        criteria: query dictionary to match
    """
    super().remove_docs(criteria=criteria)
    if not self.read_only:
        self.update_json_file()

`update(docs, key=None)` ¶

Update documents into the Store.

For a file-writable JSONStore, the json file is updated.

Parameters:

Name	Type	Description	Default
`docs`	`Union[list[dict], dict]`	the document or list of documents to update	required
`key`	`Union[list, str, None]`	field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used	`None`

Source code in src/maggma/stores/mongolike.py

def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None):
    """
    Update documents into the Store.

    For a file-writable JSONStore, the json file is updated.

    Args:
        docs: the document or list of documents to update
        key: field name(s) to determine uniqueness for a
             document, can be a list of multiple fields,
             a single field, or None if the Store's key
             field is to be used
    """
    super().update(docs=docs, key=key)
    if not self.read_only:
        self.update_json_file()

`update_json_file()` ¶

Updates the json file when a write-like operation is performed.

Source code in src/maggma/stores/mongolike.py

def update_json_file(self):
    """
    Updates the json file when a write-like operation is performed.
    """
    with zopen(self.paths[0], "w", encoding=self.encoding) as f:
        data = list(self.query())
        for d in data:
            d.pop("_id")
        bytesdata = orjson.dumps(
            data,
            option=self.serialization_option,
            default=self.serialization_default,
        )
        f.write(bytesdata.decode("utf-8"))

`MemoryStore` ¶

Bases: MongoStore

An in-memory Store that functions similarly to a MongoStore.

Source code in src/maggma/stores/mongolike.py

class MemoryStore(MongoStore):
    """
    An in-memory Store that functions similarly
    to a MongoStore.
    """

    def __init__(self, collection_name: str = "memory_db", **kwargs):
        """
        Initializes the Memory Store.

        Args:
            collection_name: name for the collection in memory.
        """
        self.collection_name = collection_name
        self.default_sort = None
        self._coll = None
        self.kwargs = kwargs
        super(MongoStore, self).__init__(**kwargs)

    def connect(self, force_reset: bool = False):
        """
        Connect to the source data.

        Args:
            force_reset: whether to reset the connection or not when the Store is
                already connected.
        """
        if self._coll is None or force_reset:
            self._coll = mongomock.MongoClient().db[self.name]  # type: ignore

    def close(self):
        """Close up all collections."""
        self._coll.database.client.close()

    @property
    def name(self):
        """Name for the store."""
        return f"mem://{self.collection_name}"

    def __hash__(self):
        """Hash for the store."""
        return hash((self.name, self.last_updated_field))

    def groupby(
        self,
        keys: Union[list[str], str],
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        skip: int = 0,
        limit: int = 0,
    ) -> Iterator[tuple[dict, list[dict]]]:
        """
        Simple grouping function that will group documents
        by keys.

        Args:
            keys: fields to group documents
            criteria: PyMongo filter for documents to search in
            properties: properties to return in grouped documents
            sort: Dictionary of sort order for fields. Keys are field names and
                values are 1 for ascending or -1 for descending.
            skip: number documents to skip
            limit: limit on total number of documents returned

        Returns:
            generator returning tuples of (key, list of elements)
        """
        keys = keys if isinstance(keys, list) else [keys]

        if properties is None:
            properties = []
        if isinstance(properties, dict):
            properties = list(properties.keys())

        data = [
            doc for doc in self.query(properties=keys + properties, criteria=criteria) if all(has(doc, k) for k in keys)
        ]

        def grouping_keys(doc):
            return tuple(get(doc, k) for k in keys)

        for vals, group in groupby(sorted(data, key=grouping_keys), key=grouping_keys):
            doc = {}  # type: ignore
            for k, v in zip(keys, vals):
                set_(doc, k, v)
            yield doc, list(group)

    def __eq__(self, other: object) -> bool:
        """
        Check equality for MemoryStore
        other: other MemoryStore to compare with.
        """
        if not isinstance(other, MemoryStore):
            return False

        fields = ["collection_name", "last_updated_field"]
        return all(getattr(self, f) == getattr(other, f) for f in fields)

`name` `property` ¶

Name for the store.

`eq(other)` ¶

Check equality for MemoryStore other: other MemoryStore to compare with.

Source code in src/maggma/stores/mongolike.py

def __eq__(self, other: object) -> bool:
    """
    Check equality for MemoryStore
    other: other MemoryStore to compare with.
    """
    if not isinstance(other, MemoryStore):
        return False

    fields = ["collection_name", "last_updated_field"]
    return all(getattr(self, f) == getattr(other, f) for f in fields)

`hash()` ¶

Hash for the store.

Source code in src/maggma/stores/mongolike.py

def __hash__(self):
    """Hash for the store."""
    return hash((self.name, self.last_updated_field))

`init(collection_name='memory_db', **kwargs)` ¶

Initializes the Memory Store.

Parameters:

Name	Type	Description	Default
`collection_name`	`str`	name for the collection in memory.	`'memory_db'`

Source code in src/maggma/stores/mongolike.py

def __init__(self, collection_name: str = "memory_db", **kwargs):
    """
    Initializes the Memory Store.

    Args:
        collection_name: name for the collection in memory.
    """
    self.collection_name = collection_name
    self.default_sort = None
    self._coll = None
    self.kwargs = kwargs
    super(MongoStore, self).__init__(**kwargs)

`close()` ¶

Close up all collections.

Source code in src/maggma/stores/mongolike.py

def close(self):
    """Close up all collections."""
    self._coll.database.client.close()

`connect(force_reset=False)` ¶

Connect to the source data.

Parameters:

Name	Type	Description	Default
`force_reset`	`bool`	whether to reset the connection or not when the Store is already connected.	`False`

Source code in src/maggma/stores/mongolike.py

def connect(self, force_reset: bool = False):
    """
    Connect to the source data.

    Args:
        force_reset: whether to reset the connection or not when the Store is
            already connected.
    """
    if self._coll is None or force_reset:
        self._coll = mongomock.MongoClient().db[self.name]  # type: ignore

`groupby(keys, criteria=None, properties=None, sort=None, skip=0, limit=0)` ¶

Simple grouping function that will group documents by keys.

Parameters:

Name	Type	Description	Default
`keys`	`Union[list[str], str]`	fields to group documents	required
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`
`properties`	`Union[dict, list, None]`	properties to return in grouped documents	`None`
`sort`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending.	`None`
`skip`	`int`	number documents to skip	`0`
`limit`	`int`	limit on total number of documents returned	`0`

Returns:

Type	Description
`Iterator[tuple[dict, list[dict]]]`	generator returning tuples of (key, list of elements)

Source code in src/maggma/stores/mongolike.py

def groupby(
    self,
    keys: Union[list[str], str],
    criteria: Optional[dict] = None,
    properties: Union[dict, list, None] = None,
    sort: Optional[dict[str, Union[Sort, int]]] = None,
    skip: int = 0,
    limit: int = 0,
) -> Iterator[tuple[dict, list[dict]]]:
    """
    Simple grouping function that will group documents
    by keys.

    Args:
        keys: fields to group documents
        criteria: PyMongo filter for documents to search in
        properties: properties to return in grouped documents
        sort: Dictionary of sort order for fields. Keys are field names and
            values are 1 for ascending or -1 for descending.
        skip: number documents to skip
        limit: limit on total number of documents returned

    Returns:
        generator returning tuples of (key, list of elements)
    """
    keys = keys if isinstance(keys, list) else [keys]

    if properties is None:
        properties = []
    if isinstance(properties, dict):
        properties = list(properties.keys())

    data = [
        doc for doc in self.query(properties=keys + properties, criteria=criteria) if all(has(doc, k) for k in keys)
    ]

    def grouping_keys(doc):
        return tuple(get(doc, k) for k in keys)

    for vals, group in groupby(sorted(data, key=grouping_keys), key=grouping_keys):
        doc = {}  # type: ignore
        for k, v in zip(keys, vals):
            set_(doc, k, v)
        yield doc, list(group)

`MongoStore` ¶

Bases: Store

A Store that connects to a Mongo collection.

Source code in src/maggma/stores/mongolike.py

class MongoStore(Store):
    """
    A Store that connects to a Mongo collection.
    """

    def __init__(
        self,
        database: str,
        collection_name: str,
        host: str = "localhost",
        port: int = 27017,
        username: str = "",
        password: str = "",
        ssh_tunnel: Optional[SSHTunnel] = None,
        safe_update: bool = False,
        auth_source: Optional[str] = None,
        mongoclient_kwargs: Optional[dict] = None,
        default_sort: Optional[dict[str, Union[Sort, int]]] = None,
        **kwargs,
    ):
        """
        Args:
            database: The database name
            collection_name: The collection name
            host: Hostname for the database
            port: TCP port to connect to
            username: Username for the collection
            password: Password to connect with
            safe_update: fail gracefully on DocumentTooLarge errors on update
            auth_source: The database to authenticate on. Defaults to the database name.
            default_sort: Default sort field and direction to use when querying. Can be used to
                ensure determinacy in query results.
        """
        self.database = database
        self.collection_name = collection_name
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.ssh_tunnel = ssh_tunnel
        self.safe_update = safe_update
        self.default_sort = default_sort
        self._coll = None  # type: ignore
        self.kwargs = kwargs

        if auth_source is None:
            auth_source = self.database
        self.auth_source = auth_source
        self.mongoclient_kwargs = mongoclient_kwargs or {}

        super().__init__(**kwargs)

    @property
    def name(self) -> str:
        """
        Return a string representing this data source.
        """
        return f"mongo://{self.host}/{self.database}/{self.collection_name}"

    def connect(self, force_reset: bool = False):
        """
        Connect to the source data.

        Args:
            force_reset: whether to reset the connection or not when the Store is
                already connected.
        """
        if self._coll is None or force_reset:
            if self.ssh_tunnel is None:
                host = self.host
                port = self.port
            else:
                self.ssh_tunnel.start()
                host, port = self.ssh_tunnel.local_address

            conn: MongoClient = (
                MongoClient(
                    host=host,
                    port=port,
                    username=self.username,
                    password=self.password,
                    authSource=self.auth_source,
                    **self.mongoclient_kwargs,
                )
                if self.username != ""
                else MongoClient(host, port, **self.mongoclient_kwargs)
            )
            db = conn[self.database]
            self._coll = db[self.collection_name]  # type: ignore

    def __hash__(self) -> int:
        """Hash for MongoStore."""
        return hash((self.database, self.collection_name, self.last_updated_field))

    @classmethod
    def from_db_file(cls, filename: str, **kwargs):
        """
        Convenience method to construct MongoStore from db_file
        from old QueryEngine format.
        """
        kwargs = loadfn(filename)
        if "collection" in kwargs:
            kwargs["collection_name"] = kwargs.pop("collection")
        # Get rid of aliases from traditional query engine db docs
        kwargs.pop("aliases", None)
        return cls(**kwargs)

    @classmethod
    def from_launchpad_file(cls, lp_file, collection_name, **kwargs):
        """
        Convenience method to construct MongoStore from a launchpad file.

        Note: A launchpad file is a special formatted yaml file used in fireworks

        Returns:
        """
        with open(lp_file) as f:
            yaml = YAML(typ="safe", pure=True)
            lp_creds = yaml.load(f.read())

        db_creds = lp_creds.copy()
        db_creds["database"] = db_creds["name"]
        for key in list(db_creds.keys()):
            if key not in ["database", "host", "port", "username", "password"]:
                db_creds.pop(key)
        db_creds["collection_name"] = collection_name

        return cls(**db_creds, **kwargs)

    def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list:
        """
        Get all distinct values for a field.

        Args:
            field: the field(s) to get distinct values for
            criteria: PyMongo filter for documents to search in
        """
        criteria = criteria or {}
        try:
            distinct_vals = self._collection.distinct(field, criteria)
        except (OperationFailure, DocumentTooLarge):
            distinct_vals = [
                d["_id"] for d in self._collection.aggregate([{"$match": criteria}, {"$group": {"_id": f"${field}"}}])
            ]
            if all(isinstance(d, list) for d in filter(None, distinct_vals)):  # type: ignore
                distinct_vals = list(chain.from_iterable(filter(None, distinct_vals)))

        return distinct_vals if distinct_vals is not None else []

    def groupby(
        self,
        keys: Union[list[str], str],
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        skip: int = 0,
        limit: int = 0,
    ) -> Iterator[tuple[dict, list[dict]]]:
        """
        Simple grouping function that will group documents
        by keys.

        Args:
            keys: fields to group documents
            criteria: PyMongo filter for documents to search in
            properties: properties to return in grouped documents
            sort: Dictionary of sort order for fields. Keys are field names and
                values are 1 for ascending or -1 for descending.
            skip: number documents to skip
            limit: limit on total number of documents returned

        Returns:
            generator returning tuples of (key, list of docs)
        """
        pipeline = []
        if isinstance(keys, str):
            keys = [keys]

        if properties is None:
            properties = []
        if isinstance(properties, dict):
            properties = list(properties.keys())

        if criteria is not None:
            pipeline.append({"$match": criteria})

        if len(properties) > 0:
            pipeline.append({"$project": {p: 1 for p in properties + keys}})

        alpha = "abcdefghijklmnopqrstuvwxyz"
        group_id = {letter: f"${key}" for letter, key in zip(alpha, keys)}
        pipeline.append({"$group": {"_id": group_id, "docs": {"$push": "$$ROOT"}}})
        for d in self._collection.aggregate(pipeline, allowDiskUse=True):
            id_doc = {}  # type: ignore
            for letter, key in group_id.items():
                if has(d["_id"], letter):
                    set_(id_doc, key[1:], d["_id"][letter])
            yield (id_doc, d["docs"])

    @classmethod
    def from_collection(cls, collection):
        """
        Generates a MongoStore from a pymongo collection object
        This is not a fully safe operation as it gives dummy information to the MongoStore
        As a result, this will not serialize and can not reset its connection.

        Args:
            collection: the PyMongo collection to create a MongoStore around
        """
        # TODO: How do we make this safer?
        coll_name = collection.name
        db_name = collection.database.name

        store = cls(db_name, coll_name)
        store._coll = collection
        return store

    @property
    def _collection(self):
        """Property referring to underlying pymongo collection."""
        if self._coll is None:
            raise StoreError("Must connect Mongo-like store before attempting to use it")
        return self._coll

    def count(
        self,
        criteria: Optional[dict] = None,
        hint: Optional[dict[str, Union[Sort, int]]] = None,
    ) -> int:
        """
        Counts the number of documents matching the query criteria.

        Args:
            criteria: PyMongo filter for documents to count in
            hint: Dictionary of indexes to use as hints for query optimizer.
                Keys are field names and values are 1 for ascending or -1 for descending.
        """
        criteria = criteria if criteria else {}

        hint_list = (
            [(k, Sort(v).value) if isinstance(v, int) else (k, v.value) for k, v in hint.items()] if hint else None
        )

        if hint_list is not None:  # pragma: no cover
            return self._collection.count_documents(filter=criteria, hint=hint_list)

        return (
            self._collection.count_documents(filter=criteria)
            if criteria
            else self._collection.estimated_document_count()
        )

    def query(  # type: ignore
        self,
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        hint: Optional[dict[str, Union[Sort, int]]] = None,
        skip: int = 0,
        limit: int = 0,
        **kwargs,
    ) -> Iterator[dict]:
        """
        Queries the Store for a set of documents.

        Args:
            criteria: PyMongo filter for documents to search in
            properties: properties to return in grouped documents
            sort: Dictionary of sort order for fields. Keys are field names and
                values are 1 for ascending or -1 for descending.
            hint: Dictionary of indexes to use as hints for query optimizer.
                Keys are field names and values are 1 for ascending or -1 for descending.
            skip: number documents to skip
            limit: limit on total number of documents returned
            mongoclient_kwargs: Dict of extra kwargs to pass to pymongo find.
        """
        if isinstance(properties, list):
            properties = {p: 1 for p in properties}

        default_sort_formatted = None

        if self.default_sort is not None:
            default_sort_formatted = [
                (k, Sort(v).value) if isinstance(v, int) else (k, v.value) for k, v in self.default_sort.items()
            ]

        sort_list = (
            [(k, Sort(v).value) if isinstance(v, int) else (k, v.value) for k, v in sort.items()]
            if sort
            else default_sort_formatted
        )

        hint_list = (
            [(k, Sort(v).value) if isinstance(v, int) else (k, v.value) for k, v in hint.items()] if hint else None
        )

        yield from self._collection.find(
            filter=criteria,
            projection=properties,
            skip=skip,
            limit=limit,
            sort=sort_list,
            hint=hint_list,
            **kwargs,
        )

    def ensure_index(self, key: str, unique: Optional[bool] = False) -> bool:
        """
        Tries to create an index and return true if it succeeded.

        Args:
            key: single key to index
            unique: Whether or not this index contains only unique keys.

        Returns:
            bool indicating if the index exists/was created
        """
        if confirm_field_index(self._collection, key):
            return True

        try:
            self._collection.create_index(key, unique=unique, background=True)
            return True
        except Exception:
            return False

    def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None):
        """
        Update documents into the Store.

        Args:
            docs: the document or list of documents to update
            key: field name(s) to determine uniqueness for a
                 document, can be a list of multiple fields,
                 a single field, or None if the Store's key
                 field is to be used
        """
        requests = []

        if not isinstance(docs, list):
            docs = [docs]

        for d in (jsanitize(x, allow_bson=True, recursive_msonable=True) for x in docs):
            # document-level validation is optional
            validates = True
            if self.validator:
                validates = self.validator.is_valid(d)
                if not validates:
                    if self.validator.strict:
                        raise ValueError(self.validator.validation_errors(d))
                    self.logger.error(self.validator.validation_errors(d))

            if validates:
                key = key or self.key
                search_doc = {k: d[k] for k in key} if isinstance(key, list) else {key: d[key]}

                requests.append(ReplaceOne(search_doc, d, upsert=True))

        if len(requests) > 0:
            try:
                self._collection.bulk_write(requests, ordered=False)
            except (OperationFailure, DocumentTooLarge) as e:
                if self.safe_update:
                    for req in requests:
                        try:
                            self._collection.bulk_write([req], ordered=False)
                        except (OperationFailure, DocumentTooLarge):
                            self.logger.error(
                                f"Could not upload document for {req._filter} as it was too large for Mongo"
                            )
                else:
                    raise e

    def remove_docs(self, criteria: dict):
        """
        Remove docs matching the query dictionary.

        Args:
            criteria: query dictionary to match
        """
        self._collection.delete_many(filter=criteria)

    def close(self):
        """Close up all collections."""
        self._collection.database.client.close()
        self._coll = None
        if self.ssh_tunnel is not None:
            self.ssh_tunnel.stop()

    def __eq__(self, other: object) -> bool:
        """
        Check equality for MongoStore
        other: other mongostore to compare with.
        """
        if not isinstance(other, MongoStore):
            return False

        fields = ["database", "collection_name", "host", "port", "last_updated_field"]
        return all(getattr(self, f) == getattr(other, f) for f in fields)

`name: str` `property` ¶

Return a string representing this data source.

`eq(other)` ¶

Check equality for MongoStore other: other mongostore to compare with.

Source code in src/maggma/stores/mongolike.py

def __eq__(self, other: object) -> bool:
    """
    Check equality for MongoStore
    other: other mongostore to compare with.
    """
    if not isinstance(other, MongoStore):
        return False

    fields = ["database", "collection_name", "host", "port", "last_updated_field"]
    return all(getattr(self, f) == getattr(other, f) for f in fields)

`hash()` ¶

Hash for MongoStore.

Source code in src/maggma/stores/mongolike.py

def __hash__(self) -> int:
    """Hash for MongoStore."""
    return hash((self.database, self.collection_name, self.last_updated_field))

`init(database, collection_name, host='localhost', port=27017, username='', password='', ssh_tunnel=None, safe_update=False, auth_source=None, mongoclient_kwargs=None, default_sort=None, **kwargs)` ¶

Parameters:

Name	Type	Description	Default
`database`	`str`	The database name	required
`collection_name`	`str`	The collection name	required
`host`	`str`	Hostname for the database	`'localhost'`
`port`	`int`	TCP port to connect to	`27017`
`username`	`str`	Username for the collection	`''`
`password`	`str`	Password to connect with	`''`
`safe_update`	`bool`	fail gracefully on DocumentTooLarge errors on update	`False`
`auth_source`	`Optional[str]`	The database to authenticate on. Defaults to the database name.	`None`
`default_sort`	`Optional[dict[str, Union[Sort, int]]]`	Default sort field and direction to use when querying. Can be used to ensure determinacy in query results.	`None`

Source code in src/maggma/stores/mongolike.py

def __init__(
    self,
    database: str,
    collection_name: str,
    host: str = "localhost",
    port: int = 27017,
    username: str = "",
    password: str = "",
    ssh_tunnel: Optional[SSHTunnel] = None,
    safe_update: bool = False,
    auth_source: Optional[str] = None,
    mongoclient_kwargs: Optional[dict] = None,
    default_sort: Optional[dict[str, Union[Sort, int]]] = None,
    **kwargs,
):
    """
    Args:
        database: The database name
        collection_name: The collection name
        host: Hostname for the database
        port: TCP port to connect to
        username: Username for the collection
        password: Password to connect with
        safe_update: fail gracefully on DocumentTooLarge errors on update
        auth_source: The database to authenticate on. Defaults to the database name.
        default_sort: Default sort field and direction to use when querying. Can be used to
            ensure determinacy in query results.
    """
    self.database = database
    self.collection_name = collection_name
    self.host = host
    self.port = port
    self.username = username
    self.password = password
    self.ssh_tunnel = ssh_tunnel
    self.safe_update = safe_update
    self.default_sort = default_sort
    self._coll = None  # type: ignore
    self.kwargs = kwargs

    if auth_source is None:
        auth_source = self.database
    self.auth_source = auth_source
    self.mongoclient_kwargs = mongoclient_kwargs or {}

    super().__init__(**kwargs)

`close()` ¶

Close up all collections.

Source code in src/maggma/stores/mongolike.py

def close(self):
    """Close up all collections."""
    self._collection.database.client.close()
    self._coll = None
    if self.ssh_tunnel is not None:
        self.ssh_tunnel.stop()

`connect(force_reset=False)` ¶

Connect to the source data.

Parameters:

Name	Type	Description	Default
`force_reset`	`bool`	whether to reset the connection or not when the Store is already connected.	`False`

Source code in src/maggma/stores/mongolike.py

def connect(self, force_reset: bool = False):
    """
    Connect to the source data.

    Args:
        force_reset: whether to reset the connection or not when the Store is
            already connected.
    """
    if self._coll is None or force_reset:
        if self.ssh_tunnel is None:
            host = self.host
            port = self.port
        else:
            self.ssh_tunnel.start()
            host, port = self.ssh_tunnel.local_address

        conn: MongoClient = (
            MongoClient(
                host=host,
                port=port,
                username=self.username,
                password=self.password,
                authSource=self.auth_source,
                **self.mongoclient_kwargs,
            )
            if self.username != ""
            else MongoClient(host, port, **self.mongoclient_kwargs)
        )
        db = conn[self.database]
        self._coll = db[self.collection_name]  # type: ignore

`count(criteria=None, hint=None)` ¶

Counts the number of documents matching the query criteria.

Parameters:

Name	Type	Description	Default
`criteria`	`Optional[dict]`	PyMongo filter for documents to count in	`None`
`hint`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of indexes to use as hints for query optimizer. Keys are field names and values are 1 for ascending or -1 for descending.	`None`

Source code in src/maggma/stores/mongolike.py

def count(
    self,
    criteria: Optional[dict] = None,
    hint: Optional[dict[str, Union[Sort, int]]] = None,
) -> int:
    """
    Counts the number of documents matching the query criteria.

    Args:
        criteria: PyMongo filter for documents to count in
        hint: Dictionary of indexes to use as hints for query optimizer.
            Keys are field names and values are 1 for ascending or -1 for descending.
    """
    criteria = criteria if criteria else {}

    hint_list = (
        [(k, Sort(v).value) if isinstance(v, int) else (k, v.value) for k, v in hint.items()] if hint else None
    )

    if hint_list is not None:  # pragma: no cover
        return self._collection.count_documents(filter=criteria, hint=hint_list)

    return (
        self._collection.count_documents(filter=criteria)
        if criteria
        else self._collection.estimated_document_count()
    )

`distinct(field, criteria=None, all_exist=False)` ¶

Get all distinct values for a field.

Parameters:

Name	Type	Description	Default
`field`	`str`	the field(s) to get distinct values for	required
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`

Source code in src/maggma/stores/mongolike.py

def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list:
    """
    Get all distinct values for a field.

    Args:
        field: the field(s) to get distinct values for
        criteria: PyMongo filter for documents to search in
    """
    criteria = criteria or {}
    try:
        distinct_vals = self._collection.distinct(field, criteria)
    except (OperationFailure, DocumentTooLarge):
        distinct_vals = [
            d["_id"] for d in self._collection.aggregate([{"$match": criteria}, {"$group": {"_id": f"${field}"}}])
        ]
        if all(isinstance(d, list) for d in filter(None, distinct_vals)):  # type: ignore
            distinct_vals = list(chain.from_iterable(filter(None, distinct_vals)))

    return distinct_vals if distinct_vals is not None else []

`ensure_index(key, unique=False)` ¶

Tries to create an index and return true if it succeeded.

Parameters:

Name	Type	Description	Default
`key`	`str`	single key to index	required
`unique`	`Optional[bool]`	Whether or not this index contains only unique keys.	`False`

Returns:

Type	Description
`bool`	bool indicating if the index exists/was created

Source code in src/maggma/stores/mongolike.py

def ensure_index(self, key: str, unique: Optional[bool] = False) -> bool:
    """
    Tries to create an index and return true if it succeeded.

    Args:
        key: single key to index
        unique: Whether or not this index contains only unique keys.

    Returns:
        bool indicating if the index exists/was created
    """
    if confirm_field_index(self._collection, key):
        return True

    try:
        self._collection.create_index(key, unique=unique, background=True)
        return True
    except Exception:
        return False

`from_collection(collection)` `classmethod` ¶

Generates a MongoStore from a pymongo collection object This is not a fully safe operation as it gives dummy information to the MongoStore As a result, this will not serialize and can not reset its connection.

Parameters:

Name	Type	Description	Default
`collection`		the PyMongo collection to create a MongoStore around	required

Source code in src/maggma/stores/mongolike.py

@classmethod
def from_collection(cls, collection):
    """
    Generates a MongoStore from a pymongo collection object
    This is not a fully safe operation as it gives dummy information to the MongoStore
    As a result, this will not serialize and can not reset its connection.

    Args:
        collection: the PyMongo collection to create a MongoStore around
    """
    # TODO: How do we make this safer?
    coll_name = collection.name
    db_name = collection.database.name

    store = cls(db_name, coll_name)
    store._coll = collection
    return store

`from_db_file(filename, **kwargs)` `classmethod` ¶

Convenience method to construct MongoStore from db_file from old QueryEngine format.

Source code in src/maggma/stores/mongolike.py

@classmethod
def from_db_file(cls, filename: str, **kwargs):
    """
    Convenience method to construct MongoStore from db_file
    from old QueryEngine format.
    """
    kwargs = loadfn(filename)
    if "collection" in kwargs:
        kwargs["collection_name"] = kwargs.pop("collection")
    # Get rid of aliases from traditional query engine db docs
    kwargs.pop("aliases", None)
    return cls(**kwargs)

`from_launchpad_file(lp_file, collection_name, **kwargs)` `classmethod` ¶

Convenience method to construct MongoStore from a launchpad file.

Note: A launchpad file is a special formatted yaml file used in fireworks

Returns:

Source code in src/maggma/stores/mongolike.py

@classmethod
def from_launchpad_file(cls, lp_file, collection_name, **kwargs):
    """
    Convenience method to construct MongoStore from a launchpad file.

    Note: A launchpad file is a special formatted yaml file used in fireworks

    Returns:
    """
    with open(lp_file) as f:
        yaml = YAML(typ="safe", pure=True)
        lp_creds = yaml.load(f.read())

    db_creds = lp_creds.copy()
    db_creds["database"] = db_creds["name"]
    for key in list(db_creds.keys()):
        if key not in ["database", "host", "port", "username", "password"]:
            db_creds.pop(key)
    db_creds["collection_name"] = collection_name

    return cls(**db_creds, **kwargs)

`groupby(keys, criteria=None, properties=None, sort=None, skip=0, limit=0)` ¶

Simple grouping function that will group documents by keys.

Parameters:

Name	Type	Description	Default
`keys`	`Union[list[str], str]`	fields to group documents	required
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`
`properties`	`Union[dict, list, None]`	properties to return in grouped documents	`None`
`sort`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending.	`None`
`skip`	`int`	number documents to skip	`0`
`limit`	`int`	limit on total number of documents returned	`0`

Returns:

Type	Description
`Iterator[tuple[dict, list[dict]]]`	generator returning tuples of (key, list of docs)

Source code in src/maggma/stores/mongolike.py

def groupby(
    self,
    keys: Union[list[str], str],
    criteria: Optional[dict] = None,
    properties: Union[dict, list, None] = None,
    sort: Optional[dict[str, Union[Sort, int]]] = None,
    skip: int = 0,
    limit: int = 0,
) -> Iterator[tuple[dict, list[dict]]]:
    """
    Simple grouping function that will group documents
    by keys.

    Args:
        keys: fields to group documents
        criteria: PyMongo filter for documents to search in
        properties: properties to return in grouped documents
        sort: Dictionary of sort order for fields. Keys are field names and
            values are 1 for ascending or -1 for descending.
        skip: number documents to skip
        limit: limit on total number of documents returned

    Returns:
        generator returning tuples of (key, list of docs)
    """
    pipeline = []
    if isinstance(keys, str):
        keys = [keys]

    if properties is None:
        properties = []
    if isinstance(properties, dict):
        properties = list(properties.keys())

    if criteria is not None:
        pipeline.append({"$match": criteria})

    if len(properties) > 0:
        pipeline.append({"$project": {p: 1 for p in properties + keys}})

    alpha = "abcdefghijklmnopqrstuvwxyz"
    group_id = {letter: f"${key}" for letter, key in zip(alpha, keys)}
    pipeline.append({"$group": {"_id": group_id, "docs": {"$push": "$$ROOT"}}})
    for d in self._collection.aggregate(pipeline, allowDiskUse=True):
        id_doc = {}  # type: ignore
        for letter, key in group_id.items():
            if has(d["_id"], letter):
                set_(id_doc, key[1:], d["_id"][letter])
        yield (id_doc, d["docs"])

`query(criteria=None, properties=None, sort=None, hint=None, skip=0, limit=0, **kwargs)` ¶

Queries the Store for a set of documents.

Parameters:

Name	Type	Description	Default
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`
`properties`	`Union[dict, list, None]`	properties to return in grouped documents	`None`
`sort`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending.	`None`
`hint`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of indexes to use as hints for query optimizer. Keys are field names and values are 1 for ascending or -1 for descending.	`None`
`skip`	`int`	number documents to skip	`0`
`limit`	`int`	limit on total number of documents returned	`0`
`mongoclient_kwargs`		Dict of extra kwargs to pass to pymongo find.	required

Source code in src/maggma/stores/mongolike.py

def query(  # type: ignore
    self,
    criteria: Optional[dict] = None,
    properties: Union[dict, list, None] = None,
    sort: Optional[dict[str, Union[Sort, int]]] = None,
    hint: Optional[dict[str, Union[Sort, int]]] = None,
    skip: int = 0,
    limit: int = 0,
    **kwargs,
) -> Iterator[dict]:
    """
    Queries the Store for a set of documents.

    Args:
        criteria: PyMongo filter for documents to search in
        properties: properties to return in grouped documents
        sort: Dictionary of sort order for fields. Keys are field names and
            values are 1 for ascending or -1 for descending.
        hint: Dictionary of indexes to use as hints for query optimizer.
            Keys are field names and values are 1 for ascending or -1 for descending.
        skip: number documents to skip
        limit: limit on total number of documents returned
        mongoclient_kwargs: Dict of extra kwargs to pass to pymongo find.
    """
    if isinstance(properties, list):
        properties = {p: 1 for p in properties}

    default_sort_formatted = None

    if self.default_sort is not None:
        default_sort_formatted = [
            (k, Sort(v).value) if isinstance(v, int) else (k, v.value) for k, v in self.default_sort.items()
        ]

    sort_list = (
        [(k, Sort(v).value) if isinstance(v, int) else (k, v.value) for k, v in sort.items()]
        if sort
        else default_sort_formatted
    )

    hint_list = (
        [(k, Sort(v).value) if isinstance(v, int) else (k, v.value) for k, v in hint.items()] if hint else None
    )

    yield from self._collection.find(
        filter=criteria,
        projection=properties,
        skip=skip,
        limit=limit,
        sort=sort_list,
        hint=hint_list,
        **kwargs,
    )

`remove_docs(criteria)` ¶

Remove docs matching the query dictionary.

Parameters:

Name	Type	Description	Default
`criteria`	`dict`	query dictionary to match	required

Source code in src/maggma/stores/mongolike.py

def remove_docs(self, criteria: dict):
    """
    Remove docs matching the query dictionary.

    Args:
        criteria: query dictionary to match
    """
    self._collection.delete_many(filter=criteria)

`update(docs, key=None)` ¶

Update documents into the Store.

Parameters:

Name	Type	Description	Default
`docs`	`Union[list[dict], dict]`	the document or list of documents to update	required
`key`	`Union[list, str, None]`	field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used	`None`

Source code in src/maggma/stores/mongolike.py

def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None):
    """
    Update documents into the Store.

    Args:
        docs: the document or list of documents to update
        key: field name(s) to determine uniqueness for a
             document, can be a list of multiple fields,
             a single field, or None if the Store's key
             field is to be used
    """
    requests = []

    if not isinstance(docs, list):
        docs = [docs]

    for d in (jsanitize(x, allow_bson=True, recursive_msonable=True) for x in docs):
        # document-level validation is optional
        validates = True
        if self.validator:
            validates = self.validator.is_valid(d)
            if not validates:
                if self.validator.strict:
                    raise ValueError(self.validator.validation_errors(d))
                self.logger.error(self.validator.validation_errors(d))

        if validates:
            key = key or self.key
            search_doc = {k: d[k] for k in key} if isinstance(key, list) else {key: d[key]}

            requests.append(ReplaceOne(search_doc, d, upsert=True))

    if len(requests) > 0:
        try:
            self._collection.bulk_write(requests, ordered=False)
        except (OperationFailure, DocumentTooLarge) as e:
            if self.safe_update:
                for req in requests:
                    try:
                        self._collection.bulk_write([req], ordered=False)
                    except (OperationFailure, DocumentTooLarge):
                        self.logger.error(
                            f"Could not upload document for {req._filter} as it was too large for Mongo"
                        )
            else:
                raise e

`MongoURIStore` ¶

Bases: MongoStore

A Store that connects to a Mongo collection via a URI This is expected to be a special mongodb+srv:// URIs that include client parameters via TXT records.

Source code in src/maggma/stores/mongolike.py

class MongoURIStore(MongoStore):
    """
    A Store that connects to a Mongo collection via a URI
    This is expected to be a special mongodb+srv:// URIs that include
    client parameters via TXT records.
    """

    def __init__(
        self,
        uri: str,
        collection_name: str,
        database: Optional[str] = None,
        ssh_tunnel: Optional[SSHTunnel] = None,
        mongoclient_kwargs: Optional[dict] = None,
        default_sort: Optional[dict[str, Union[Sort, int]]] = None,
        **kwargs,
    ):
        """
        Args:
            uri: MongoDB+SRV URI
            database: database to connect to
            collection_name: The collection name
            default_sort: Default sort field and direction to use when querying. Can be used to
                ensure determinacy in query results.
        """
        self.uri = uri
        self.ssh_tunnel = ssh_tunnel
        self.default_sort = default_sort
        self.mongoclient_kwargs = mongoclient_kwargs or {}

        # parse the dbname from the uri
        if database is None:
            d_uri = uri_parser.parse_uri(uri)
            if d_uri["database"] is None:
                raise ConfigurationError("If database name is not supplied, a database must be set in the uri")
            self.database = d_uri["database"]
        else:
            self.database = database

        self.collection_name = collection_name
        self.kwargs = kwargs
        self._coll = None
        super(MongoStore, self).__init__(**kwargs)  # lgtm

    @property
    def name(self) -> str:
        """
        Return a string representing this data source.
        """
        # TODO: This is not very safe since it exposes the username/password info
        return self.uri

    def connect(self, force_reset: bool = False):
        """
        Connect to the source data.

        Args:
            force_reset: whether to reset the connection or not when the Store is
                already connected.
        """
        if self._coll is None or force_reset:  # pragma: no cover
            conn: MongoClient = MongoClient(self.uri, **self.mongoclient_kwargs)
            db = conn[self.database]
            self._coll = db[self.collection_name]  # type: ignore

`name: str` `property` ¶

Return a string representing this data source.

`init(uri, collection_name, database=None, ssh_tunnel=None, mongoclient_kwargs=None, default_sort=None, **kwargs)` ¶

Parameters:

Name	Type	Description	Default
`uri`	`str`	MongoDB+SRV URI	required
`database`	`Optional[str]`	database to connect to	`None`
`collection_name`	`str`	The collection name	required
`default_sort`	`Optional[dict[str, Union[Sort, int]]]`	Default sort field and direction to use when querying. Can be used to ensure determinacy in query results.	`None`

Source code in src/maggma/stores/mongolike.py

def __init__(
    self,
    uri: str,
    collection_name: str,
    database: Optional[str] = None,
    ssh_tunnel: Optional[SSHTunnel] = None,
    mongoclient_kwargs: Optional[dict] = None,
    default_sort: Optional[dict[str, Union[Sort, int]]] = None,
    **kwargs,
):
    """
    Args:
        uri: MongoDB+SRV URI
        database: database to connect to
        collection_name: The collection name
        default_sort: Default sort field and direction to use when querying. Can be used to
            ensure determinacy in query results.
    """
    self.uri = uri
    self.ssh_tunnel = ssh_tunnel
    self.default_sort = default_sort
    self.mongoclient_kwargs = mongoclient_kwargs or {}

    # parse the dbname from the uri
    if database is None:
        d_uri = uri_parser.parse_uri(uri)
        if d_uri["database"] is None:
            raise ConfigurationError("If database name is not supplied, a database must be set in the uri")
        self.database = d_uri["database"]
    else:
        self.database = database

    self.collection_name = collection_name
    self.kwargs = kwargs
    self._coll = None
    super(MongoStore, self).__init__(**kwargs)  # lgtm

`connect(force_reset=False)` ¶

Connect to the source data.

Parameters:

Name	Type	Description	Default
`force_reset`	`bool`	whether to reset the connection or not when the Store is already connected.	`False`

Source code in src/maggma/stores/mongolike.py

def connect(self, force_reset: bool = False):
    """
    Connect to the source data.

    Args:
        force_reset: whether to reset the connection or not when the Store is
            already connected.
    """
    if self._coll is None or force_reset:  # pragma: no cover
        conn: MongoClient = MongoClient(self.uri, **self.mongoclient_kwargs)
        db = conn[self.database]
        self._coll = db[self.collection_name]  # type: ignore

`MontyStore` ¶

Bases: MemoryStore

A MongoDB compatible store that uses on disk files for storage.

This is handled under the hood using MontyDB. A number of on-disk storage options are available but MontyDB provides a mongo style interface for all options. The options include:

sqlite: Uses an sqlite database to store documents.
lightning: Uses Lightning Memory-Mapped Database (LMDB) for storage. This can provide fast read and write times but requires lmdb to be installed (in most cases this can be achieved using pip install lmdb).
flatfile: Uses a system of flat json files. This is not recommended as multiple simultaneous connections to the store will not work correctly.

Note that MontyDB (and, therefore, MontyStore) will write out a new database to the disk but cannot be used to read an existing (e.g. SQLite) database that wasn't formatted by MontyDB.

See the MontyDB repository for more information: https://github.com/davidlatwe/montydb

Source code in src/maggma/stores/mongolike.py

@requires(
    MontyClient is not None,
    "MontyStore requires MontyDB to be installed. See the MontyDB repository for more "
    "information: https://github.com/davidlatwe/montydb",
)
class MontyStore(MemoryStore):
    """
    A MongoDB compatible store that uses on disk files for storage.

    This is handled under the hood using MontyDB. A number of on-disk storage options
    are available but MontyDB provides a mongo style interface for all options. The
    options include:

    - sqlite: Uses an sqlite database to store documents.
    - lightning: Uses Lightning Memory-Mapped Database (LMDB) for storage. This can
      provide fast read and write times but requires lmdb to be installed (in most cases
      this can be achieved using ``pip install lmdb``).
    - flatfile: Uses a system of flat json files. This is not recommended as multiple
      simultaneous connections to the store will not work correctly.

    Note that MontyDB (and, therefore, MontyStore) will write out a new database to
    the disk but cannot be used to read an existing (e.g. SQLite) database that wasn't
    formatted by MontyDB.

    See the MontyDB repository for more information: https://github.com/davidlatwe/montydb
    """

    def __init__(
        self,
        collection_name,
        database_path: Optional[str] = None,
        database_name: str = "db",
        storage: Literal["sqlite", "flatfile", "lightning"] = "sqlite",
        storage_kwargs: Optional[dict] = None,
        client_kwargs: Optional[dict] = None,
        **kwargs,
    ):
        """
        Initializes the Monty Store.

        Args:
            collection_name: Name for the collection.
            database_path: Path to on-disk database files. If None, the current working
                directory will be used.
            database_name: The database name.
            storage: The storage type. Options include "sqlite", "lightning", "flatfile". Note that
            although MontyDB supports in memory storage, this capability is disabled in maggma to avoid unintended
            behavior, since multiple in-memory MontyStore would actually point to the same data.
            storage_kwargs: Keyword arguments passed to ``montydb.set_storage``.
            client_kwargs: Keyword arguments passed to the ``montydb.MontyClient``
                constructor.
            **kwargs: Additional keyword arguments passed to the Store constructor.
        """
        if database_path is None:
            database_path = str(Path.cwd())

        self.database_path = database_path
        self.database_name = database_name
        self.collection_name = collection_name
        self._coll = None  # type: ignore
        self.default_sort = None
        self.ssh_tunnel = None  # This is to fix issues with the tunnel on close
        self.kwargs = kwargs
        self.storage = storage
        self.storage_kwargs = storage_kwargs or {
            "use_bson": True,  # import pymongo's BSON; do not use montydb's
            "mongo_version": "4.0",
        }
        self.client_kwargs = client_kwargs or {}
        super(MongoStore, self).__init__(**kwargs)

    def connect(self, force_reset: bool = False):
        """
        Connect to the database store.

        Args:
            force_reset: whether to reset the connection or not when the Store is
                already connected.
        """
        if not self._coll or force_reset:
            # TODO - workaround, may be obviated by a future montydb update
            if self.database_path != ":memory:":
                set_storage(self.database_path, storage=self.storage, **self.storage_kwargs)
            client = MontyClient(self.database_path, **self.client_kwargs)
            self._coll = client[self.database_name][self.collection_name]

    @property
    def name(self) -> str:
        """Return a string representing this data source."""
        return f"monty://{self.database_path}/{self.database_name}/{self.collection_name}"

    def count(
        self,
        criteria: Optional[dict] = None,
        hint: Optional[dict[str, Union[Sort, int]]] = None,
    ) -> int:
        """
        Counts the number of documents matching the query criteria.

        Args:
            criteria: PyMongo filter for documents to count in
            hint: Dictionary of indexes to use as hints for query optimizer.
                Keys are field names and values are 1 for ascending or -1 for descending.
        """
        criteria = criteria if criteria else {}

        hint_list = (
            [(k, Sort(v).value) if isinstance(v, int) else (k, v.value) for k, v in hint.items()] if hint else None
        )

        if hint_list is not None:  # pragma: no cover
            return self._collection.count_documents(filter=criteria, hint=hint_list)

        return self._collection.count_documents(filter=criteria)

    def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None):
        """
        Update documents into the Store.

        Args:
            docs: The document or list of documents to update.
            key: Field name(s) to determine uniqueness for a document, can be a list of
                multiple fields, a single field, or None if the Store's key field is to be
                used.
        """
        if not isinstance(docs, list):
            docs = [docs]

        for d in docs:
            d = jsanitize(d, allow_bson=True)

            # document-level validation is optional
            validates = True
            if self.validator:
                validates = self.validator.is_valid(d)
                if not validates:
                    if self.validator.strict:
                        raise ValueError(self.validator.validation_errors(d))
                    self.logger.error(self.validator.validation_errors(d))

            if validates:
                key = key or self.key
                search_doc = {k: d[k] for k in key} if isinstance(key, list) else {key: d[key]}

                self._collection.replace_one(search_doc, d, upsert=True)

`name: str` `property` ¶

Return a string representing this data source.

`init(collection_name, database_path=None, database_name='db', storage='sqlite', storage_kwargs=None, client_kwargs=None, **kwargs)` ¶

Initializes the Monty Store.

Parameters:

Name	Type	Description	Default
`collection_name`		Name for the collection.	required
`database_path`	`Optional[str]`	Path to on-disk database files. If None, the current working directory will be used.	`None`
`database_name`	`str`	The database name.	`'db'`
`storage`	`Literal['sqlite', 'flatfile', 'lightning']`	The storage type. Options include "sqlite", "lightning", "flatfile". Note that	`'sqlite'`
`storage_kwargs`	`Optional[dict]`	Keyword arguments passed to `montydb.set_storage`.	`None`
`client_kwargs`	`Optional[dict]`	Keyword arguments passed to the `montydb.MontyClient` constructor.	`None`
`**kwargs`		Additional keyword arguments passed to the Store constructor.	`{}`

Source code in src/maggma/stores/mongolike.py

def __init__(
    self,
    collection_name,
    database_path: Optional[str] = None,
    database_name: str = "db",
    storage: Literal["sqlite", "flatfile", "lightning"] = "sqlite",
    storage_kwargs: Optional[dict] = None,
    client_kwargs: Optional[dict] = None,
    **kwargs,
):
    """
    Initializes the Monty Store.

    Args:
        collection_name: Name for the collection.
        database_path: Path to on-disk database files. If None, the current working
            directory will be used.
        database_name: The database name.
        storage: The storage type. Options include "sqlite", "lightning", "flatfile". Note that
        although MontyDB supports in memory storage, this capability is disabled in maggma to avoid unintended
        behavior, since multiple in-memory MontyStore would actually point to the same data.
        storage_kwargs: Keyword arguments passed to ``montydb.set_storage``.
        client_kwargs: Keyword arguments passed to the ``montydb.MontyClient``
            constructor.
        **kwargs: Additional keyword arguments passed to the Store constructor.
    """
    if database_path is None:
        database_path = str(Path.cwd())

    self.database_path = database_path
    self.database_name = database_name
    self.collection_name = collection_name
    self._coll = None  # type: ignore
    self.default_sort = None
    self.ssh_tunnel = None  # This is to fix issues with the tunnel on close
    self.kwargs = kwargs
    self.storage = storage
    self.storage_kwargs = storage_kwargs or {
        "use_bson": True,  # import pymongo's BSON; do not use montydb's
        "mongo_version": "4.0",
    }
    self.client_kwargs = client_kwargs or {}
    super(MongoStore, self).__init__(**kwargs)

`connect(force_reset=False)` ¶

Connect to the database store.

Parameters:

Name	Type	Description	Default
`force_reset`	`bool`	whether to reset the connection or not when the Store is already connected.	`False`

Source code in src/maggma/stores/mongolike.py

def connect(self, force_reset: bool = False):
    """
    Connect to the database store.

    Args:
        force_reset: whether to reset the connection or not when the Store is
            already connected.
    """
    if not self._coll or force_reset:
        # TODO - workaround, may be obviated by a future montydb update
        if self.database_path != ":memory:":
            set_storage(self.database_path, storage=self.storage, **self.storage_kwargs)
        client = MontyClient(self.database_path, **self.client_kwargs)
        self._coll = client[self.database_name][self.collection_name]

`count(criteria=None, hint=None)` ¶

Counts the number of documents matching the query criteria.

Parameters:

Name	Type	Description	Default
`criteria`	`Optional[dict]`	PyMongo filter for documents to count in	`None`
`hint`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of indexes to use as hints for query optimizer. Keys are field names and values are 1 for ascending or -1 for descending.	`None`

Source code in src/maggma/stores/mongolike.py

def count(
    self,
    criteria: Optional[dict] = None,
    hint: Optional[dict[str, Union[Sort, int]]] = None,
) -> int:
    """
    Counts the number of documents matching the query criteria.

    Args:
        criteria: PyMongo filter for documents to count in
        hint: Dictionary of indexes to use as hints for query optimizer.
            Keys are field names and values are 1 for ascending or -1 for descending.
    """
    criteria = criteria if criteria else {}

    hint_list = (
        [(k, Sort(v).value) if isinstance(v, int) else (k, v.value) for k, v in hint.items()] if hint else None
    )

    if hint_list is not None:  # pragma: no cover
        return self._collection.count_documents(filter=criteria, hint=hint_list)

    return self._collection.count_documents(filter=criteria)

`update(docs, key=None)` ¶

Update documents into the Store.

Parameters:

Name	Type	Description	Default
`docs`	`Union[list[dict], dict]`	The document or list of documents to update.	required
`key`	`Union[list, str, None]`	Field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used.	`None`

Source code in src/maggma/stores/mongolike.py

def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None):
    """
    Update documents into the Store.

    Args:
        docs: The document or list of documents to update.
        key: Field name(s) to determine uniqueness for a document, can be a list of
            multiple fields, a single field, or None if the Store's key field is to be
            used.
    """
    if not isinstance(docs, list):
        docs = [docs]

    for d in docs:
        d = jsanitize(d, allow_bson=True)

        # document-level validation is optional
        validates = True
        if self.validator:
            validates = self.validator.is_valid(d)
            if not validates:
                if self.validator.strict:
                    raise ValueError(self.validator.validation_errors(d))
                self.logger.error(self.validator.validation_errors(d))

        if validates:
            key = key or self.key
            search_doc = {k: d[k] for k in key} if isinstance(key, list) else {key: d[key]}

            self._collection.replace_one(search_doc, d, upsert=True)

Module defining a FileStore that enables accessing files in a local directory using typical maggma access patterns.

`FileStore` ¶

Bases: MemoryStore

A Store for files on disk. Provides a common access method consistent with other stores. Each Item in the Store represents one file. Files can be organized into any type of directory structure.

A hash of the full path to each file is used to define a file_id that uniquely identifies each item.

Any metadata added to the items is written to a .json file in the root directory of the FileStore.

Source code in src/maggma/stores/file_store.py

class FileStore(MemoryStore):
    """
    A Store for files on disk. Provides a common access method consistent with
    other stores. Each Item in the Store represents one file. Files can be organized
    into any type of directory structure.

    A hash of the full path to each file is used to define a file_id that uniquely
    identifies each item.

    Any metadata added to the items is written to a .json file in the root directory
    of the FileStore.
    """

    def __init__(
        self,
        path: Union[str, Path],
        file_filters: Optional[list] = None,
        max_depth: Optional[int] = None,
        read_only: bool = True,
        include_orphans: bool = False,
        json_name: str = "FileStore.json",
        encoding: Optional[str] = None,
        **kwargs,
    ):
        """
        Initializes a FileStore.

        Args:
            path: parent directory containing all files and subdirectories to process
            file_filters: List of fnmatch patterns defining the files to be tracked by
                the FileStore. Only files that match one of the patterns  provided will
                be included in the Store If None (default), all files are included.

                Examples: ["*.txt", "test-[abcd].txt"], etc.
                See https://docs.python.org/3/library/fnmatch.html for full syntax
            max_depth: The maximum depth to look into subdirectories. 0 = no recursion,
                1 = include files 1 directory below the FileStore, etc.
                None (default) will scan all files below
                the FileStore root directory, regardless of depth.
            read_only: If True (default), the .update() and .remove_docs()
                methods are disabled, preventing any changes to the files on
                disk. In addition, metadata cannot be written to disk.
            include_orphans: Whether to include orphaned metadata records in query results.
                Orphaned metadata records are records found in the local JSON file that can
                no longer be associated to a file on disk. This can happen if a file is renamed
                or deleted, or if the FileStore is re-initialized with a more restrictive
                file_filters or max_depth argument. By default (False), these records
                do not appear in query results. Nevertheless, the metadata records are
                retained in the JSON file and the FileStore to prevent accidental data loss.
            json_name: Name of the .json file to which metadata is saved. If read_only
                is False, this file will be created in the root directory of the
                FileStore.
            encoding: Character encoding of files to be tracked by the store. The default
                (None) follows python's default behavior, which is to determine the character
                encoding from the platform. This should work in the great majority of cases.
                However, if you encounter a UnicodeDecodeError, consider setting the encoding
                explicitly to 'utf8' or another encoding as appropriate.
            kwargs: kwargs passed to MemoryStore.__init__()
        """
        # this conditional block is needed in order to guarantee that the 'name'
        # property, which is passed to `MemoryStore`, works correctly
        # collection names passed to MemoryStore cannot end with '.'
        if path == ".":
            path = Path.cwd()
        self.path = Path(path) if isinstance(path, str) else path

        self.json_name = json_name
        file_filters = file_filters if file_filters else ["*"]
        self.file_filters = re.compile("|".join(fnmatch.translate(p) for p in file_filters))
        self.collection_name = "file_store"
        self.key = "file_id"
        self.include_orphans = include_orphans
        self.read_only = read_only
        self.max_depth = max_depth
        self.encoding = encoding

        self.metadata_store = JSONStore(
            paths=[str(self.path / self.json_name)],
            read_only=self.read_only,
            collection_name=self.collection_name,
            key=self.key,
        )

        self.kwargs = kwargs

        super().__init__(
            collection_name=self.collection_name,
            key=self.key,
            **self.kwargs,
        )

    @property
    def name(self) -> str:
        """
        Return a string representing this data source.
        """
        return f"file://{self.path}"

    def add_metadata(
        self,
        metadata: Optional[dict] = None,
        query: Optional[dict] = None,
        auto_data: Optional[Callable[[dict], dict]] = None,
        **kwargs,
    ):
        """
        Add metadata to a record in the FileStore, either manually or by computing it automatically
        from another field, such as name or path (see auto_data).

        Args:
            metadata: dict of additional data to add to the records returned by query.
                      Note that any protected keys (such as 'name', 'path', etc.)
                      will be ignored.
            query: Query passed to FileStore.query()
            auto_data: A function that automatically computes metadata based on a field in
                    the record itself. The function must take in the item as a dict and
                    return a dict containing the desired metadata. A typical use case is
                    to assign metadata based on the name of a file. For example, for
                    data files named like `2022-04-01_april_fool_experiment.txt`, the
                    auto_data function could be:

                    def get_metadata_from_filename(d):
                        return {"date": d["name"].split("_")[0],
                                "test_name": d["name"].split("_")[1]
                                }

                    Note that in the case of conflict between manual and automatically
                    computed metadata (for example, if metadata={"name": "another_name"} was
                    supplied alongside the auto_data function above), the manually-supplied
                    metadata is used.
            kwargs: kwargs passed to FileStore.query()
        """
        if metadata is None:
            metadata = {}
        # sanitize the metadata
        filtered_metadata = self._filter_data(metadata)
        updated_docs = []

        for doc in self.query(query, **kwargs):
            if auto_data:
                extra_data = self._filter_data(auto_data(doc))
                doc.update(extra_data)
            doc.update(filtered_metadata)
            updated_docs.append(doc)

        self.update(updated_docs, key=self.key)

    def read(self) -> list[dict]:
        """
        Iterate through all files in the Store folder and populate
        the Store with dictionaries containing basic information about each file.

        The keys of the documents added to the Store are:

        - name: str = File name
        - path: Path = Absolute path of this file
        - parent: str = Name of the parent directory (if any)
        - file_id: str = Unique identifier for this file, computed from the hash
                    of its path relative to the base FileStore directory and
                    the file creation time. The key of this field is 'file_id'
                    by default but can be changed via the 'key' kwarg to
                    `FileStore.__init__()`.
        - size: int = Size of this file in bytes
        - last_updated: datetime = Time this file was last modified
        - hash: str = Hash of the file contents
        - orphan: bool = Whether this record is an orphan
        """
        file_list = []
        # generate a list of files in subdirectories
        for root, _dirs, files in os.walk(self.path):
            # for pattern in self.file_filters:
            for match in filter(self.file_filters.match, files):
                # for match in fnmatch.filter(files, pattern):
                path = Path(os.path.join(root, match))
                # ignore the .json file created by the Store
                if path.is_file() and path.name != self.json_name:
                    # filter based on depth
                    depth = len(path.relative_to(self.path).parts) - 1
                    if self.max_depth is None or depth <= self.max_depth:
                        file_list.append(self._create_record_from_file(path))

        return file_list

    def _create_record_from_file(self, f: Path) -> dict:
        """
        Given the path to a file, return a Dict that constitutes a record of
        basic information about that file. The keys in the returned dict
        are:

        - name: str = File name
        - path: Path = Absolute path of this file
        - parent: str = Name of the parent directory (if any)
        - file_id: str = Unique identifier for this file, computed from the hash
                    of its path relative to the base FileStore directory and
                    the file creation time. The key of this field is 'file_id'
                    by default but can be changed via the 'key' kwarg to
                    FileStore.__init__().
        - size: int = Size of this file in bytes
        - last_updated: datetime = Time this file was last modified
        - hash: str = Hash of the file contents
        - orphan: bool = Whether this record is an orphan
        """
        # compute the file_id from the relative path
        relative_path = f.relative_to(self.path)
        digest = hashlib.md5()
        digest.update(str(relative_path).encode())
        file_id = str(digest.hexdigest())

        # hash the file contents
        digest2 = hashlib.md5()
        b = bytearray(128 * 2056)
        mv = memoryview(b)
        digest2.update(self.name.encode())
        with open(f.as_posix(), "rb", buffering=0) as file:
            # this block copied from the file_digest method in python 3.11+
            # see https://github.com/python/cpython/blob/0ba07b2108d4763273f3fb85544dde34c5acd40a/Lib/hashlib.py#L213
            if hasattr(file, "getbuffer"):
                # io.BytesIO object, use zero-copy buffer
                digest2.update(file.getbuffer())
            else:
                for n in iter(lambda: file.readinto(mv), 0):
                    digest2.update(mv[:n])

        content_hash = str(digest2.hexdigest())
        stats = f.stat()

        return {
            "name": f.name,
            "path": f,
            "path_relative": relative_path,
            "parent": f.parent.name,
            "size": stats.st_size,
            "last_updated": datetime.fromtimestamp(stats.st_mtime, tz=timezone.utc),
            "orphan": False,
            "hash": content_hash,
            self.key: file_id,
        }

    def connect(self, force_reset: bool = False):
        """
        Connect to the source data.

        Read all the files in the directory, create corresponding File
        items in the internal MemoryStore.

        If there is a metadata .json file in the directory, read its
        contents into the MemoryStore

        Args:
            force_reset: whether to reset the connection or not when the Store is
                already connected.
        """
        # read all files and place them in the MemoryStore
        # use super.update to bypass the read_only guard statement
        # because we want the file data to be populated in memory
        super().connect(force_reset=force_reset)
        super().update(self.read())

        # now read any metadata from the .json file
        try:
            self.metadata_store.connect(force_reset=force_reset)
            metadata = list(self.metadata_store.query())
        except FileNotFoundError:
            metadata = []
            warnings.warn(
                f"""
                JSON file '{self.json_name}' not found. To create this file automatically, re-initialize
                the FileStore with read_only=False.
                """
            )

        # merge metadata with file data and check for orphaned metadata
        requests = []
        found_orphans = False
        key = self.key
        file_ids = self.distinct(self.key)
        for d in metadata:
            search_doc = {k: d[k] for k in key} if isinstance(key, list) else {key: d[key]}

            if d[key] not in file_ids:
                found_orphans = True
                d.update({"orphan": True})

            del d["_id"]

            requests.append(UpdateOne(search_doc, {"$set": d}, upsert=True))

        if found_orphans:
            warnings.warn(
                f"Orphaned metadata was found in {self.json_name}. This metadata"
                "will be added to the store with {'orphan': True}"
            )
        if len(requests) > 0:
            self._collection.bulk_write(requests, ordered=False)

    def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None):
        """
        Update items in the Store. Only possible if the store is not read only. Any new
        fields that are added will be written to the JSON file in the root directory
        of the FileStore.

        Note that certain fields that come from file metadata on disk are protected and
        cannot be updated with this method. This prevents the contents of the FileStore
        from becoming out of sync with the files on which it is based. The protected fields
        are keys in the dict returned by _create_record_from_file, e.g. 'name', 'parent',
        'path', 'last_updated', 'hash', 'size', 'contents', and 'orphan'. The 'path_relative' and key fields are
        retained to make each document in the JSON file identifiable by manual inspection.

        Args:
            docs: the document or list of documents to update
            key: field name(s) to determine uniqueness for a
                 document, can be a list of multiple fields,
                 a single field, or None if the Store's key
                 field is to be used
        """
        if self.read_only:
            raise StoreError(
                "This Store is read-only. To enable file I/O, re-initialize the store with read_only=False."
            )

        super().update(docs, key)
        data = list(self.query())
        filtered_data = []
        # remove fields that are populated by .read()
        for d in data:
            filtered_d = self._filter_data(d)
            # don't write records that contain only file_id
            if len(set(filtered_d.keys()).difference({"path_relative", self.key})) != 0:
                filtered_data.append(filtered_d)
        self.metadata_store.update(filtered_data, self.key)

    def _filter_data(self, d):
        """
        Remove any protected keys from a dictionary.

        Args:
            d: Dictionary whose keys are to be filtered
        """
        return {k: v for k, v in d.items() if k not in PROTECTED_KEYS.union({self.last_updated_field})}

    def query(  # type: ignore
        self,
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        hint: Optional[dict[str, Union[Sort, int]]] = None,
        skip: int = 0,
        limit: int = 0,
        contents_size_limit: Optional[int] = 0,
    ) -> Iterator[dict]:
        """
        Queries the Store for a set of documents.

        Args:
            criteria: PyMongo filter for documents to search in
            properties: properties to return in grouped documents
            sort: Dictionary of sort order for fields. Keys are field names and
                values are 1 for ascending or -1 for descending.
            hint: Dictionary of indexes to use as hints for query optimizer.
                Keys are field names and values are 1 for ascending or -1 for descending.
            skip: number documents to skip
            limit: limit on total number of documents returned
            contents_size_limit: Maximum file size in bytes for which to return contents.
                The FileStore will attempt to read the file and populate the 'contents' key
                with its content at query time, unless the file size is larger than this value.
                By default, reading content is disabled. Note that enabling content reading
                can substantially slow down the query operation, especially when there
                are large numbers of files.
        """
        return_contents = False
        criteria = criteria if criteria else {}
        if criteria.get("orphan", None) is None and not self.include_orphans:
            criteria.update({"orphan": False})

        if criteria.get("contents"):
            warnings.warn("'contents' is not a queryable field! Ignoring.")

        if isinstance(properties, list):
            properties = {p: 1 for p in properties}

        orig_properties = properties.copy() if properties else None

        if properties is None:
            # None means return all fields, including contents
            return_contents = True
        elif properties.get("contents"):
            return_contents = True
            # remove contents b/c it isn't stored in the MemoryStore
            properties.pop("contents")
            # add size and path to query so that file can be read
            properties.update({"size": 1})
            properties.update({"path": 1})

        for d in super().query(
            criteria=criteria,
            properties=properties,
            sort=sort,
            hint=hint,
            skip=skip,
            limit=limit,
        ):
            # add file contents to the returned documents, if appropriate
            if return_contents and not d.get("orphan"):
                if contents_size_limit is None or d["size"] <= contents_size_limit:
                    # attempt to read the file contents and inject into the document
                    # TODO - could add more logic for detecting different file types
                    # and more nuanced exception handling
                    try:
                        with zopen(d["path"], "r", encoding=self.encoding) as f:
                            data = f.read()
                    except Exception as e:
                        data = f"Unable to read: {e}"

                elif d["size"] > contents_size_limit:
                    data = f"File exceeds size limit of {contents_size_limit} bytes"
                else:
                    data = "Unable to read: Unknown error"

                d.update({"contents": data})

                # remove size and path if not explicitly requested
                if orig_properties is not None and "size" not in orig_properties:
                    d.pop("size")
                if orig_properties is not None and "path" not in orig_properties:
                    d.pop("path")

            yield d

    def query_one(
        self,
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        contents_size_limit: Optional[int] = None,
    ):
        """
        Queries the Store for a single document.

        Args:
            criteria: PyMongo filter for documents to search
            properties: properties to return in the document
            sort: Dictionary of sort order for fields. Keys are field names and
                values are 1 for ascending or -1 for descending.
            contents_size_limit: Maximum file size in bytes for which to return contents.
                The FileStore will attempt to read the file and populate the 'contents' key
                with its content at query time, unless the file size is larger than this value.
        """
        return next(
            self.query(
                criteria=criteria,
                properties=properties,
                sort=sort,
                contents_size_limit=contents_size_limit,
            ),
            None,
        )

    def remove_docs(self, criteria: dict, confirm: bool = False):
        """
        Remove items matching the query dictionary.

        Args:
            criteria: query dictionary to match
            confirm: Boolean flag to confirm that remove_docs should delete
                     files on disk. Default: False.
        """
        if self.read_only:
            raise StoreError(
                "This Store is read-only. To enable file I/O, re-initialize the store with read_only=False."
            )

        docs = list(self.query(criteria))
        # this ensures that any modifications to criteria made by self.query
        # (e.g., related to orphans or contents) are propagated through to the superclass
        new_criteria = {"file_id": {"$in": [d["file_id"] for d in docs]}}

        if len(docs) > 0 and not confirm:
            raise StoreError(
                f"Warning! This command is about to delete {len(docs)} items from disk! "
                "If this is what you want, reissue this command with confirm=True."
            )

        for d in docs:
            Path(d["path"]).unlink()
            super().remove_docs(criteria=new_criteria)

`name: str` `property` ¶

Return a string representing this data source.

`init(path, file_filters=None, max_depth=None, read_only=True, include_orphans=False, json_name='FileStore.json', encoding=None, **kwargs)` ¶

Initializes a FileStore.

Parameters:

Name	Type	Description	Default
`path`	`Union[str, Path]`	parent directory containing all files and subdirectories to process	required
`file_filters`	`Optional[list]`	List of fnmatch patterns defining the files to be tracked by the FileStore. Only files that match one of the patterns provided will be included in the Store If None (default), all files are included. Examples: ["*.txt", "test-[abcd].txt"], etc. See https://docs.python.org/3/library/fnmatch.html for full syntax	`None`
`max_depth`	`Optional[int]`	The maximum depth to look into subdirectories. 0 = no recursion, 1 = include files 1 directory below the FileStore, etc. None (default) will scan all files below the FileStore root directory, regardless of depth.	`None`
`read_only`	`bool`	If True (default), the .update() and .remove_docs() methods are disabled, preventing any changes to the files on disk. In addition, metadata cannot be written to disk.	`True`
`include_orphans`	`bool`	Whether to include orphaned metadata records in query results. Orphaned metadata records are records found in the local JSON file that can no longer be associated to a file on disk. This can happen if a file is renamed or deleted, or if the FileStore is re-initialized with a more restrictive file_filters or max_depth argument. By default (False), these records do not appear in query results. Nevertheless, the metadata records are retained in the JSON file and the FileStore to prevent accidental data loss.	`False`
`json_name`	`str`	Name of the .json file to which metadata is saved. If read_only is False, this file will be created in the root directory of the FileStore.	`'FileStore.json'`
`encoding`	`Optional[str]`	Character encoding of files to be tracked by the store. The default (None) follows python's default behavior, which is to determine the character encoding from the platform. This should work in the great majority of cases. However, if you encounter a UnicodeDecodeError, consider setting the encoding explicitly to 'utf8' or another encoding as appropriate.	`None`
`kwargs`		kwargs passed to MemoryStore.init()	`{}`

Source code in src/maggma/stores/file_store.py

def __init__(
    self,
    path: Union[str, Path],
    file_filters: Optional[list] = None,
    max_depth: Optional[int] = None,
    read_only: bool = True,
    include_orphans: bool = False,
    json_name: str = "FileStore.json",
    encoding: Optional[str] = None,
    **kwargs,
):
    """
    Initializes a FileStore.

    Args:
        path: parent directory containing all files and subdirectories to process
        file_filters: List of fnmatch patterns defining the files to be tracked by
            the FileStore. Only files that match one of the patterns  provided will
            be included in the Store If None (default), all files are included.

            Examples: ["*.txt", "test-[abcd].txt"], etc.
            See https://docs.python.org/3/library/fnmatch.html for full syntax
        max_depth: The maximum depth to look into subdirectories. 0 = no recursion,
            1 = include files 1 directory below the FileStore, etc.
            None (default) will scan all files below
            the FileStore root directory, regardless of depth.
        read_only: If True (default), the .update() and .remove_docs()
            methods are disabled, preventing any changes to the files on
            disk. In addition, metadata cannot be written to disk.
        include_orphans: Whether to include orphaned metadata records in query results.
            Orphaned metadata records are records found in the local JSON file that can
            no longer be associated to a file on disk. This can happen if a file is renamed
            or deleted, or if the FileStore is re-initialized with a more restrictive
            file_filters or max_depth argument. By default (False), these records
            do not appear in query results. Nevertheless, the metadata records are
            retained in the JSON file and the FileStore to prevent accidental data loss.
        json_name: Name of the .json file to which metadata is saved. If read_only
            is False, this file will be created in the root directory of the
            FileStore.
        encoding: Character encoding of files to be tracked by the store. The default
            (None) follows python's default behavior, which is to determine the character
            encoding from the platform. This should work in the great majority of cases.
            However, if you encounter a UnicodeDecodeError, consider setting the encoding
            explicitly to 'utf8' or another encoding as appropriate.
        kwargs: kwargs passed to MemoryStore.__init__()
    """
    # this conditional block is needed in order to guarantee that the 'name'
    # property, which is passed to `MemoryStore`, works correctly
    # collection names passed to MemoryStore cannot end with '.'
    if path == ".":
        path = Path.cwd()
    self.path = Path(path) if isinstance(path, str) else path

    self.json_name = json_name
    file_filters = file_filters if file_filters else ["*"]
    self.file_filters = re.compile("|".join(fnmatch.translate(p) for p in file_filters))
    self.collection_name = "file_store"
    self.key = "file_id"
    self.include_orphans = include_orphans
    self.read_only = read_only
    self.max_depth = max_depth
    self.encoding = encoding

    self.metadata_store = JSONStore(
        paths=[str(self.path / self.json_name)],
        read_only=self.read_only,
        collection_name=self.collection_name,
        key=self.key,
    )

    self.kwargs = kwargs

    super().__init__(
        collection_name=self.collection_name,
        key=self.key,
        **self.kwargs,
    )

`add_metadata(metadata=None, query=None, auto_data=None, **kwargs)` ¶

Add metadata to a record in the FileStore, either manually or by computing it automatically from another field, such as name or path (see auto_data).

Parameters:

Name	Type	Description	Default
`metadata`	`Optional[dict]`	dict of additional data to add to the records returned by query. Note that any protected keys (such as 'name', 'path', etc.) will be ignored.	`None`
`query`	`Optional[dict]`	Query passed to FileStore.query()	`None`
`auto_data`	`Optional[Callable[[dict], dict]]`	A function that automatically computes metadata based on a field in the record itself. The function must take in the item as a dict and return a dict containing the desired metadata. A typical use case is to assign metadata based on the name of a file. For example, for data files named like `2022-04-01_april_fool_experiment.txt`, the auto_data function could be: `def get_metadata_from_filename(d): return {"date": d["name"].split("_")[0], "test_name": d["name"].split("_")[1] } Note that in the case of conflict between manual and automatically computed metadata (for example, if metadata={"name": "another_name"} was supplied alongside the auto_data function above), the manually-supplied metadata is used.`	`None`
`kwargs`		kwargs passed to FileStore.query()	`{}`

Source code in src/maggma/stores/file_store.py

def add_metadata(
    self,
    metadata: Optional[dict] = None,
    query: Optional[dict] = None,
    auto_data: Optional[Callable[[dict], dict]] = None,
    **kwargs,
):
    """
    Add metadata to a record in the FileStore, either manually or by computing it automatically
    from another field, such as name or path (see auto_data).

    Args:
        metadata: dict of additional data to add to the records returned by query.
                  Note that any protected keys (such as 'name', 'path', etc.)
                  will be ignored.
        query: Query passed to FileStore.query()
        auto_data: A function that automatically computes metadata based on a field in
                the record itself. The function must take in the item as a dict and
                return a dict containing the desired metadata. A typical use case is
                to assign metadata based on the name of a file. For example, for
                data files named like `2022-04-01_april_fool_experiment.txt`, the
                auto_data function could be:

                def get_metadata_from_filename(d):
                    return {"date": d["name"].split("_")[0],
                            "test_name": d["name"].split("_")[1]
                            }

                Note that in the case of conflict between manual and automatically
                computed metadata (for example, if metadata={"name": "another_name"} was
                supplied alongside the auto_data function above), the manually-supplied
                metadata is used.
        kwargs: kwargs passed to FileStore.query()
    """
    if metadata is None:
        metadata = {}
    # sanitize the metadata
    filtered_metadata = self._filter_data(metadata)
    updated_docs = []

    for doc in self.query(query, **kwargs):
        if auto_data:
            extra_data = self._filter_data(auto_data(doc))
            doc.update(extra_data)
        doc.update(filtered_metadata)
        updated_docs.append(doc)

    self.update(updated_docs, key=self.key)

`connect(force_reset=False)` ¶

Connect to the source data.

Read all the files in the directory, create corresponding File items in the internal MemoryStore.

If there is a metadata .json file in the directory, read its contents into the MemoryStore

Parameters:

Name	Type	Description	Default
`force_reset`	`bool`	whether to reset the connection or not when the Store is already connected.	`False`

Source code in src/maggma/stores/file_store.py

def connect(self, force_reset: bool = False):
    """
    Connect to the source data.

    Read all the files in the directory, create corresponding File
    items in the internal MemoryStore.

    If there is a metadata .json file in the directory, read its
    contents into the MemoryStore

    Args:
        force_reset: whether to reset the connection or not when the Store is
            already connected.
    """
    # read all files and place them in the MemoryStore
    # use super.update to bypass the read_only guard statement
    # because we want the file data to be populated in memory
    super().connect(force_reset=force_reset)
    super().update(self.read())

    # now read any metadata from the .json file
    try:
        self.metadata_store.connect(force_reset=force_reset)
        metadata = list(self.metadata_store.query())
    except FileNotFoundError:
        metadata = []
        warnings.warn(
            f"""
            JSON file '{self.json_name}' not found. To create this file automatically, re-initialize
            the FileStore with read_only=False.
            """
        )

    # merge metadata with file data and check for orphaned metadata
    requests = []
    found_orphans = False
    key = self.key
    file_ids = self.distinct(self.key)
    for d in metadata:
        search_doc = {k: d[k] for k in key} if isinstance(key, list) else {key: d[key]}

        if d[key] not in file_ids:
            found_orphans = True
            d.update({"orphan": True})

        del d["_id"]

        requests.append(UpdateOne(search_doc, {"$set": d}, upsert=True))

    if found_orphans:
        warnings.warn(
            f"Orphaned metadata was found in {self.json_name}. This metadata"
            "will be added to the store with {'orphan': True}"
        )
    if len(requests) > 0:
        self._collection.bulk_write(requests, ordered=False)

`query(criteria=None, properties=None, sort=None, hint=None, skip=0, limit=0, contents_size_limit=0)` ¶

Queries the Store for a set of documents.

Parameters:

Name	Type	Description	Default
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`
`properties`	`Union[dict, list, None]`	properties to return in grouped documents	`None`
`sort`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending.	`None`
`hint`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of indexes to use as hints for query optimizer. Keys are field names and values are 1 for ascending or -1 for descending.	`None`
`skip`	`int`	number documents to skip	`0`
`limit`	`int`	limit on total number of documents returned	`0`
`contents_size_limit`	`Optional[int]`	Maximum file size in bytes for which to return contents. The FileStore will attempt to read the file and populate the 'contents' key with its content at query time, unless the file size is larger than this value. By default, reading content is disabled. Note that enabling content reading can substantially slow down the query operation, especially when there are large numbers of files.	`0`

Source code in src/maggma/stores/file_store.py

def query(  # type: ignore
    self,
    criteria: Optional[dict] = None,
    properties: Union[dict, list, None] = None,
    sort: Optional[dict[str, Union[Sort, int]]] = None,
    hint: Optional[dict[str, Union[Sort, int]]] = None,
    skip: int = 0,
    limit: int = 0,
    contents_size_limit: Optional[int] = 0,
) -> Iterator[dict]:
    """
    Queries the Store for a set of documents.

    Args:
        criteria: PyMongo filter for documents to search in
        properties: properties to return in grouped documents
        sort: Dictionary of sort order for fields. Keys are field names and
            values are 1 for ascending or -1 for descending.
        hint: Dictionary of indexes to use as hints for query optimizer.
            Keys are field names and values are 1 for ascending or -1 for descending.
        skip: number documents to skip
        limit: limit on total number of documents returned
        contents_size_limit: Maximum file size in bytes for which to return contents.
            The FileStore will attempt to read the file and populate the 'contents' key
            with its content at query time, unless the file size is larger than this value.
            By default, reading content is disabled. Note that enabling content reading
            can substantially slow down the query operation, especially when there
            are large numbers of files.
    """
    return_contents = False
    criteria = criteria if criteria else {}
    if criteria.get("orphan", None) is None and not self.include_orphans:
        criteria.update({"orphan": False})

    if criteria.get("contents"):
        warnings.warn("'contents' is not a queryable field! Ignoring.")

    if isinstance(properties, list):
        properties = {p: 1 for p in properties}

    orig_properties = properties.copy() if properties else None

    if properties is None:
        # None means return all fields, including contents
        return_contents = True
    elif properties.get("contents"):
        return_contents = True
        # remove contents b/c it isn't stored in the MemoryStore
        properties.pop("contents")
        # add size and path to query so that file can be read
        properties.update({"size": 1})
        properties.update({"path": 1})

    for d in super().query(
        criteria=criteria,
        properties=properties,
        sort=sort,
        hint=hint,
        skip=skip,
        limit=limit,
    ):
        # add file contents to the returned documents, if appropriate
        if return_contents and not d.get("orphan"):
            if contents_size_limit is None or d["size"] <= contents_size_limit:
                # attempt to read the file contents and inject into the document
                # TODO - could add more logic for detecting different file types
                # and more nuanced exception handling
                try:
                    with zopen(d["path"], "r", encoding=self.encoding) as f:
                        data = f.read()
                except Exception as e:
                    data = f"Unable to read: {e}"

            elif d["size"] > contents_size_limit:
                data = f"File exceeds size limit of {contents_size_limit} bytes"
            else:
                data = "Unable to read: Unknown error"

            d.update({"contents": data})

            # remove size and path if not explicitly requested
            if orig_properties is not None and "size" not in orig_properties:
                d.pop("size")
            if orig_properties is not None and "path" not in orig_properties:
                d.pop("path")

        yield d

`query_one(criteria=None, properties=None, sort=None, contents_size_limit=None)` ¶

Queries the Store for a single document.

Parameters:

Name	Type	Description	Default
`criteria`	`Optional[dict]`	PyMongo filter for documents to search	`None`
`properties`	`Union[dict, list, None]`	properties to return in the document	`None`
`sort`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending.	`None`
`contents_size_limit`	`Optional[int]`	Maximum file size in bytes for which to return contents. The FileStore will attempt to read the file and populate the 'contents' key with its content at query time, unless the file size is larger than this value.	`None`

Source code in src/maggma/stores/file_store.py

def query_one(
    self,
    criteria: Optional[dict] = None,
    properties: Union[dict, list, None] = None,
    sort: Optional[dict[str, Union[Sort, int]]] = None,
    contents_size_limit: Optional[int] = None,
):
    """
    Queries the Store for a single document.

    Args:
        criteria: PyMongo filter for documents to search
        properties: properties to return in the document
        sort: Dictionary of sort order for fields. Keys are field names and
            values are 1 for ascending or -1 for descending.
        contents_size_limit: Maximum file size in bytes for which to return contents.
            The FileStore will attempt to read the file and populate the 'contents' key
            with its content at query time, unless the file size is larger than this value.
    """
    return next(
        self.query(
            criteria=criteria,
            properties=properties,
            sort=sort,
            contents_size_limit=contents_size_limit,
        ),
        None,
    )

`read()` ¶

Iterate through all files in the Store folder and populate the Store with dictionaries containing basic information about each file.

The keys of the documents added to the Store are:

name: str = File name
path: Path = Absolute path of this file
parent: str = Name of the parent directory (if any)
file_id: str = Unique identifier for this file, computed from the hash of its path relative to the base FileStore directory and the file creation time. The key of this field is 'file_id' by default but can be changed via the 'key' kwarg to FileStore.__init__().
size: int = Size of this file in bytes
last_updated: datetime = Time this file was last modified
hash: str = Hash of the file contents
orphan: bool = Whether this record is an orphan

Source code in src/maggma/stores/file_store.py

def read(self) -> list[dict]:
    """
    Iterate through all files in the Store folder and populate
    the Store with dictionaries containing basic information about each file.

    The keys of the documents added to the Store are:

    - name: str = File name
    - path: Path = Absolute path of this file
    - parent: str = Name of the parent directory (if any)
    - file_id: str = Unique identifier for this file, computed from the hash
                of its path relative to the base FileStore directory and
                the file creation time. The key of this field is 'file_id'
                by default but can be changed via the 'key' kwarg to
                `FileStore.__init__()`.
    - size: int = Size of this file in bytes
    - last_updated: datetime = Time this file was last modified
    - hash: str = Hash of the file contents
    - orphan: bool = Whether this record is an orphan
    """
    file_list = []
    # generate a list of files in subdirectories
    for root, _dirs, files in os.walk(self.path):
        # for pattern in self.file_filters:
        for match in filter(self.file_filters.match, files):
            # for match in fnmatch.filter(files, pattern):
            path = Path(os.path.join(root, match))
            # ignore the .json file created by the Store
            if path.is_file() and path.name != self.json_name:
                # filter based on depth
                depth = len(path.relative_to(self.path).parts) - 1
                if self.max_depth is None or depth <= self.max_depth:
                    file_list.append(self._create_record_from_file(path))

    return file_list

`remove_docs(criteria, confirm=False)` ¶

Remove items matching the query dictionary.

Parameters:

Name	Type	Description	Default
`criteria`	`dict`	query dictionary to match	required
`confirm`	`bool`	Boolean flag to confirm that remove_docs should delete files on disk. Default: False.	`False`

Source code in src/maggma/stores/file_store.py

def remove_docs(self, criteria: dict, confirm: bool = False):
    """
    Remove items matching the query dictionary.

    Args:
        criteria: query dictionary to match
        confirm: Boolean flag to confirm that remove_docs should delete
                 files on disk. Default: False.
    """
    if self.read_only:
        raise StoreError(
            "This Store is read-only. To enable file I/O, re-initialize the store with read_only=False."
        )

    docs = list(self.query(criteria))
    # this ensures that any modifications to criteria made by self.query
    # (e.g., related to orphans or contents) are propagated through to the superclass
    new_criteria = {"file_id": {"$in": [d["file_id"] for d in docs]}}

    if len(docs) > 0 and not confirm:
        raise StoreError(
            f"Warning! This command is about to delete {len(docs)} items from disk! "
            "If this is what you want, reissue this command with confirm=True."
        )

    for d in docs:
        Path(d["path"]).unlink()
        super().remove_docs(criteria=new_criteria)

`update(docs, key=None)` ¶

Update items in the Store. Only possible if the store is not read only. Any new fields that are added will be written to the JSON file in the root directory of the FileStore.

Note that certain fields that come from file metadata on disk are protected and cannot be updated with this method. This prevents the contents of the FileStore from becoming out of sync with the files on which it is based. The protected fields are keys in the dict returned by _create_record_from_file, e.g. 'name', 'parent', 'path', 'last_updated', 'hash', 'size', 'contents', and 'orphan'. The 'path_relative' and key fields are retained to make each document in the JSON file identifiable by manual inspection.

Parameters:

Name	Type	Description	Default
`docs`	`Union[list[dict], dict]`	the document or list of documents to update	required
`key`	`Union[list, str, None]`	field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used	`None`

Source code in src/maggma/stores/file_store.py

def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None):
    """
    Update items in the Store. Only possible if the store is not read only. Any new
    fields that are added will be written to the JSON file in the root directory
    of the FileStore.

    Note that certain fields that come from file metadata on disk are protected and
    cannot be updated with this method. This prevents the contents of the FileStore
    from becoming out of sync with the files on which it is based. The protected fields
    are keys in the dict returned by _create_record_from_file, e.g. 'name', 'parent',
    'path', 'last_updated', 'hash', 'size', 'contents', and 'orphan'. The 'path_relative' and key fields are
    retained to make each document in the JSON file identifiable by manual inspection.

    Args:
        docs: the document or list of documents to update
        key: field name(s) to determine uniqueness for a
             document, can be a list of multiple fields,
             a single field, or None if the Store's key
             field is to be used
    """
    if self.read_only:
        raise StoreError(
            "This Store is read-only. To enable file I/O, re-initialize the store with read_only=False."
        )

    super().update(docs, key)
    data = list(self.query())
    filtered_data = []
    # remove fields that are populated by .read()
    for d in data:
        filtered_d = self._filter_data(d)
        # don't write records that contain only file_id
        if len(set(filtered_d.keys()).difference({"path_relative", self.key})) != 0:
            filtered_data.append(filtered_d)
    self.metadata_store.update(filtered_data, self.key)

Module containing various definitions of Stores. Stores are a default access pattern to data and provide various utilities.

`GridFSStore` ¶

Bases: Store

A Store for GridFS backend. Provides a common access method consistent with other stores.

Source code in src/maggma/stores/gridfs.py

class GridFSStore(Store):
    """
    A Store for GridFS backend. Provides a common access method consistent with other stores.
    """

    def __init__(
        self,
        database: str,
        collection_name: str,
        host: str = "localhost",
        port: int = 27017,
        username: str = "",
        password: str = "",
        compression: bool = False,
        ensure_metadata: bool = False,
        searchable_fields: Optional[list[str]] = None,
        auth_source: Optional[str] = None,
        mongoclient_kwargs: Optional[dict] = None,
        ssh_tunnel: Optional[SSHTunnel] = None,
        **kwargs,
    ):
        """
        Initializes a GridFS Store for binary data
        Args:
            database: database name
            collection_name: The name of the collection.
                This is the string portion before the GridFS extensions
            host: hostname for the database
            port: port to connect to
            username: username to connect as
            password: password to authenticate as
            compression: compress the data as it goes into GridFS
            ensure_metadata: ensure returned documents have the metadata fields
            searchable_fields: fields to keep in the index store
            auth_source: The database to authenticate on. Defaults to the database name.
            ssh_tunnel: An SSHTunnel object to use.
        """
        self.database = database
        self.collection_name = collection_name
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self._coll: Any = None
        self.compression = compression
        self.ensure_metadata = ensure_metadata
        self.searchable_fields = [] if searchable_fields is None else searchable_fields
        self.kwargs = kwargs
        self.ssh_tunnel = ssh_tunnel

        if auth_source is None:
            auth_source = self.database
        self.auth_source = auth_source
        self.mongoclient_kwargs = mongoclient_kwargs or {}

        if "key" not in kwargs:
            kwargs["key"] = "_id"
        super().__init__(**kwargs)

    @classmethod
    def from_launchpad_file(cls, lp_file, collection_name, **kwargs):
        """
        Convenience method to construct a GridFSStore from a launchpad file.

        Note: A launchpad file is a special formatted yaml file used in fireworks

        Returns:
        """
        with open(lp_file) as f:
            yaml = YAML(typ="safe", pure=True)
            lp_creds = yaml.load(f.read())

        db_creds = lp_creds.copy()
        db_creds["database"] = db_creds["name"]
        for key in list(db_creds.keys()):
            if key not in ["database", "host", "port", "username", "password"]:
                db_creds.pop(key)
        db_creds["collection_name"] = collection_name

        return cls(**db_creds, **kwargs)

    @property
    def name(self) -> str:
        """
        Return a string representing this data source.
        """
        return f"gridfs://{self.host}/{self.database}/{self.collection_name}"

    def connect(self, force_reset: bool = False):
        """
        Connect to the source data.

        Args:
            force_reset: whether to reset the connection or not when the Store is
                already connected.
        """
        if not self._coll or force_reset:
            if self.ssh_tunnel is None:
                host = self.host
                port = self.port
            else:
                self.ssh_tunnel.start()
                host, port = self.ssh_tunnel.local_address

            conn: MongoClient = (
                MongoClient(
                    host=host,
                    port=port,
                    username=self.username,
                    password=self.password,
                    authSource=self.auth_source,
                    **self.mongoclient_kwargs,
                )
                if self.username != ""
                else MongoClient(host, port, **self.mongoclient_kwargs)
            )
            db = conn[self.database]
            self._coll = gridfs.GridFS(db, self.collection_name)
            self._files_collection = db[f"{self.collection_name}.files"]
            self._files_store = MongoStore.from_collection(self._files_collection)
            self._files_store.last_updated_field = f"metadata.{self.last_updated_field}"
            self._files_store.key = self.key
            self._chunks_collection = db[f"{self.collection_name}.chunks"]

    @property
    def _collection(self):
        """Property referring to underlying pymongo collection."""
        if self._coll is None:
            raise StoreError("Must connect Mongo-like store before attempting to use it")
        return self._coll

    @property
    def last_updated(self) -> datetime:
        """
        Provides the most recent last_updated date time stamp from
        the documents in this Store.
        """
        return self._files_store.last_updated

    @classmethod
    def transform_criteria(cls, criteria: dict) -> dict:
        """
        Allow client to not need to prepend 'metadata.' to query fields.

        Args:
            criteria: Query criteria
        """
        new_criteria = dict()
        for field in criteria:
            if field not in files_collection_fields and not field.startswith("metadata."):
                new_criteria["metadata." + field] = copy.copy(criteria[field])
            else:
                new_criteria[field] = copy.copy(criteria[field])

        return new_criteria

    def count(self, criteria: Optional[dict] = None) -> int:
        """
        Counts the number of documents matching the query criteria.

        Args:
            criteria: PyMongo filter for documents to count in
        """
        if isinstance(criteria, dict):
            criteria = self.transform_criteria(criteria)

        return self._files_store.count(criteria)

    def query(
        self,
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        skip: int = 0,
        limit: int = 0,
    ) -> Iterator[dict]:
        """
        Queries the GridFS Store for a set of documents.
        Will check to see if data can be returned from
        files store first.
        If the data from the gridfs is not a json serialized string
        a dict will be returned with the data in the "data" key
        plus the self.key and self.last_updated_field.

        Args:
            criteria: PyMongo filter for documents to search in
            properties: properties to return in grouped documents
            sort: Dictionary of sort order for fields. Keys are field names and
                values are 1 for ascending or -1 for descending.
            skip: number documents to skip
            limit: limit on total number of documents returned
        """
        if isinstance(criteria, dict):
            criteria = self.transform_criteria(criteria)
        elif criteria is not None:
            raise ValueError("Criteria must be a dictionary or None")

        prop_keys = set()
        if isinstance(properties, dict):
            prop_keys = set(properties.keys())
        elif isinstance(properties, list):
            prop_keys = set(properties)

        for doc in self._files_store.query(criteria=criteria, sort=sort, limit=limit, skip=skip):
            if properties is not None and prop_keys.issubset(set(doc.keys())):
                yield {p: doc[p] for p in properties if p in doc}
            else:
                metadata = doc.get("metadata", {})

                data = self._collection.find_one(
                    filter={"_id": doc["_id"]},
                    skip=skip,
                    limit=limit,
                    sort=sort,
                ).read()

                if metadata.get("compression", "") == "zlib":
                    data = zlib.decompress(data).decode("UTF-8")

                try:
                    data = json.loads(data)
                except Exception:
                    if not isinstance(data, dict):
                        data = {
                            "data": data,
                            self.key: doc.get(self.key),
                            self.last_updated_field: doc.get(self.last_updated_field),
                        }

                if self.ensure_metadata and isinstance(data, dict):
                    data.update(metadata)

                yield data

    def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list:
        """
        Get all distinct values for a field. This function only operates
        on the metadata in the files collection.

        Args:
            field: the field(s) to get distinct values for
            criteria: PyMongo filter for documents to search in
        """
        criteria = self.transform_criteria(criteria) if isinstance(criteria, dict) else criteria

        field = (
            f"metadata.{field}" if field not in files_collection_fields and not field.startswith("metadata.") else field
        )

        return self._files_store.distinct(field=field, criteria=criteria)

    def groupby(
        self,
        keys: Union[list[str], str],
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        skip: int = 0,
        limit: int = 0,
    ) -> Iterator[tuple[dict, list[dict]]]:
        """
        Simple grouping function that will group documents
        by keys. Will only work if the keys are included in the files
        collection for GridFS.

        Args:
            keys: fields to group documents
            criteria: PyMongo filter for documents to search in
            properties: properties to return in grouped documents
            sort: Dictionary of sort order for fields. Keys are field names and
                values are 1 for ascending or -1 for descending.
            skip: number documents to skip
            limit: limit on total number of documents returned

        Returns:
            generator returning tuples of (dict, list of docs)
        """
        criteria = self.transform_criteria(criteria) if isinstance(criteria, dict) else criteria
        keys = [keys] if not isinstance(keys, list) else keys
        keys = [
            f"metadata.{k}" if k not in files_collection_fields and not k.startswith("metadata.") else k for k in keys
        ]
        for group, ids in self._files_store.groupby(keys, criteria=criteria, properties=[f"metadata.{self.key}"]):
            ids = [get(doc, f"metadata.{self.key}") for doc in ids if has(doc, f"metadata.{self.key}")]

            group = {k.replace("metadata.", ""): get(group, k) for k in keys if has(group, k)}

            yield group, list(self.query(criteria={self.key: {"$in": ids}}))

    def ensure_index(self, key: str, unique: Optional[bool] = False) -> bool:
        """
        Tries to create an index and return true if it succeeded
        Currently operators on the GridFS files collection
        Args:
            key: single key to index
            unique: Whether or not this index contains only unique keys.

        Returns:
            bool indicating if the index exists/was created
        """
        # Transform key for gridfs first
        if key not in files_collection_fields:
            files_col_key = f"metadata.{key}"
            return self._files_store.ensure_index(files_col_key, unique=unique)
        return self._files_store.ensure_index(key, unique=unique)

    def update(
        self,
        docs: Union[list[dict], dict],
        key: Union[list, str, None] = None,
        additional_metadata: Union[str, list[str], None] = None,
    ):
        """
        Update documents into the Store.

        Args:
            docs: the document or list of documents to update
            key: field name(s) to determine uniqueness for a
                 document, can be a list of multiple fields,
                 a single field, or None if the Store's key
                 field is to be used
            additional_metadata: field(s) to include in the gridfs metadata
        """
        if not isinstance(docs, list):
            docs = [docs]

        if isinstance(key, str):
            key = [key]
        elif not key:
            key = [self.key]

        key = list(set(key) - set(files_collection_fields))

        if additional_metadata is None:
            additional_metadata = []
        elif isinstance(additional_metadata, str):
            additional_metadata = [additional_metadata]
        else:
            additional_metadata = list(additional_metadata)

        for d in docs:
            search_doc = {k: d[k] for k in key}

            metadata = {
                k: get(d, k)
                for k in [self.last_updated_field, *additional_metadata, *self.searchable_fields]
                if has(d, k)
            }
            metadata.update(search_doc)
            data = json.dumps(jsanitize(d, recursive_msonable=True)).encode("UTF-8")
            if self.compression:
                data = zlib.compress(data)
                metadata["compression"] = "zlib"

            self._collection.put(data, metadata=metadata)
            search_doc = self.transform_criteria(search_doc)

            # Cleans up old gridfs entries
            for fdoc in self._files_collection.find(search_doc, ["_id"]).sort("uploadDate", -1).skip(1):
                self._collection.delete(fdoc["_id"])

    def remove_docs(self, criteria: dict):
        """
        Remove docs matching the query dictionary.

        Args:
            criteria: query dictionary to match
        """
        if isinstance(criteria, dict):
            criteria = self.transform_criteria(criteria)
        ids = [cursor._id for cursor in self._collection.find(criteria)]

        for _id in ids:
            self._collection.delete(_id)

    def close(self):
        self._files_store.close()
        self._coll = None
        if self.ssh_tunnel is not None:
            self.ssh_tunnel.stop()

    def __eq__(self, other: object) -> bool:
        """
        Check equality for GridFSStore
        other: other GridFSStore to compare with.
        """
        if not isinstance(other, GridFSStore):
            return False

        fields = ["database", "collection_name", "host", "port"]
        return all(getattr(self, f) == getattr(other, f) for f in fields)

`last_updated: datetime` `property` ¶

Provides the most recent last_updated date time stamp from the documents in this Store.

`name: str` `property` ¶

Return a string representing this data source.

`eq(other)` ¶

Check equality for GridFSStore other: other GridFSStore to compare with.

Source code in src/maggma/stores/gridfs.py

def __eq__(self, other: object) -> bool:
    """
    Check equality for GridFSStore
    other: other GridFSStore to compare with.
    """
    if not isinstance(other, GridFSStore):
        return False

    fields = ["database", "collection_name", "host", "port"]
    return all(getattr(self, f) == getattr(other, f) for f in fields)

`init(database, collection_name, host='localhost', port=27017, username='', password='', compression=False, ensure_metadata=False, searchable_fields=None, auth_source=None, mongoclient_kwargs=None, ssh_tunnel=None, **kwargs)` ¶

Initializes a GridFS Store for binary data Args: database: database name collection_name: The name of the collection. This is the string portion before the GridFS extensions host: hostname for the database port: port to connect to username: username to connect as password: password to authenticate as compression: compress the data as it goes into GridFS ensure_metadata: ensure returned documents have the metadata fields searchable_fields: fields to keep in the index store auth_source: The database to authenticate on. Defaults to the database name. ssh_tunnel: An SSHTunnel object to use.

Source code in src/maggma/stores/gridfs.py

def __init__(
    self,
    database: str,
    collection_name: str,
    host: str = "localhost",
    port: int = 27017,
    username: str = "",
    password: str = "",
    compression: bool = False,
    ensure_metadata: bool = False,
    searchable_fields: Optional[list[str]] = None,
    auth_source: Optional[str] = None,
    mongoclient_kwargs: Optional[dict] = None,
    ssh_tunnel: Optional[SSHTunnel] = None,
    **kwargs,
):
    """
    Initializes a GridFS Store for binary data
    Args:
        database: database name
        collection_name: The name of the collection.
            This is the string portion before the GridFS extensions
        host: hostname for the database
        port: port to connect to
        username: username to connect as
        password: password to authenticate as
        compression: compress the data as it goes into GridFS
        ensure_metadata: ensure returned documents have the metadata fields
        searchable_fields: fields to keep in the index store
        auth_source: The database to authenticate on. Defaults to the database name.
        ssh_tunnel: An SSHTunnel object to use.
    """
    self.database = database
    self.collection_name = collection_name
    self.host = host
    self.port = port
    self.username = username
    self.password = password
    self._coll: Any = None
    self.compression = compression
    self.ensure_metadata = ensure_metadata
    self.searchable_fields = [] if searchable_fields is None else searchable_fields
    self.kwargs = kwargs
    self.ssh_tunnel = ssh_tunnel

    if auth_source is None:
        auth_source = self.database
    self.auth_source = auth_source
    self.mongoclient_kwargs = mongoclient_kwargs or {}

    if "key" not in kwargs:
        kwargs["key"] = "_id"
    super().__init__(**kwargs)

`connect(force_reset=False)` ¶

Connect to the source data.

Parameters:

Name	Type	Description	Default
`force_reset`	`bool`	whether to reset the connection or not when the Store is already connected.	`False`

Source code in src/maggma/stores/gridfs.py

def connect(self, force_reset: bool = False):
    """
    Connect to the source data.

    Args:
        force_reset: whether to reset the connection or not when the Store is
            already connected.
    """
    if not self._coll or force_reset:
        if self.ssh_tunnel is None:
            host = self.host
            port = self.port
        else:
            self.ssh_tunnel.start()
            host, port = self.ssh_tunnel.local_address

        conn: MongoClient = (
            MongoClient(
                host=host,
                port=port,
                username=self.username,
                password=self.password,
                authSource=self.auth_source,
                **self.mongoclient_kwargs,
            )
            if self.username != ""
            else MongoClient(host, port, **self.mongoclient_kwargs)
        )
        db = conn[self.database]
        self._coll = gridfs.GridFS(db, self.collection_name)
        self._files_collection = db[f"{self.collection_name}.files"]
        self._files_store = MongoStore.from_collection(self._files_collection)
        self._files_store.last_updated_field = f"metadata.{self.last_updated_field}"
        self._files_store.key = self.key
        self._chunks_collection = db[f"{self.collection_name}.chunks"]

`count(criteria=None)` ¶

Counts the number of documents matching the query criteria.

Parameters:

Name	Type	Description	Default
`criteria`	`Optional[dict]`	PyMongo filter for documents to count in	`None`

Source code in src/maggma/stores/gridfs.py

def count(self, criteria: Optional[dict] = None) -> int:
    """
    Counts the number of documents matching the query criteria.

    Args:
        criteria: PyMongo filter for documents to count in
    """
    if isinstance(criteria, dict):
        criteria = self.transform_criteria(criteria)

    return self._files_store.count(criteria)

`distinct(field, criteria=None, all_exist=False)` ¶

Get all distinct values for a field. This function only operates on the metadata in the files collection.

Parameters:

Name	Type	Description	Default
`field`	`str`	the field(s) to get distinct values for	required
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`

Source code in src/maggma/stores/gridfs.py

def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list:
    """
    Get all distinct values for a field. This function only operates
    on the metadata in the files collection.

    Args:
        field: the field(s) to get distinct values for
        criteria: PyMongo filter for documents to search in
    """
    criteria = self.transform_criteria(criteria) if isinstance(criteria, dict) else criteria

    field = (
        f"metadata.{field}" if field not in files_collection_fields and not field.startswith("metadata.") else field
    )

    return self._files_store.distinct(field=field, criteria=criteria)

`ensure_index(key, unique=False)` ¶

Tries to create an index and return true if it succeeded Currently operators on the GridFS files collection Args: key: single key to index unique: Whether or not this index contains only unique keys.

Returns:

Type	Description
`bool`	bool indicating if the index exists/was created

Source code in src/maggma/stores/gridfs.py

def ensure_index(self, key: str, unique: Optional[bool] = False) -> bool:
    """
    Tries to create an index and return true if it succeeded
    Currently operators on the GridFS files collection
    Args:
        key: single key to index
        unique: Whether or not this index contains only unique keys.

    Returns:
        bool indicating if the index exists/was created
    """
    # Transform key for gridfs first
    if key not in files_collection_fields:
        files_col_key = f"metadata.{key}"
        return self._files_store.ensure_index(files_col_key, unique=unique)
    return self._files_store.ensure_index(key, unique=unique)

`from_launchpad_file(lp_file, collection_name, **kwargs)` `classmethod` ¶

Convenience method to construct a GridFSStore from a launchpad file.

Note: A launchpad file is a special formatted yaml file used in fireworks

Returns:

Source code in src/maggma/stores/gridfs.py

@classmethod
def from_launchpad_file(cls, lp_file, collection_name, **kwargs):
    """
    Convenience method to construct a GridFSStore from a launchpad file.

    Note: A launchpad file is a special formatted yaml file used in fireworks

    Returns:
    """
    with open(lp_file) as f:
        yaml = YAML(typ="safe", pure=True)
        lp_creds = yaml.load(f.read())

    db_creds = lp_creds.copy()
    db_creds["database"] = db_creds["name"]
    for key in list(db_creds.keys()):
        if key not in ["database", "host", "port", "username", "password"]:
            db_creds.pop(key)
    db_creds["collection_name"] = collection_name

    return cls(**db_creds, **kwargs)

`groupby(keys, criteria=None, properties=None, sort=None, skip=0, limit=0)` ¶

Simple grouping function that will group documents by keys. Will only work if the keys are included in the files collection for GridFS.

Parameters:

Name	Type	Description	Default
`keys`	`Union[list[str], str]`	fields to group documents	required
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`
`properties`	`Union[dict, list, None]`	properties to return in grouped documents	`None`
`sort`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending.	`None`
`skip`	`int`	number documents to skip	`0`
`limit`	`int`	limit on total number of documents returned	`0`

Returns:

Type	Description
`Iterator[tuple[dict, list[dict]]]`	generator returning tuples of (dict, list of docs)

Source code in src/maggma/stores/gridfs.py

def groupby(
    self,
    keys: Union[list[str], str],
    criteria: Optional[dict] = None,
    properties: Union[dict, list, None] = None,
    sort: Optional[dict[str, Union[Sort, int]]] = None,
    skip: int = 0,
    limit: int = 0,
) -> Iterator[tuple[dict, list[dict]]]:
    """
    Simple grouping function that will group documents
    by keys. Will only work if the keys are included in the files
    collection for GridFS.

    Args:
        keys: fields to group documents
        criteria: PyMongo filter for documents to search in
        properties: properties to return in grouped documents
        sort: Dictionary of sort order for fields. Keys are field names and
            values are 1 for ascending or -1 for descending.
        skip: number documents to skip
        limit: limit on total number of documents returned

    Returns:
        generator returning tuples of (dict, list of docs)
    """
    criteria = self.transform_criteria(criteria) if isinstance(criteria, dict) else criteria
    keys = [keys] if not isinstance(keys, list) else keys
    keys = [
        f"metadata.{k}" if k not in files_collection_fields and not k.startswith("metadata.") else k for k in keys
    ]
    for group, ids in self._files_store.groupby(keys, criteria=criteria, properties=[f"metadata.{self.key}"]):
        ids = [get(doc, f"metadata.{self.key}") for doc in ids if has(doc, f"metadata.{self.key}")]

        group = {k.replace("metadata.", ""): get(group, k) for k in keys if has(group, k)}

        yield group, list(self.query(criteria={self.key: {"$in": ids}}))

`query(criteria=None, properties=None, sort=None, skip=0, limit=0)` ¶

Queries the GridFS Store for a set of documents. Will check to see if data can be returned from files store first. If the data from the gridfs is not a json serialized string a dict will be returned with the data in the "data" key plus the self.key and self.last_updated_field.

Parameters:

Name	Type	Description	Default
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`
`properties`	`Union[dict, list, None]`	properties to return in grouped documents	`None`
`sort`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending.	`None`
`skip`	`int`	number documents to skip	`0`
`limit`	`int`	limit on total number of documents returned	`0`

Source code in src/maggma/stores/gridfs.py

def query(
    self,
    criteria: Optional[dict] = None,
    properties: Union[dict, list, None] = None,
    sort: Optional[dict[str, Union[Sort, int]]] = None,
    skip: int = 0,
    limit: int = 0,
) -> Iterator[dict]:
    """
    Queries the GridFS Store for a set of documents.
    Will check to see if data can be returned from
    files store first.
    If the data from the gridfs is not a json serialized string
    a dict will be returned with the data in the "data" key
    plus the self.key and self.last_updated_field.

    Args:
        criteria: PyMongo filter for documents to search in
        properties: properties to return in grouped documents
        sort: Dictionary of sort order for fields. Keys are field names and
            values are 1 for ascending or -1 for descending.
        skip: number documents to skip
        limit: limit on total number of documents returned
    """
    if isinstance(criteria, dict):
        criteria = self.transform_criteria(criteria)
    elif criteria is not None:
        raise ValueError("Criteria must be a dictionary or None")

    prop_keys = set()
    if isinstance(properties, dict):
        prop_keys = set(properties.keys())
    elif isinstance(properties, list):
        prop_keys = set(properties)

    for doc in self._files_store.query(criteria=criteria, sort=sort, limit=limit, skip=skip):
        if properties is not None and prop_keys.issubset(set(doc.keys())):
            yield {p: doc[p] for p in properties if p in doc}
        else:
            metadata = doc.get("metadata", {})

            data = self._collection.find_one(
                filter={"_id": doc["_id"]},
                skip=skip,
                limit=limit,
                sort=sort,
            ).read()

            if metadata.get("compression", "") == "zlib":
                data = zlib.decompress(data).decode("UTF-8")

            try:
                data = json.loads(data)
            except Exception:
                if not isinstance(data, dict):
                    data = {
                        "data": data,
                        self.key: doc.get(self.key),
                        self.last_updated_field: doc.get(self.last_updated_field),
                    }

            if self.ensure_metadata and isinstance(data, dict):
                data.update(metadata)

            yield data

`remove_docs(criteria)` ¶

Remove docs matching the query dictionary.

Parameters:

Name	Type	Description	Default
`criteria`	`dict`	query dictionary to match	required

Source code in src/maggma/stores/gridfs.py

def remove_docs(self, criteria: dict):
    """
    Remove docs matching the query dictionary.

    Args:
        criteria: query dictionary to match
    """
    if isinstance(criteria, dict):
        criteria = self.transform_criteria(criteria)
    ids = [cursor._id for cursor in self._collection.find(criteria)]

    for _id in ids:
        self._collection.delete(_id)

`transform_criteria(criteria)` `classmethod` ¶

Allow client to not need to prepend 'metadata.' to query fields.

Parameters:

Name	Type	Description	Default
`criteria`	`dict`	Query criteria	required

Source code in src/maggma/stores/gridfs.py

@classmethod
def transform_criteria(cls, criteria: dict) -> dict:
    """
    Allow client to not need to prepend 'metadata.' to query fields.

    Args:
        criteria: Query criteria
    """
    new_criteria = dict()
    for field in criteria:
        if field not in files_collection_fields and not field.startswith("metadata."):
            new_criteria["metadata." + field] = copy.copy(criteria[field])
        else:
            new_criteria[field] = copy.copy(criteria[field])

    return new_criteria

`update(docs, key=None, additional_metadata=None)` ¶

Update documents into the Store.

Parameters:

Name	Type	Description	Default
`docs`	`Union[list[dict], dict]`	the document or list of documents to update	required
`key`	`Union[list, str, None]`	field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used	`None`
`additional_metadata`	`Union[str, list[str], None]`	field(s) to include in the gridfs metadata	`None`

Source code in src/maggma/stores/gridfs.py

def update(
    self,
    docs: Union[list[dict], dict],
    key: Union[list, str, None] = None,
    additional_metadata: Union[str, list[str], None] = None,
):
    """
    Update documents into the Store.

    Args:
        docs: the document or list of documents to update
        key: field name(s) to determine uniqueness for a
             document, can be a list of multiple fields,
             a single field, or None if the Store's key
             field is to be used
        additional_metadata: field(s) to include in the gridfs metadata
    """
    if not isinstance(docs, list):
        docs = [docs]

    if isinstance(key, str):
        key = [key]
    elif not key:
        key = [self.key]

    key = list(set(key) - set(files_collection_fields))

    if additional_metadata is None:
        additional_metadata = []
    elif isinstance(additional_metadata, str):
        additional_metadata = [additional_metadata]
    else:
        additional_metadata = list(additional_metadata)

    for d in docs:
        search_doc = {k: d[k] for k in key}

        metadata = {
            k: get(d, k)
            for k in [self.last_updated_field, *additional_metadata, *self.searchable_fields]
            if has(d, k)
        }
        metadata.update(search_doc)
        data = json.dumps(jsanitize(d, recursive_msonable=True)).encode("UTF-8")
        if self.compression:
            data = zlib.compress(data)
            metadata["compression"] = "zlib"

        self._collection.put(data, metadata=metadata)
        search_doc = self.transform_criteria(search_doc)

        # Cleans up old gridfs entries
        for fdoc in self._files_collection.find(search_doc, ["_id"]).sort("uploadDate", -1).skip(1):
            self._collection.delete(fdoc["_id"])

`GridFSURIStore` ¶

Bases: GridFSStore

A Store for GridFS backend, with connection via a mongo URI string.

This is expected to be a special mongodb+srv:// URIs that include client parameters via TXT records

Source code in src/maggma/stores/gridfs.py

class GridFSURIStore(GridFSStore):
    """
    A Store for GridFS backend, with connection via a mongo URI string.

    This is expected to be a special mongodb+srv:// URIs that include client parameters
    via TXT records
    """

    def __init__(
        self,
        uri: str,
        collection_name: str,
        database: Optional[str] = None,
        compression: bool = False,
        ensure_metadata: bool = False,
        searchable_fields: Optional[list[str]] = None,
        mongoclient_kwargs: Optional[dict] = None,
        **kwargs,
    ):
        """
        Initializes a GridFS Store for binary data.

        Args:
            uri: MongoDB+SRV URI
            database: database to connect to
            collection_name: The collection name
            compression: compress the data as it goes into GridFS
            ensure_metadata: ensure returned documents have the metadata fields
            searchable_fields: fields to keep in the index store.
        """
        self.uri = uri

        # parse the dbname from the uri
        if database is None:
            d_uri = uri_parser.parse_uri(uri)
            if d_uri["database"] is None:
                raise ConfigurationError("If database name is not supplied, a database must be set in the uri")
            self.database = d_uri["database"]
        else:
            self.database = database

        self.collection_name = collection_name
        self._coll: Any = None
        self.compression = compression
        self.ensure_metadata = ensure_metadata
        self.searchable_fields = [] if searchable_fields is None else searchable_fields
        self.kwargs = kwargs
        self.mongoclient_kwargs = mongoclient_kwargs or {}

        if "key" not in kwargs:
            kwargs["key"] = "_id"
        super(GridFSStore, self).__init__(**kwargs)  # lgtm

    def connect(self, force_reset: bool = False):
        """
        Connect to the source data.

        Args:
            force_reset: whether to reset the connection or not when the Store is
                already connected.
        """
        if not self._coll or force_reset:  # pragma: no cover
            conn: MongoClient = MongoClient(self.uri, **self.mongoclient_kwargs)
            db = conn[self.database]
            self._coll = gridfs.GridFS(db, self.collection_name)
            self._files_collection = db[f"{self.collection_name}.files"]
            self._files_store = MongoStore.from_collection(self._files_collection)
            self._files_store.last_updated_field = f"metadata.{self.last_updated_field}"
            self._files_store.key = self.key
            self._chunks_collection = db[f"{self.collection_name}.chunks"]

`init(uri, collection_name, database=None, compression=False, ensure_metadata=False, searchable_fields=None, mongoclient_kwargs=None, **kwargs)` ¶

Initializes a GridFS Store for binary data.

Parameters:

Name	Type	Description	Default
`uri`	`str`	MongoDB+SRV URI	required
`database`	`Optional[str]`	database to connect to	`None`
`collection_name`	`str`	The collection name	required
`compression`	`bool`	compress the data as it goes into GridFS	`False`
`ensure_metadata`	`bool`	ensure returned documents have the metadata fields	`False`
`searchable_fields`	`Optional[list[str]]`	fields to keep in the index store.	`None`

Source code in src/maggma/stores/gridfs.py

def __init__(
    self,
    uri: str,
    collection_name: str,
    database: Optional[str] = None,
    compression: bool = False,
    ensure_metadata: bool = False,
    searchable_fields: Optional[list[str]] = None,
    mongoclient_kwargs: Optional[dict] = None,
    **kwargs,
):
    """
    Initializes a GridFS Store for binary data.

    Args:
        uri: MongoDB+SRV URI
        database: database to connect to
        collection_name: The collection name
        compression: compress the data as it goes into GridFS
        ensure_metadata: ensure returned documents have the metadata fields
        searchable_fields: fields to keep in the index store.
    """
    self.uri = uri

    # parse the dbname from the uri
    if database is None:
        d_uri = uri_parser.parse_uri(uri)
        if d_uri["database"] is None:
            raise ConfigurationError("If database name is not supplied, a database must be set in the uri")
        self.database = d_uri["database"]
    else:
        self.database = database

    self.collection_name = collection_name
    self._coll: Any = None
    self.compression = compression
    self.ensure_metadata = ensure_metadata
    self.searchable_fields = [] if searchable_fields is None else searchable_fields
    self.kwargs = kwargs
    self.mongoclient_kwargs = mongoclient_kwargs or {}

    if "key" not in kwargs:
        kwargs["key"] = "_id"
    super(GridFSStore, self).__init__(**kwargs)  # lgtm

`connect(force_reset=False)` ¶

Connect to the source data.

Parameters:

Name	Type	Description	Default
`force_reset`	`bool`	whether to reset the connection or not when the Store is already connected.	`False`

Source code in src/maggma/stores/gridfs.py

def connect(self, force_reset: bool = False):
    """
    Connect to the source data.

    Args:
        force_reset: whether to reset the connection or not when the Store is
            already connected.
    """
    if not self._coll or force_reset:  # pragma: no cover
        conn: MongoClient = MongoClient(self.uri, **self.mongoclient_kwargs)
        db = conn[self.database]
        self._coll = gridfs.GridFS(db, self.collection_name)
        self._files_collection = db[f"{self.collection_name}.files"]
        self._files_store = MongoStore.from_collection(self._files_collection)
        self._files_store.last_updated_field = f"metadata.{self.last_updated_field}"
        self._files_store.key = self.key
        self._chunks_collection = db[f"{self.collection_name}.chunks"]

Stores for connecting to AWS data.

`S3Store` ¶

Bases: Store

GridFS like storage using Amazon S3 and a regular store for indexing.

Assumes Amazon AWS key and secret key are set in environment or default config file.

Source code in src/maggma/stores/aws.py

class S3Store(Store):
    """
    GridFS like storage using Amazon S3 and a regular store for indexing.

    Assumes Amazon AWS key and secret key are set in environment or default config file.
    """

    def __init__(
        self,
        index: Store,
        bucket: str,
        s3_profile: Optional[Union[str, dict]] = None,
        compress: bool = False,
        endpoint_url: Optional[str] = None,
        sub_dir: Optional[str] = None,
        s3_workers: int = 1,
        s3_resource_kwargs: Optional[dict] = None,
        ssh_tunnel: Optional[SSHTunnel] = None,
        key: str = "fs_id",
        store_hash: bool = True,
        unpack_data: bool = True,
        searchable_fields: Optional[list[str]] = None,
        index_store_kwargs: Optional[dict] = None,
        **kwargs,
    ):
        """
        Initializes an S3 Store.

        Args:
            index: a store to use to index the S3 bucket.
            bucket: name of the bucket.
            s3_profile: name of AWS profile containing the credentials. Alternatively
                you can pass in a dictionary with the full credentials:
                    aws_access_key_id (string) -- AWS access key ID
                    aws_secret_access_key (string) -- AWS secret access key
                    aws_session_token (string) -- AWS temporary session token
                    region_name (string) -- Default region when creating new connections
            compress: compress files inserted into the store.
            endpoint_url: this allows the interface with minio service; ignored if
                `ssh_tunnel` is provided, in which case it is inferred.
            sub_dir: subdirectory of the S3 bucket to store the data.
            s3_workers: number of concurrent S3 puts to run.
            s3_resource_kwargs: additional kwargs to pass to the boto3 session resource.
            ssh_tunnel: optional SSH tunnel to use for the S3 connection.
            key: main key to index on.
            store_hash: store the SHA1 hash right before insertion to the database.
            unpack_data: whether to decompress and unpack byte data when querying from
                the bucket.
            searchable_fields: fields to keep in the index store.
            index_store_kwargs: kwargs to pass to the index store. Allows the user to
                use kwargs here to update the index store.
        """
        if boto3 is None:
            raise RuntimeError("boto3 and botocore are required for S3Store")
        self.index_store_kwargs = index_store_kwargs or {}
        if index_store_kwargs:
            d_ = index.as_dict()
            d_.update(index_store_kwargs)
            self.index = index.__class__.from_dict(d_)
        else:
            self.index = index
        self.bucket = bucket
        self.s3_profile = s3_profile
        self.compress = compress
        self.endpoint_url = endpoint_url
        self.sub_dir = sub_dir.strip("/") + "/" if sub_dir else ""
        self.s3: Any = None
        self.s3_bucket: Any = None
        self.s3_workers = s3_workers
        self.s3_resource_kwargs = s3_resource_kwargs if s3_resource_kwargs is not None else {}
        self.ssh_tunnel = ssh_tunnel
        self.unpack_data = unpack_data
        self.searchable_fields = searchable_fields if searchable_fields is not None else []
        self.store_hash = store_hash

        # Force the key to be the same as the index
        assert isinstance(index.key, str), "Since we are using the key as a file name in S3, they key must be a string"
        if key != index.key:
            warnings.warn(
                f'The desired S3Store key "{key}" does not match the index key "{index.key},"'
                "the index key will be used",
                UserWarning,
            )
        kwargs["key"] = str(index.key)

        self._thread_local = threading.local()
        super().__init__(**kwargs)

    @property
    def name(self) -> str:
        """String representing this data source."""
        return f"s3://{self.bucket}"

    def connect(self, force_reset: bool = False):  # lgtm[py/conflicting-attributes]
        """Connect to the source data.

        Args:
            force_reset: whether to force a reset of the connection
        """
        if self.s3 is None or force_reset:
            self.s3, self.s3_bucket = self._get_resource_and_bucket()
        self.index.connect(force_reset=force_reset)

    def close(self):
        """Closes any connections."""
        self.index.close()

        self.s3.meta.client.close()
        self.s3 = None
        self.s3_bucket = None

        if self.ssh_tunnel is not None:
            self.ssh_tunnel.stop()

    @property
    def _collection(self):
        """
        A handle to the pymongo collection object.

        Important:
            Not guaranteed to exist in the future.
        """
        # For now returns the index collection since that is what we would "search" on
        return self.index._collection

    def count(self, criteria: Optional[dict] = None) -> int:
        """
        Counts the number of documents matching the query criteria.

        Args:
            criteria: PyMongo filter for documents to count in.
        """
        return self.index.count(criteria)

    def query(
        self,
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        skip: int = 0,
        limit: int = 0,
    ) -> Iterator[dict]:
        """
        Queries the Store for a set of documents.

        Args:
            criteria: PyMongo filter for documents to search in.
            properties: properties to return in grouped documents.
            sort: Dictionary of sort order for fields. Keys are field names and values
                are 1 for ascending or -1 for descending.
            skip: number documents to skip.
            limit: limit on total number of documents returned.

        """
        prop_keys = set()
        if isinstance(properties, dict):
            prop_keys = set(properties.keys())
        elif isinstance(properties, list):
            prop_keys = set(properties)

        for doc in self.index.query(criteria=criteria, sort=sort, limit=limit, skip=skip):
            if properties is not None and prop_keys.issubset(set(doc.keys())):
                yield {p: doc[p] for p in properties if p in doc}
            else:
                try:
                    # TODO: This is ugly and unsafe, do some real checking before pulling data
                    data = self.s3_bucket.Object(self._get_full_key_path(doc[self.key])).get()["Body"].read()
                except botocore.exceptions.ClientError as e:
                    # If a client error is thrown, then check that it was a NoSuchKey or NoSuchBucket error.
                    # If it was a NoSuchKey error, then the object does not exist.
                    error_code = e.response["Error"]["Code"]
                    if error_code in ["NoSuchKey", "NoSuchBucket"]:
                        error_message = e.response["Error"]["Message"]
                        self.logger.error(
                            f"S3 returned '{error_message}' while querying '{self.bucket}' for '{doc[self.key]}'"
                        )
                        continue
                    else:
                        raise e

                if self.unpack_data:
                    data = self._read_data(data=data, compress_header=doc.get("compression", ""))

                    if self.last_updated_field in doc:
                        data[self.last_updated_field] = doc[self.last_updated_field]

                yield data

    def _read_data(self, data: bytes, compress_header: str) -> dict:
        """Reads the data and transforms it into a dictionary.
        Allows for subclasses to apply custom schemes for transforming
        the data retrieved from S3.

        Args:
            data (bytes): The raw byte representation of the data.
            compress_header (str): String representing the type of compression used on the data.

        Returns:
            Dict: Dictionary representation of the data.
        """
        return self._unpack(data=data, compressed=compress_header == "zlib")

    @staticmethod
    def _unpack(data: bytes, compressed: bool):
        if compressed:
            data = zlib.decompress(data)
        # requires msgpack-python to be installed to fix string encoding problem
        # https://github.com/msgpack/msgpack/issues/121
        # During recursion
        # msgpack.unpackb goes as deep as possible during reconstruction
        # MontyDecoder().process_decode only goes until it finds a from_dict
        # as such, we cannot just use msgpack.unpackb(data, object_hook=monty_object_hook, raw=False)
        # Should just return the unpacked object then let the user run process_decoded
        return msgpack.unpackb(data, raw=False)

    def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list:
        """
        Get all distinct values for a field.

        Args:
            field: the field(s) to get distinct values for.
            criteria: PyMongo filter for documents to search in.
        """
        # Index is a store so it should have its own distinct function
        return self.index.distinct(field, criteria=criteria)

    def groupby(
        self,
        keys: Union[list[str], str],
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        skip: int = 0,
        limit: int = 0,
    ) -> Iterator[tuple[dict, list[dict]]]:
        """
        Simple grouping function that will group documents by keys.

        Args:
            keys: fields to group documents.
            criteria: PyMongo filter for documents to search in.
            properties: properties to return in grouped documents.
            sort: Dictionary of sort order for fields. Keys are field names and values
            are 1 for ascending or -1 for descending.
            skip: number documents to skip.
            limit: limit on total number of documents returned.

        Returns:
            generator returning tuples of (dict, list of docs)
        """
        return self.index.groupby(
            keys=keys,
            criteria=criteria,
            properties=properties,
            sort=sort,
            skip=skip,
            limit=limit,
        )

    def ensure_index(self, key: str, unique: bool = False) -> bool:
        """
        Tries to create an index and return true if it succeeded.

        Args:
            key: single key to index.
            unique: whether this index contains only unique keys.

        Returns:
            bool indicating if the index exists/was created.
        """
        return self.index.ensure_index(key, unique=unique)

    def update(
        self,
        docs: Union[list[dict], dict],
        key: Union[list, str, None] = None,
        additional_metadata: Union[str, list[str], None] = None,
    ):
        """
        Update documents into the Store.

        Args:
            docs: the document or list of documents to update.
            key: field name(s) to determine uniqueness for a document, can be a list of
                multiple fields, a single field, or None if the Store's key field is to
                be used.
            additional_metadata: field(s) to include in the S3 store's metadata.
        """
        if not isinstance(docs, list):
            docs = [docs]

        if isinstance(key, str):
            key = [key]
        elif not key:
            key = [self.key]

        if additional_metadata is None:
            additional_metadata = []
        elif isinstance(additional_metadata, str):
            additional_metadata = [additional_metadata]
        else:
            additional_metadata = list(additional_metadata)

        self._write_to_s3_and_index(docs, key + additional_metadata + self.searchable_fields)

    def _write_to_s3_and_index(self, docs: list[dict], search_keys: list[str]):
        """Implements updating of the provided documents in S3 and the index.
        Allows for subclasses to apply custom approaches to parellizing the writing.

        Args:
            docs (List[Dict]): The documents to update
            search_keys (List[str]): The keys of the information to be updated in the index
        """
        with ThreadPoolExecutor(max_workers=self.s3_workers) as pool:
            fs = {
                pool.submit(
                    self.write_doc_to_s3,
                    doc=itr_doc,
                    search_keys=search_keys,
                )
                for itr_doc in docs
            }
            fs, _ = wait(fs)

            search_docs = [sdoc.result() for sdoc in fs]

        # Use store's update to remove key clashes
        self.index.update(search_docs, key=self.key)

    def _get_session(self):
        if self.ssh_tunnel is not None:
            self.ssh_tunnel.start()

        if not hasattr(self._thread_local, "s3_bucket"):
            if isinstance(self.s3_profile, dict):
                return Session(**self.s3_profile)
            return Session(profile_name=self.s3_profile)

        return None

    def _get_endpoint_url(self):
        if self.ssh_tunnel is None:
            return self.endpoint_url
        host, port = self.ssh_tunnel.local_address
        return f"http://{host}:{port}"

    def _get_bucket(self):
        """If on the main thread return the bucket created above, else create a new
        bucket on each thread.
        """
        if threading.current_thread().name == "MainThread":
            return self.s3_bucket

        if not hasattr(self._thread_local, "s3_bucket"):
            _, bucket = self._get_resource_and_bucket()
            self._thread_local.s3_bucket = bucket

        return self._thread_local.s3_bucket

    def _get_resource_and_bucket(self):
        """Helper function to create the resource and bucket objects."""
        session = self._get_session()
        endpoint_url = self._get_endpoint_url()
        resource = session.resource("s3", endpoint_url=endpoint_url, **self.s3_resource_kwargs)
        try:
            resource.meta.client.head_bucket(Bucket=self.bucket)
        except ClientError:
            raise RuntimeError("Bucket not present on AWS")
        bucket = resource.Bucket(self.bucket)

        return resource, bucket

    def _get_full_key_path(self, id: str) -> str:
        """Produces the full key path for S3 items.

        Args:
            id (str): The value of the key identifier.

        Returns:
            str: The full key path
        """
        return self.sub_dir + str(id)

    def _get_compression_function(self) -> Callable:
        """Returns the function to use for compressing data."""
        return zlib.compress

    def _get_decompression_function(self) -> Callable:
        """Returns the function to use for decompressing data."""
        return zlib.decompress

    def write_doc_to_s3(self, doc: dict, search_keys: list[str]) -> dict:
        """
        Write the data to s3 and return the metadata to be inserted into the index db.

        Args:
            doc: the document.
            search_keys: list of keys to pull from the docs and be inserted into the
                index db.

        Returns:
            Dict: The metadata to be inserted into the index db
        """
        s3_bucket = self._get_bucket()

        search_doc = {k: doc[k] for k in search_keys}
        search_doc[self.key] = doc[self.key]  # Ensure key is in metadata
        if self.sub_dir != "":
            search_doc["sub_dir"] = self.sub_dir

        # Remove MongoDB _id from search
        if "_id" in search_doc:
            del search_doc["_id"]

        # to make hashing more meaningful, make sure last updated field is removed
        lu_info = doc.pop(self.last_updated_field, None)
        data = msgpack.packb(doc, default=monty_default)

        if self.compress:
            # Compress with zlib if chosen
            search_doc["compression"] = "zlib"
            data = self._get_compression_function()(data)

        # keep a record of original keys, in case these are important for the individual researcher
        # it is not expected that this information will be used except in disaster recovery
        s3_to_mongo_keys = {k: self._sanitize_key(k) for k in search_doc}
        s3_to_mongo_keys["s3-to-mongo-keys"] = "s3-to-mongo-keys"  # inception
        # encode dictionary since values have to be strings
        search_doc["s3-to-mongo-keys"] = dumps(s3_to_mongo_keys)
        s3_bucket.upload_fileobj(
            Fileobj=BytesIO(data),
            Key=self._get_full_key_path(str(doc[self.key])),
            ExtraArgs={"Metadata": {s3_to_mongo_keys[k]: str(v) for k, v in search_doc.items()}},
        )

        if lu_info is not None:
            search_doc[self.last_updated_field] = lu_info

        if self.store_hash:
            hasher = sha1()
            hasher.update(data)
            obj_hash = hasher.hexdigest()
            search_doc["obj_hash"] = obj_hash
        return search_doc

    @staticmethod
    def _sanitize_key(key):
        """Sanitize keys to store in S3/MinIO metadata."""
        # Any underscores are encoded as double dashes in metadata, since keys with
        # underscores may be result in the corresponding HTTP header being stripped
        # by certain server configurations (e.g. default nginx), leading to:
        # `botocore.exceptions.ClientError: An error occurred (AccessDenied) when
        # calling the PutObject operation: There were headers present in the request
        # which were not signed`
        # Metadata stored in the MongoDB index (self.index) is stored unchanged.

        # Additionally, MinIO requires lowercase keys
        return str(key).replace("_", "-").lower()

    def remove_docs(self, criteria: dict, remove_s3_object: bool = False):
        """
        Remove docs matching the query dictionary.

        Args:
            criteria: query dictionary to match.
            remove_s3_object: whether to remove the actual S3 object or not.
        """
        if not remove_s3_object:
            self.index.remove_docs(criteria=criteria)
        else:
            to_remove = self.index.distinct(self.key, criteria=criteria)
            self.index.remove_docs(criteria=criteria)

            # Can remove up to 1000 items at a time via boto
            to_remove_chunks = list(grouper(to_remove, n=1000))
            for chunk_to_remove in to_remove_chunks:
                objlist = [{"Key": self._get_full_key_path(obj)} for obj in chunk_to_remove]
                self.s3_bucket.delete_objects(Delete={"Objects": objlist})

    @property
    def last_updated(self):
        return self.index.last_updated

    def newer_in(self, target: Store, criteria: Optional[dict] = None, exhaustive: bool = False) -> list[str]:
        """
        Returns the keys of documents that are newer in the target Store than this Store.

        Args:
            target: target Store.
            criteria: PyMongo filter for documents to search in.
            exhaustive: triggers an item-by-item check vs. checking the last_updated of
                the target Store and using that to filter out new items in.
        """
        if hasattr(target, "index"):
            return self.index.newer_in(target=target.index, criteria=criteria, exhaustive=exhaustive)
        return self.index.newer_in(target=target, criteria=criteria, exhaustive=exhaustive)

    def __hash__(self):
        return hash((self.index.__hash__, self.bucket))

    def rebuild_index_from_s3_data(self, **kwargs):
        """
        Rebuilds the index Store from the data in S3.

        Relies on the index document being stores as the metadata for the file. This can
        help recover lost databases.
        """
        bucket = self.s3_bucket
        objects = bucket.objects.filter(Prefix=self.sub_dir)
        for obj in objects:
            key_ = self._get_full_key_path(obj.key)
            data = self.s3_bucket.Object(key_).get()["Body"].read()

            if self.compress:
                data = self._get_decompression_function()(data)
            unpacked_data = msgpack.unpackb(data, raw=False)
            self.update(unpacked_data, **kwargs)

    def rebuild_metadata_from_index(self, index_query: Optional[dict] = None):
        """
        Read data from the index store and populate the metadata of the S3 bucket.
        Force all the keys to be lower case to be Minio compatible.

        Args:
            index_query: query on the index store.
        """
        qq = {} if index_query is None else index_query
        for index_doc in self.index.query(qq):
            key_ = self._get_full_key_path(index_doc[self.key])
            s3_object = self.s3_bucket.Object(key_)
            new_meta = {self._sanitize_key(k): v for k, v in s3_object.metadata.items()}
            for k, v in index_doc.items():
                new_meta[str(k).lower()] = v
            new_meta.pop("_id")
            if self.last_updated_field in new_meta:
                new_meta[self.last_updated_field] = str(to_isoformat_ceil_ms(new_meta[self.last_updated_field]))
            # s3_object.metadata.update(new_meta)
            s3_object.copy_from(
                CopySource={"Bucket": self.s3_bucket.name, "Key": key_},
                Metadata=new_meta,
                MetadataDirective="REPLACE",
            )

    def __eq__(self, other: object) -> bool:
        """
        Check equality for S3Store.

        other: other S3Store to compare with.
        """
        if not isinstance(other, S3Store):
            return False

        fields = ["index", "bucket", "last_updated_field"]
        return all(getattr(self, f) == getattr(other, f) for f in fields)

`name: str` `property` ¶

String representing this data source.

`eq(other)` ¶

Check equality for S3Store.

other: other S3Store to compare with.

Source code in src/maggma/stores/aws.py

def __eq__(self, other: object) -> bool:
    """
    Check equality for S3Store.

    other: other S3Store to compare with.
    """
    if not isinstance(other, S3Store):
        return False

    fields = ["index", "bucket", "last_updated_field"]
    return all(getattr(self, f) == getattr(other, f) for f in fields)

`init(index, bucket, s3_profile=None, compress=False, endpoint_url=None, sub_dir=None, s3_workers=1, s3_resource_kwargs=None, ssh_tunnel=None, key='fs_id', store_hash=True, unpack_data=True, searchable_fields=None, index_store_kwargs=None, **kwargs)` ¶

Initializes an S3 Store.

Parameters:

Name	Type	Description	Default
`index`	`Store`	a store to use to index the S3 bucket.	required
`bucket`	`str`	name of the bucket.	required
`s3_profile`	`Optional[Union[str, dict]]`	name of AWS profile containing the credentials. Alternatively you can pass in a dictionary with the full credentials: aws_access_key_id (string) -- AWS access key ID aws_secret_access_key (string) -- AWS secret access key aws_session_token (string) -- AWS temporary session token region_name (string) -- Default region when creating new connections	`None`
`compress`	`bool`	compress files inserted into the store.	`False`
`endpoint_url`	`Optional[str]`	this allows the interface with minio service; ignored if `ssh_tunnel` is provided, in which case it is inferred.	`None`
`sub_dir`	`Optional[str]`	subdirectory of the S3 bucket to store the data.	`None`
`s3_workers`	`int`	number of concurrent S3 puts to run.	`1`
`s3_resource_kwargs`	`Optional[dict]`	additional kwargs to pass to the boto3 session resource.	`None`
`ssh_tunnel`	`Optional[SSHTunnel]`	optional SSH tunnel to use for the S3 connection.	`None`
`key`	`str`	main key to index on.	`'fs_id'`
`store_hash`	`bool`	store the SHA1 hash right before insertion to the database.	`True`
`unpack_data`	`bool`	whether to decompress and unpack byte data when querying from the bucket.	`True`
`searchable_fields`	`Optional[list[str]]`	fields to keep in the index store.	`None`
`index_store_kwargs`	`Optional[dict]`	kwargs to pass to the index store. Allows the user to use kwargs here to update the index store.	`None`

Source code in src/maggma/stores/aws.py

def __init__(
    self,
    index: Store,
    bucket: str,
    s3_profile: Optional[Union[str, dict]] = None,
    compress: bool = False,
    endpoint_url: Optional[str] = None,
    sub_dir: Optional[str] = None,
    s3_workers: int = 1,
    s3_resource_kwargs: Optional[dict] = None,
    ssh_tunnel: Optional[SSHTunnel] = None,
    key: str = "fs_id",
    store_hash: bool = True,
    unpack_data: bool = True,
    searchable_fields: Optional[list[str]] = None,
    index_store_kwargs: Optional[dict] = None,
    **kwargs,
):
    """
    Initializes an S3 Store.

    Args:
        index: a store to use to index the S3 bucket.
        bucket: name of the bucket.
        s3_profile: name of AWS profile containing the credentials. Alternatively
            you can pass in a dictionary with the full credentials:
                aws_access_key_id (string) -- AWS access key ID
                aws_secret_access_key (string) -- AWS secret access key
                aws_session_token (string) -- AWS temporary session token
                region_name (string) -- Default region when creating new connections
        compress: compress files inserted into the store.
        endpoint_url: this allows the interface with minio service; ignored if
            `ssh_tunnel` is provided, in which case it is inferred.
        sub_dir: subdirectory of the S3 bucket to store the data.
        s3_workers: number of concurrent S3 puts to run.
        s3_resource_kwargs: additional kwargs to pass to the boto3 session resource.
        ssh_tunnel: optional SSH tunnel to use for the S3 connection.
        key: main key to index on.
        store_hash: store the SHA1 hash right before insertion to the database.
        unpack_data: whether to decompress and unpack byte data when querying from
            the bucket.
        searchable_fields: fields to keep in the index store.
        index_store_kwargs: kwargs to pass to the index store. Allows the user to
            use kwargs here to update the index store.
    """
    if boto3 is None:
        raise RuntimeError("boto3 and botocore are required for S3Store")
    self.index_store_kwargs = index_store_kwargs or {}
    if index_store_kwargs:
        d_ = index.as_dict()
        d_.update(index_store_kwargs)
        self.index = index.__class__.from_dict(d_)
    else:
        self.index = index
    self.bucket = bucket
    self.s3_profile = s3_profile
    self.compress = compress
    self.endpoint_url = endpoint_url
    self.sub_dir = sub_dir.strip("/") + "/" if sub_dir else ""
    self.s3: Any = None
    self.s3_bucket: Any = None
    self.s3_workers = s3_workers
    self.s3_resource_kwargs = s3_resource_kwargs if s3_resource_kwargs is not None else {}
    self.ssh_tunnel = ssh_tunnel
    self.unpack_data = unpack_data
    self.searchable_fields = searchable_fields if searchable_fields is not None else []
    self.store_hash = store_hash

    # Force the key to be the same as the index
    assert isinstance(index.key, str), "Since we are using the key as a file name in S3, they key must be a string"
    if key != index.key:
        warnings.warn(
            f'The desired S3Store key "{key}" does not match the index key "{index.key},"'
            "the index key will be used",
            UserWarning,
        )
    kwargs["key"] = str(index.key)

    self._thread_local = threading.local()
    super().__init__(**kwargs)

`close()` ¶

Closes any connections.

Source code in src/maggma/stores/aws.py

def close(self):
    """Closes any connections."""
    self.index.close()

    self.s3.meta.client.close()
    self.s3 = None
    self.s3_bucket = None

    if self.ssh_tunnel is not None:
        self.ssh_tunnel.stop()

`connect(force_reset=False)` ¶

Connect to the source data.

Parameters:

Name	Type	Description	Default
`force_reset`	`bool`	whether to force a reset of the connection	`False`

Source code in src/maggma/stores/aws.py

def connect(self, force_reset: bool = False):  # lgtm[py/conflicting-attributes]
    """Connect to the source data.

    Args:
        force_reset: whether to force a reset of the connection
    """
    if self.s3 is None or force_reset:
        self.s3, self.s3_bucket = self._get_resource_and_bucket()
    self.index.connect(force_reset=force_reset)

`count(criteria=None)` ¶

Counts the number of documents matching the query criteria.

Parameters:

Name	Type	Description	Default
`criteria`	`Optional[dict]`	PyMongo filter for documents to count in.	`None`

Source code in src/maggma/stores/aws.py

def count(self, criteria: Optional[dict] = None) -> int:
    """
    Counts the number of documents matching the query criteria.

    Args:
        criteria: PyMongo filter for documents to count in.
    """
    return self.index.count(criteria)

`distinct(field, criteria=None, all_exist=False)` ¶

Get all distinct values for a field.

Parameters:

Name	Type	Description	Default
`field`	`str`	the field(s) to get distinct values for.	required
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in.	`None`

Source code in src/maggma/stores/aws.py

def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list:
    """
    Get all distinct values for a field.

    Args:
        field: the field(s) to get distinct values for.
        criteria: PyMongo filter for documents to search in.
    """
    # Index is a store so it should have its own distinct function
    return self.index.distinct(field, criteria=criteria)

`ensure_index(key, unique=False)` ¶

Tries to create an index and return true if it succeeded.

Parameters:

Name	Type	Description	Default
`key`	`str`	single key to index.	required
`unique`	`bool`	whether this index contains only unique keys.	`False`

Returns:

Type	Description
`bool`	bool indicating if the index exists/was created.

Source code in src/maggma/stores/aws.py

def ensure_index(self, key: str, unique: bool = False) -> bool:
    """
    Tries to create an index and return true if it succeeded.

    Args:
        key: single key to index.
        unique: whether this index contains only unique keys.

    Returns:
        bool indicating if the index exists/was created.
    """
    return self.index.ensure_index(key, unique=unique)

`groupby(keys, criteria=None, properties=None, sort=None, skip=0, limit=0)` ¶

Simple grouping function that will group documents by keys.

Parameters:

Name	Type	Description	Default
`keys`	`Union[list[str], str]`	fields to group documents.	required
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in.	`None`
`properties`	`Union[dict, list, None]`	properties to return in grouped documents.	`None`
`sort`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of sort order for fields. Keys are field names and values	`None`
`skip`	`int`	number documents to skip.	`0`
`limit`	`int`	limit on total number of documents returned.	`0`

Returns:

Type	Description
`Iterator[tuple[dict, list[dict]]]`	generator returning tuples of (dict, list of docs)

Source code in src/maggma/stores/aws.py

def groupby(
    self,
    keys: Union[list[str], str],
    criteria: Optional[dict] = None,
    properties: Union[dict, list, None] = None,
    sort: Optional[dict[str, Union[Sort, int]]] = None,
    skip: int = 0,
    limit: int = 0,
) -> Iterator[tuple[dict, list[dict]]]:
    """
    Simple grouping function that will group documents by keys.

    Args:
        keys: fields to group documents.
        criteria: PyMongo filter for documents to search in.
        properties: properties to return in grouped documents.
        sort: Dictionary of sort order for fields. Keys are field names and values
        are 1 for ascending or -1 for descending.
        skip: number documents to skip.
        limit: limit on total number of documents returned.

    Returns:
        generator returning tuples of (dict, list of docs)
    """
    return self.index.groupby(
        keys=keys,
        criteria=criteria,
        properties=properties,
        sort=sort,
        skip=skip,
        limit=limit,
    )

`newer_in(target, criteria=None, exhaustive=False)` ¶

Returns the keys of documents that are newer in the target Store than this Store.

Parameters:

Name	Type	Description	Default
`target`	`Store`	target Store.	required
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in.	`None`
`exhaustive`	`bool`	triggers an item-by-item check vs. checking the last_updated of the target Store and using that to filter out new items in.	`False`

Source code in src/maggma/stores/aws.py

def newer_in(self, target: Store, criteria: Optional[dict] = None, exhaustive: bool = False) -> list[str]:
    """
    Returns the keys of documents that are newer in the target Store than this Store.

    Args:
        target: target Store.
        criteria: PyMongo filter for documents to search in.
        exhaustive: triggers an item-by-item check vs. checking the last_updated of
            the target Store and using that to filter out new items in.
    """
    if hasattr(target, "index"):
        return self.index.newer_in(target=target.index, criteria=criteria, exhaustive=exhaustive)
    return self.index.newer_in(target=target, criteria=criteria, exhaustive=exhaustive)

`query(criteria=None, properties=None, sort=None, skip=0, limit=0)` ¶

Queries the Store for a set of documents.

Parameters:

Name	Type	Description	Default
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in.	`None`
`properties`	`Union[dict, list, None]`	properties to return in grouped documents.	`None`
`sort`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending.	`None`
`skip`	`int`	number documents to skip.	`0`
`limit`	`int`	limit on total number of documents returned.	`0`

Source code in src/maggma/stores/aws.py

def query(
    self,
    criteria: Optional[dict] = None,
    properties: Union[dict, list, None] = None,
    sort: Optional[dict[str, Union[Sort, int]]] = None,
    skip: int = 0,
    limit: int = 0,
) -> Iterator[dict]:
    """
    Queries the Store for a set of documents.

    Args:
        criteria: PyMongo filter for documents to search in.
        properties: properties to return in grouped documents.
        sort: Dictionary of sort order for fields. Keys are field names and values
            are 1 for ascending or -1 for descending.
        skip: number documents to skip.
        limit: limit on total number of documents returned.

    """
    prop_keys = set()
    if isinstance(properties, dict):
        prop_keys = set(properties.keys())
    elif isinstance(properties, list):
        prop_keys = set(properties)

    for doc in self.index.query(criteria=criteria, sort=sort, limit=limit, skip=skip):
        if properties is not None and prop_keys.issubset(set(doc.keys())):
            yield {p: doc[p] for p in properties if p in doc}
        else:
            try:
                # TODO: This is ugly and unsafe, do some real checking before pulling data
                data = self.s3_bucket.Object(self._get_full_key_path(doc[self.key])).get()["Body"].read()
            except botocore.exceptions.ClientError as e:
                # If a client error is thrown, then check that it was a NoSuchKey or NoSuchBucket error.
                # If it was a NoSuchKey error, then the object does not exist.
                error_code = e.response["Error"]["Code"]
                if error_code in ["NoSuchKey", "NoSuchBucket"]:
                    error_message = e.response["Error"]["Message"]
                    self.logger.error(
                        f"S3 returned '{error_message}' while querying '{self.bucket}' for '{doc[self.key]}'"
                    )
                    continue
                else:
                    raise e

            if self.unpack_data:
                data = self._read_data(data=data, compress_header=doc.get("compression", ""))

                if self.last_updated_field in doc:
                    data[self.last_updated_field] = doc[self.last_updated_field]

            yield data

`rebuild_index_from_s3_data(**kwargs)` ¶

Rebuilds the index Store from the data in S3.

Relies on the index document being stores as the metadata for the file. This can help recover lost databases.

Source code in src/maggma/stores/aws.py

def rebuild_index_from_s3_data(self, **kwargs):
    """
    Rebuilds the index Store from the data in S3.

    Relies on the index document being stores as the metadata for the file. This can
    help recover lost databases.
    """
    bucket = self.s3_bucket
    objects = bucket.objects.filter(Prefix=self.sub_dir)
    for obj in objects:
        key_ = self._get_full_key_path(obj.key)
        data = self.s3_bucket.Object(key_).get()["Body"].read()

        if self.compress:
            data = self._get_decompression_function()(data)
        unpacked_data = msgpack.unpackb(data, raw=False)
        self.update(unpacked_data, **kwargs)

`rebuild_metadata_from_index(index_query=None)` ¶

Read data from the index store and populate the metadata of the S3 bucket. Force all the keys to be lower case to be Minio compatible.

Parameters:

Name	Type	Description	Default
`index_query`	`Optional[dict]`	query on the index store.	`None`

Source code in src/maggma/stores/aws.py

def rebuild_metadata_from_index(self, index_query: Optional[dict] = None):
    """
    Read data from the index store and populate the metadata of the S3 bucket.
    Force all the keys to be lower case to be Minio compatible.

    Args:
        index_query: query on the index store.
    """
    qq = {} if index_query is None else index_query
    for index_doc in self.index.query(qq):
        key_ = self._get_full_key_path(index_doc[self.key])
        s3_object = self.s3_bucket.Object(key_)
        new_meta = {self._sanitize_key(k): v for k, v in s3_object.metadata.items()}
        for k, v in index_doc.items():
            new_meta[str(k).lower()] = v
        new_meta.pop("_id")
        if self.last_updated_field in new_meta:
            new_meta[self.last_updated_field] = str(to_isoformat_ceil_ms(new_meta[self.last_updated_field]))
        # s3_object.metadata.update(new_meta)
        s3_object.copy_from(
            CopySource={"Bucket": self.s3_bucket.name, "Key": key_},
            Metadata=new_meta,
            MetadataDirective="REPLACE",
        )

`remove_docs(criteria, remove_s3_object=False)` ¶

Remove docs matching the query dictionary.

Parameters:

Name	Type	Description	Default
`criteria`	`dict`	query dictionary to match.	required
`remove_s3_object`	`bool`	whether to remove the actual S3 object or not.	`False`

Source code in src/maggma/stores/aws.py

def remove_docs(self, criteria: dict, remove_s3_object: bool = False):
    """
    Remove docs matching the query dictionary.

    Args:
        criteria: query dictionary to match.
        remove_s3_object: whether to remove the actual S3 object or not.
    """
    if not remove_s3_object:
        self.index.remove_docs(criteria=criteria)
    else:
        to_remove = self.index.distinct(self.key, criteria=criteria)
        self.index.remove_docs(criteria=criteria)

        # Can remove up to 1000 items at a time via boto
        to_remove_chunks = list(grouper(to_remove, n=1000))
        for chunk_to_remove in to_remove_chunks:
            objlist = [{"Key": self._get_full_key_path(obj)} for obj in chunk_to_remove]
            self.s3_bucket.delete_objects(Delete={"Objects": objlist})

`update(docs, key=None, additional_metadata=None)` ¶

Update documents into the Store.

Parameters:

Name	Type	Description	Default
`docs`	`Union[list[dict], dict]`	the document or list of documents to update.	required
`key`	`Union[list, str, None]`	field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used.	`None`
`additional_metadata`	`Union[str, list[str], None]`	field(s) to include in the S3 store's metadata.	`None`

Source code in src/maggma/stores/aws.py

def update(
    self,
    docs: Union[list[dict], dict],
    key: Union[list, str, None] = None,
    additional_metadata: Union[str, list[str], None] = None,
):
    """
    Update documents into the Store.

    Args:
        docs: the document or list of documents to update.
        key: field name(s) to determine uniqueness for a document, can be a list of
            multiple fields, a single field, or None if the Store's key field is to
            be used.
        additional_metadata: field(s) to include in the S3 store's metadata.
    """
    if not isinstance(docs, list):
        docs = [docs]

    if isinstance(key, str):
        key = [key]
    elif not key:
        key = [self.key]

    if additional_metadata is None:
        additional_metadata = []
    elif isinstance(additional_metadata, str):
        additional_metadata = [additional_metadata]
    else:
        additional_metadata = list(additional_metadata)

    self._write_to_s3_and_index(docs, key + additional_metadata + self.searchable_fields)

`write_doc_to_s3(doc, search_keys)` ¶

Write the data to s3 and return the metadata to be inserted into the index db.

Parameters:

Name	Type	Description	Default
`doc`	`dict`	the document.	required
`search_keys`	`list[str]`	list of keys to pull from the docs and be inserted into the index db.	required

Returns:

Name	Type	Description
`Dict`	`dict`	The metadata to be inserted into the index db

Source code in src/maggma/stores/aws.py

def write_doc_to_s3(self, doc: dict, search_keys: list[str]) -> dict:
    """
    Write the data to s3 and return the metadata to be inserted into the index db.

    Args:
        doc: the document.
        search_keys: list of keys to pull from the docs and be inserted into the
            index db.

    Returns:
        Dict: The metadata to be inserted into the index db
    """
    s3_bucket = self._get_bucket()

    search_doc = {k: doc[k] for k in search_keys}
    search_doc[self.key] = doc[self.key]  # Ensure key is in metadata
    if self.sub_dir != "":
        search_doc["sub_dir"] = self.sub_dir

    # Remove MongoDB _id from search
    if "_id" in search_doc:
        del search_doc["_id"]

    # to make hashing more meaningful, make sure last updated field is removed
    lu_info = doc.pop(self.last_updated_field, None)
    data = msgpack.packb(doc, default=monty_default)

    if self.compress:
        # Compress with zlib if chosen
        search_doc["compression"] = "zlib"
        data = self._get_compression_function()(data)

    # keep a record of original keys, in case these are important for the individual researcher
    # it is not expected that this information will be used except in disaster recovery
    s3_to_mongo_keys = {k: self._sanitize_key(k) for k in search_doc}
    s3_to_mongo_keys["s3-to-mongo-keys"] = "s3-to-mongo-keys"  # inception
    # encode dictionary since values have to be strings
    search_doc["s3-to-mongo-keys"] = dumps(s3_to_mongo_keys)
    s3_bucket.upload_fileobj(
        Fileobj=BytesIO(data),
        Key=self._get_full_key_path(str(doc[self.key])),
        ExtraArgs={"Metadata": {s3_to_mongo_keys[k]: str(v) for k, v in search_doc.items()}},
    )

    if lu_info is not None:
        search_doc[self.last_updated_field] = lu_info

    if self.store_hash:
        hasher = sha1()
        hasher.update(data)
        obj_hash = hasher.hexdigest()
        search_doc["obj_hash"] = obj_hash
    return search_doc

Advanced Stores for connecting to Microsoft Azure data.

`AzureBlobStore` ¶

Bases: Store

GridFS like storage using Azure Blob and a regular store for indexing.

Requires azure-storage-blob and azure-identity modules to be installed.

Source code in src/maggma/stores/azure.py

class AzureBlobStore(Store):
    """
    GridFS like storage using Azure Blob and a regular store for indexing.

    Requires azure-storage-blob and azure-identity modules to be installed.
    """

    def __init__(
        self,
        index: Store,
        container_name: str,
        azure_client_info: Optional[Union[str, dict]] = None,
        compress: bool = False,
        sub_dir: Optional[str] = None,
        workers: int = 1,
        azure_resource_kwargs: Optional[dict] = None,
        key: str = "fs_id",
        store_hash: bool = True,
        unpack_data: bool = True,
        searchable_fields: Optional[list[str]] = None,
        key_sanitize_dict: Optional[dict] = None,
        create_container: bool = False,
        **kwargs,
    ):
        """
        Initializes an AzureBlob Store.

        Args:
            index: a store to use to index the Azure blob
            container_name: name of the container
            azure_client_info: connection_url of the BlobServiceClient if a string.
                Assumes that the access is passwordless in that case.
                Otherwise, if a dictionary, options to instantiate the
                BlobServiceClient.
                Currently supported keywords:
                    - connection_string: a connection string for the Azure blob
            compress: compress files inserted into the store
            sub_dir: (optional)  subdirectory of the container to store the data.
                When defined, a final "/" will be added if not already present.
            workers: number of concurrent Azure puts to run
            store_hash: store the sha1 hash right before insertion to the database.
            unpack_data: whether to decompress and unpack byte data when querying from
                the container.
            searchable_fields: fields to keep in the index store
            key_sanitize_dict: a dictionary that allows to customize the sanitization
                of the keys in metadata, since they should adhere to the naming rules
                for C# identifiers. If None the AZURE_KEY_SANITIZE default will be used
                to handle the most common cases.
            create_container: if True the Store creates the container, in case it does
                not exist.
            kwargs: keywords for the base Store.
        """
        if azure_blob is None:
            raise RuntimeError("azure-storage-blob and azure-identity are required for AzureBlobStore")

        self.index = index
        self.container_name = container_name
        self.azure_client_info = azure_client_info
        self.compress = compress
        self.sub_dir = sub_dir.rstrip("/") + "/" if sub_dir else ""
        self.service: Optional[BlobServiceClient] = None
        self.container: Optional[ContainerClient] = None
        self.workers = workers
        self.azure_resource_kwargs = azure_resource_kwargs if azure_resource_kwargs is not None else {}
        self.unpack_data = unpack_data
        self.searchable_fields = searchable_fields if searchable_fields is not None else []
        self.store_hash = store_hash
        if key_sanitize_dict is None:
            key_sanitize_dict = AZURE_KEY_SANITIZE
        self.key_sanitize_dict = key_sanitize_dict
        self.create_container = create_container

        # Force the key to be the same as the index
        assert isinstance(
            index.key, str
        ), "Since we are using the key as a file name in Azure Blob, the key must be a string"
        if key != index.key:
            warnings.warn(
                f'The desired AzureBlobStore key "{key}" does not match the index key "{index.key},"'
                "the index key will be used",
                UserWarning,
            )
        kwargs["key"] = str(index.key)

        self._thread_local = threading.local()
        super().__init__(**kwargs)

    @property
    def name(self) -> str:
        """
        Returns:
            a string representing this data source.
        """
        return f"container://{self.container_name}"

    def connect(self, *args, **kwargs):  # lgtm[py/conflicting-attributes]
        """
        Connect to the source data.
        """
        service_client = self._get_service_client()

        if not self.service:
            self.service = service_client
            container = service_client.get_container_client(self.container_name)
            if not container.exists():
                if self.create_container:
                    # catch the exception to avoid errors if already created
                    try:
                        container.create_container()
                    except ResourceExistsError:
                        pass
                else:
                    raise RuntimeError(f"Container not present on Azure: {self.container_name}")

            self.container = container
        self.index.connect(*args, **kwargs)

    def close(self):
        """
        Closes any connections.
        """
        self.index.close()
        self.service = None
        self.container = None

    @property
    def _collection(self):
        """
        Returns:
            a handle to the pymongo collection object.

        Important:
            Not guaranteed to exist in the future
        """
        # For now returns the index collection since that is what we would "search" on
        return self.index._collection

    def count(self, criteria: Optional[dict] = None) -> int:
        """
        Counts the number of documents matching the query criteria.

        Args:
            criteria: PyMongo filter for documents to count in
        """
        return self.index.count(criteria)

    def query(
        self,
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        skip: int = 0,
        limit: int = 0,
    ) -> Iterator[dict]:
        """
        Queries the Store for a set of documents.

        Args:
            criteria: PyMongo filter for documents to search in
            properties: properties to return in grouped documents
            sort: Dictionary of sort order for fields. Keys are field names and
                values are 1 for ascending or -1 for descending.
            skip: number documents to skip
            limit: limit on total number of documents returned

        """
        if self.container is None or self.service is None:
            raise RuntimeError("The store has not been connected")

        prop_keys = set()
        if isinstance(properties, dict):
            prop_keys = set(properties.keys())
        elif isinstance(properties, list):
            prop_keys = set(properties)

        for doc in self.index.query(criteria=criteria, sort=sort, limit=limit, skip=skip):
            if properties is not None and prop_keys.issubset(set(doc.keys())):
                yield {p: doc[p] for p in properties if p in doc}
            else:
                try:
                    data = self.container.download_blob(self.sub_dir + str(doc[self.key])).readall()
                except azure.core.exceptions.ResourceNotFoundError:
                    self.logger.error(f"Could not find Blob object {doc[self.key]}")

                if self.unpack_data:
                    data = self._unpack(data=data, compressed=doc.get("compression", "") == "zlib")

                    if self.last_updated_field in doc:
                        data[self.last_updated_field] = doc[self.last_updated_field]  # type: ignore

                yield data  # type: ignore

    @staticmethod
    def _unpack(data: bytes, compressed: bool):
        if compressed:
            data = zlib.decompress(data)
        # requires msgpack-python to be installed to fix string encoding problem
        # https://github.com/msgpack/msgpack/issues/121
        # During recursion
        # msgpack.unpackb goes as deep as possible during reconstruction
        # MontyDecoder().process_decode only goes until it finds a from_dict
        # as such, we cannot just use msgpack.unpackb(data, object_hook=monty_object_hook, raw=False)
        # Should just return the unpacked object then let the user run process_decoded
        return msgpack.unpackb(data, raw=False)

    def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list:
        """
        Get all distinct values for a field.

        Args:
            field: the field(s) to get distinct values for
            criteria: PyMongo filter for documents to search in
        """
        # Index is a store so it should have its own distinct function
        return self.index.distinct(field, criteria=criteria)

    def groupby(
        self,
        keys: Union[list[str], str],
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        skip: int = 0,
        limit: int = 0,
    ) -> Iterator[tuple[dict, list[dict]]]:
        """
        Simple grouping function that will group documents
        by keys.

        Args:
            keys: fields to group documents
            criteria: PyMongo filter for documents to search in
            properties: properties to return in grouped documents
            sort: Dictionary of sort order for fields. Keys are field names and
                values are 1 for ascending or -1 for descending.
            skip: number documents to skip
            limit: limit on total number of documents returned

        Returns:
            generator returning tuples of (dict, list of docs)
        """
        return self.index.groupby(
            keys=keys,
            criteria=criteria,
            properties=properties,
            sort=sort,
            skip=skip,
            limit=limit,
        )

    def ensure_index(self, key: str, unique: bool = False) -> bool:
        """
        Tries to create an index and return true if it succeeded.

        Args:
            key: single key to index
            unique: Whether or not this index contains only unique keys

        Returns:
            bool indicating if the index exists/was created
        """
        return self.index.ensure_index(key, unique=unique)

    def update(
        self,
        docs: Union[list[dict], dict],
        key: Union[list, str, None] = None,
        additional_metadata: Union[str, list[str], None] = None,
    ):
        """
        Update documents into the Store.

        Args:
            docs: the document or list of documents to update
            key: field name(s) to determine uniqueness for a
                 document, can be a list of multiple fields,
                 a single field, or None if the Store's key
                 field is to be used
            additional_metadata: field(s) to include in the blob store's metadata
        """
        if self.container is None or self.service is None:
            raise RuntimeError("The store has not been connected")

        if not isinstance(docs, list):
            docs = [docs]

        if isinstance(key, str):
            key = [key]
        elif not key:
            key = [self.key]

        if additional_metadata is None:
            additional_metadata = []
        elif isinstance(additional_metadata, str):
            additional_metadata = [additional_metadata]
        else:
            additional_metadata = list(additional_metadata)

        with ThreadPoolExecutor(max_workers=self.workers) as pool:
            fs = {
                pool.submit(
                    self.write_doc_to_blob,
                    doc=itr_doc,
                    search_keys=key + additional_metadata + self.searchable_fields,
                )
                for itr_doc in docs
            }
            fs, _ = wait(fs)

            search_docs = [sdoc.result() for sdoc in fs]

        # Use store's update to remove key clashes
        self.index.update(search_docs, key=self.key)

    def _get_service_client(self):
        if not hasattr(self._thread_local, "container"):
            if isinstance(self.azure_client_info, str):
                # assume it is the account_url and that the connection is passwordless
                default_credential = DefaultAzureCredential()
                return BlobServiceClient(self.azure_client_info, credential=default_credential)

            if isinstance(self.azure_client_info, dict):
                connection_string = self.azure_client_info.get("connection_string")
                if connection_string:
                    return BlobServiceClient.from_connection_string(conn_str=connection_string)

            msg = f"Could not instantiate BlobServiceClient from azure_client_info: {self.azure_client_info}"
            raise RuntimeError(msg)
        return None

    def _get_container(self) -> Optional[ContainerClient]:
        """
        If on the main thread return the container created above, else create a new
        container on each thread.
        """
        if threading.current_thread().name == "MainThread":
            return self.container
        if not hasattr(self._thread_local, "container"):
            service_client = self._get_service_client()
            container = service_client.get_container_client(self.container_name)
            self._thread_local.container = container
        return self._thread_local.container

    def write_doc_to_blob(self, doc: dict, search_keys: list[str]):
        """
        Write the data to an Azure blob and return the metadata to be inserted into the index db.

        Args:
            doc: the document
            search_keys: list of keys to pull from the docs and be inserted into the
            index db
        """
        container = self._get_container()
        if container is None:
            raise RuntimeError("The store has not been connected")

        search_doc = {k: doc[k] for k in search_keys}
        search_doc[self.key] = doc[self.key]  # Ensure key is in metadata
        if self.sub_dir != "":
            search_doc["sub_dir"] = self.sub_dir

        # Remove MongoDB _id from search
        if "_id" in search_doc:
            del search_doc["_id"]

        # to make hashing more meaningful, make sure last updated field is removed
        lu_info = doc.pop(self.last_updated_field, None)
        data = msgpack.packb(doc, default=monty_default)

        if self.compress:
            # Compress with zlib if chosen
            search_doc["compression"] = "zlib"
            data = zlib.compress(data)

        if self.last_updated_field in doc:
            # need this conversion for metadata insert
            search_doc[self.last_updated_field] = str(to_isoformat_ceil_ms(doc[self.last_updated_field]))

        # keep a record of original keys, in case these are important for the individual researcher
        # it is not expected that this information will be used except in disaster recovery
        blob_to_mongo_keys = {k: self._sanitize_key(k) for k in search_doc}
        blob_to_mongo_keys["blob_to_mongo_keys"] = "blob_to_mongo_keys"  # inception
        # encode dictionary since values have to be strings
        search_doc["blob_to_mongo_keys"] = dumps(blob_to_mongo_keys)

        container.upload_blob(
            name=self.sub_dir + str(doc[self.key]),
            data=data,
            metadata={blob_to_mongo_keys[k]: str(v) for k, v in search_doc.items()},
            overwrite=True,
        )

        if lu_info is not None:
            search_doc[self.last_updated_field] = lu_info

        if self.store_hash:
            hasher = sha1()
            hasher.update(data)
            obj_hash = hasher.hexdigest()
            search_doc["obj_hash"] = obj_hash
        return search_doc

    def _sanitize_key(self, key):
        """
        Sanitize keys to store metadata.
        The metadata keys should adhere to the naming rules for C# identifiers.
        """
        new_key = str(key)
        for k, v in self.key_sanitize_dict.items():
            new_key = new_key.replace(k, v)

        return new_key

    def remove_docs(self, criteria: dict, remove_blob_object: bool = False):
        """
        Remove docs matching the query dictionary.

        Args:
            criteria: query dictionary to match
            remove_blob_object: whether to remove the actual blob Object or not
        """
        if self.container is None or self.service is None:
            raise RuntimeError("The store has not been connected")

        if not remove_blob_object:
            self.index.remove_docs(criteria=criteria)
        else:
            to_remove = self.index.distinct(self.key, criteria=criteria)
            self.index.remove_docs(criteria=criteria)

            # Can remove up to 256 items at a time
            to_remove_chunks = list(grouper(to_remove, n=256))
            for chunk_to_remove in to_remove_chunks:
                objlist = [{"name": f"{self.sub_dir}{obj}"} for obj in chunk_to_remove]
                self.container.delete_blobs(*objlist)

    @property
    def last_updated(self):
        return self.index.last_updated

    def newer_in(self, target: Store, criteria: Optional[dict] = None, exhaustive: bool = False) -> list[str]:
        """
        Returns the keys of documents that are newer in the target
        Store than this Store.

        Args:
            target: target Store
            criteria: PyMongo filter for documents to search in
            exhaustive: triggers an item-by-item check vs. checking
                        the last_updated of the target Store and using
                        that to filter out new items in
        """
        if hasattr(target, "index"):
            return self.index.newer_in(target=target.index, criteria=criteria, exhaustive=exhaustive)

        return self.index.newer_in(target=target, criteria=criteria, exhaustive=exhaustive)

    def __hash__(self):
        return hash((self.index.__hash__, self.container_name))

    def rebuild_index_from_blob_data(self, **kwargs):
        """
        Rebuilds the index Store from the data in Azure
        Relies on the index document being stores as the metadata for the file
        This can help recover lost databases.
        """
        objects = self.container.list_blobs(name_starts_with=self.sub_dir)
        for obj in objects:
            # handle the case where there are subdirs in the chosen container
            # but are below the level of the current subdir
            dir_name = os.path.dirname(obj.name)
            if dir_name != self.sub_dir:
                continue

            data = self.container.download_blob(obj.name).readall()

            if self.compress:
                data = zlib.decompress(data)
            unpacked_data = msgpack.unpackb(data, raw=False)
            # TODO maybe it can be avoided to reupload the data, since it is paid
            self.update(unpacked_data, **kwargs)

    def rebuild_metadata_from_index(self, index_query: Optional[dict] = None):
        """
        Read data from the index store and populate the metadata of the Azure Blob.
        Force all of the keys to be lower case to be Minio compatible

        Args:
            index_query: query on the index store.
        """
        if self.container is None or self.service is None:
            raise RuntimeError("The store has not been connected")

        qq = {} if index_query is None else index_query
        for index_doc in self.index.query(qq):
            key_ = self.sub_dir + index_doc[self.key]
            blob = self.container.get_blob_client(key_)
            properties = blob.get_blob_properties()
            new_meta = {self._sanitize_key(k): v for k, v in properties.metadata.items()}
            for k, v in index_doc.items():
                new_meta[str(k).lower()] = v
            new_meta.pop("_id")
            if self.last_updated_field in new_meta:
                new_meta[self.last_updated_field] = str(to_isoformat_ceil_ms(new_meta[self.last_updated_field]))
            blob.set_blob_metadata(new_meta)

    def __eq__(self, other: object) -> bool:
        """
        Check equality for AzureBlobStore
        other: other AzureBlobStore to compare with.
        """
        if not isinstance(other, AzureBlobStore):
            return False

        fields = ["index", "container_name", "last_updated_field"]
        return all(getattr(self, f) == getattr(other, f) for f in fields)

`name: str` `property` ¶

Returns:

Type	Description
`str`	a string representing this data source.

`eq(other)` ¶

Check equality for AzureBlobStore other: other AzureBlobStore to compare with.

Source code in src/maggma/stores/azure.py

def __eq__(self, other: object) -> bool:
    """
    Check equality for AzureBlobStore
    other: other AzureBlobStore to compare with.
    """
    if not isinstance(other, AzureBlobStore):
        return False

    fields = ["index", "container_name", "last_updated_field"]
    return all(getattr(self, f) == getattr(other, f) for f in fields)

`init(index, container_name, azure_client_info=None, compress=False, sub_dir=None, workers=1, azure_resource_kwargs=None, key='fs_id', store_hash=True, unpack_data=True, searchable_fields=None, key_sanitize_dict=None, create_container=False, **kwargs)` ¶

Initializes an AzureBlob Store.

Parameters:

Name	Type	Description	Default
`index`	`Store`	a store to use to index the Azure blob	required
`container_name`	`str`	name of the container	required
`azure_client_info`	`Optional[Union[str, dict]]`	connection_url of the BlobServiceClient if a string. Assumes that the access is passwordless in that case. Otherwise, if a dictionary, options to instantiate the BlobServiceClient. Currently supported keywords: - connection_string: a connection string for the Azure blob	`None`
`compress`	`bool`	compress files inserted into the store	`False`
`sub_dir`	`Optional[str]`	(optional) subdirectory of the container to store the data. When defined, a final "/" will be added if not already present.	`None`
`workers`	`int`	number of concurrent Azure puts to run	`1`
`store_hash`	`bool`	store the sha1 hash right before insertion to the database.	`True`
`unpack_data`	`bool`	whether to decompress and unpack byte data when querying from the container.	`True`
`searchable_fields`	`Optional[list[str]]`	fields to keep in the index store	`None`
`key_sanitize_dict`	`Optional[dict]`	a dictionary that allows to customize the sanitization of the keys in metadata, since they should adhere to the naming rules for C# identifiers. If None the AZURE_KEY_SANITIZE default will be used to handle the most common cases.	`None`
`create_container`	`bool`	if True the Store creates the container, in case it does not exist.	`False`
`kwargs`		keywords for the base Store.	`{}`

Source code in src/maggma/stores/azure.py

def __init__(
    self,
    index: Store,
    container_name: str,
    azure_client_info: Optional[Union[str, dict]] = None,
    compress: bool = False,
    sub_dir: Optional[str] = None,
    workers: int = 1,
    azure_resource_kwargs: Optional[dict] = None,
    key: str = "fs_id",
    store_hash: bool = True,
    unpack_data: bool = True,
    searchable_fields: Optional[list[str]] = None,
    key_sanitize_dict: Optional[dict] = None,
    create_container: bool = False,
    **kwargs,
):
    """
    Initializes an AzureBlob Store.

    Args:
        index: a store to use to index the Azure blob
        container_name: name of the container
        azure_client_info: connection_url of the BlobServiceClient if a string.
            Assumes that the access is passwordless in that case.
            Otherwise, if a dictionary, options to instantiate the
            BlobServiceClient.
            Currently supported keywords:
                - connection_string: a connection string for the Azure blob
        compress: compress files inserted into the store
        sub_dir: (optional)  subdirectory of the container to store the data.
            When defined, a final "/" will be added if not already present.
        workers: number of concurrent Azure puts to run
        store_hash: store the sha1 hash right before insertion to the database.
        unpack_data: whether to decompress and unpack byte data when querying from
            the container.
        searchable_fields: fields to keep in the index store
        key_sanitize_dict: a dictionary that allows to customize the sanitization
            of the keys in metadata, since they should adhere to the naming rules
            for C# identifiers. If None the AZURE_KEY_SANITIZE default will be used
            to handle the most common cases.
        create_container: if True the Store creates the container, in case it does
            not exist.
        kwargs: keywords for the base Store.
    """
    if azure_blob is None:
        raise RuntimeError("azure-storage-blob and azure-identity are required for AzureBlobStore")

    self.index = index
    self.container_name = container_name
    self.azure_client_info = azure_client_info
    self.compress = compress
    self.sub_dir = sub_dir.rstrip("/") + "/" if sub_dir else ""
    self.service: Optional[BlobServiceClient] = None
    self.container: Optional[ContainerClient] = None
    self.workers = workers
    self.azure_resource_kwargs = azure_resource_kwargs if azure_resource_kwargs is not None else {}
    self.unpack_data = unpack_data
    self.searchable_fields = searchable_fields if searchable_fields is not None else []
    self.store_hash = store_hash
    if key_sanitize_dict is None:
        key_sanitize_dict = AZURE_KEY_SANITIZE
    self.key_sanitize_dict = key_sanitize_dict
    self.create_container = create_container

    # Force the key to be the same as the index
    assert isinstance(
        index.key, str
    ), "Since we are using the key as a file name in Azure Blob, the key must be a string"
    if key != index.key:
        warnings.warn(
            f'The desired AzureBlobStore key "{key}" does not match the index key "{index.key},"'
            "the index key will be used",
            UserWarning,
        )
    kwargs["key"] = str(index.key)

    self._thread_local = threading.local()
    super().__init__(**kwargs)

`close()` ¶

Closes any connections.

Source code in src/maggma/stores/azure.py

def close(self):
    """
    Closes any connections.
    """
    self.index.close()
    self.service = None
    self.container = None

`connect(*args, **kwargs)` ¶

Connect to the source data.

Source code in src/maggma/stores/azure.py

def connect(self, *args, **kwargs):  # lgtm[py/conflicting-attributes]
    """
    Connect to the source data.
    """
    service_client = self._get_service_client()

    if not self.service:
        self.service = service_client
        container = service_client.get_container_client(self.container_name)
        if not container.exists():
            if self.create_container:
                # catch the exception to avoid errors if already created
                try:
                    container.create_container()
                except ResourceExistsError:
                    pass
            else:
                raise RuntimeError(f"Container not present on Azure: {self.container_name}")

        self.container = container
    self.index.connect(*args, **kwargs)

`count(criteria=None)` ¶

Counts the number of documents matching the query criteria.

Parameters:

Name	Type	Description	Default
`criteria`	`Optional[dict]`	PyMongo filter for documents to count in	`None`

Source code in src/maggma/stores/azure.py

def count(self, criteria: Optional[dict] = None) -> int:
    """
    Counts the number of documents matching the query criteria.

    Args:
        criteria: PyMongo filter for documents to count in
    """
    return self.index.count(criteria)

`distinct(field, criteria=None, all_exist=False)` ¶

Get all distinct values for a field.

Parameters:

Name	Type	Description	Default
`field`	`str`	the field(s) to get distinct values for	required
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`

Source code in src/maggma/stores/azure.py

def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list:
    """
    Get all distinct values for a field.

    Args:
        field: the field(s) to get distinct values for
        criteria: PyMongo filter for documents to search in
    """
    # Index is a store so it should have its own distinct function
    return self.index.distinct(field, criteria=criteria)

`ensure_index(key, unique=False)` ¶

Tries to create an index and return true if it succeeded.

Parameters:

Name	Type	Description	Default
`key`	`str`	single key to index	required
`unique`	`bool`	Whether or not this index contains only unique keys	`False`

Returns:

Type	Description
`bool`	bool indicating if the index exists/was created

Source code in src/maggma/stores/azure.py

def ensure_index(self, key: str, unique: bool = False) -> bool:
    """
    Tries to create an index and return true if it succeeded.

    Args:
        key: single key to index
        unique: Whether or not this index contains only unique keys

    Returns:
        bool indicating if the index exists/was created
    """
    return self.index.ensure_index(key, unique=unique)

`groupby(keys, criteria=None, properties=None, sort=None, skip=0, limit=0)` ¶

Simple grouping function that will group documents by keys.

Parameters:

Name	Type	Description	Default
`keys`	`Union[list[str], str]`	fields to group documents	required
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`
`properties`	`Union[dict, list, None]`	properties to return in grouped documents	`None`
`sort`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending.	`None`
`skip`	`int`	number documents to skip	`0`
`limit`	`int`	limit on total number of documents returned	`0`

Returns:

Type	Description
`Iterator[tuple[dict, list[dict]]]`	generator returning tuples of (dict, list of docs)

Source code in src/maggma/stores/azure.py

def groupby(
    self,
    keys: Union[list[str], str],
    criteria: Optional[dict] = None,
    properties: Union[dict, list, None] = None,
    sort: Optional[dict[str, Union[Sort, int]]] = None,
    skip: int = 0,
    limit: int = 0,
) -> Iterator[tuple[dict, list[dict]]]:
    """
    Simple grouping function that will group documents
    by keys.

    Args:
        keys: fields to group documents
        criteria: PyMongo filter for documents to search in
        properties: properties to return in grouped documents
        sort: Dictionary of sort order for fields. Keys are field names and
            values are 1 for ascending or -1 for descending.
        skip: number documents to skip
        limit: limit on total number of documents returned

    Returns:
        generator returning tuples of (dict, list of docs)
    """
    return self.index.groupby(
        keys=keys,
        criteria=criteria,
        properties=properties,
        sort=sort,
        skip=skip,
        limit=limit,
    )

`newer_in(target, criteria=None, exhaustive=False)` ¶

Returns the keys of documents that are newer in the target Store than this Store.

Parameters:

Name	Type	Description	Default
`target`	`Store`	target Store	required
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`
`exhaustive`	`bool`	triggers an item-by-item check vs. checking the last_updated of the target Store and using that to filter out new items in	`False`

Source code in src/maggma/stores/azure.py

def newer_in(self, target: Store, criteria: Optional[dict] = None, exhaustive: bool = False) -> list[str]:
    """
    Returns the keys of documents that are newer in the target
    Store than this Store.

    Args:
        target: target Store
        criteria: PyMongo filter for documents to search in
        exhaustive: triggers an item-by-item check vs. checking
                    the last_updated of the target Store and using
                    that to filter out new items in
    """
    if hasattr(target, "index"):
        return self.index.newer_in(target=target.index, criteria=criteria, exhaustive=exhaustive)

    return self.index.newer_in(target=target, criteria=criteria, exhaustive=exhaustive)

`query(criteria=None, properties=None, sort=None, skip=0, limit=0)` ¶

Queries the Store for a set of documents.

Parameters:

Name	Type	Description	Default
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`
`properties`	`Union[dict, list, None]`	properties to return in grouped documents	`None`
`sort`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending.	`None`
`skip`	`int`	number documents to skip	`0`
`limit`	`int`	limit on total number of documents returned	`0`

Source code in src/maggma/stores/azure.py

def query(
    self,
    criteria: Optional[dict] = None,
    properties: Union[dict, list, None] = None,
    sort: Optional[dict[str, Union[Sort, int]]] = None,
    skip: int = 0,
    limit: int = 0,
) -> Iterator[dict]:
    """
    Queries the Store for a set of documents.

    Args:
        criteria: PyMongo filter for documents to search in
        properties: properties to return in grouped documents
        sort: Dictionary of sort order for fields. Keys are field names and
            values are 1 for ascending or -1 for descending.
        skip: number documents to skip
        limit: limit on total number of documents returned

    """
    if self.container is None or self.service is None:
        raise RuntimeError("The store has not been connected")

    prop_keys = set()
    if isinstance(properties, dict):
        prop_keys = set(properties.keys())
    elif isinstance(properties, list):
        prop_keys = set(properties)

    for doc in self.index.query(criteria=criteria, sort=sort, limit=limit, skip=skip):
        if properties is not None and prop_keys.issubset(set(doc.keys())):
            yield {p: doc[p] for p in properties if p in doc}
        else:
            try:
                data = self.container.download_blob(self.sub_dir + str(doc[self.key])).readall()
            except azure.core.exceptions.ResourceNotFoundError:
                self.logger.error(f"Could not find Blob object {doc[self.key]}")

            if self.unpack_data:
                data = self._unpack(data=data, compressed=doc.get("compression", "") == "zlib")

                if self.last_updated_field in doc:
                    data[self.last_updated_field] = doc[self.last_updated_field]  # type: ignore

            yield data  # type: ignore

`rebuild_index_from_blob_data(**kwargs)` ¶

Rebuilds the index Store from the data in Azure Relies on the index document being stores as the metadata for the file This can help recover lost databases.

Source code in src/maggma/stores/azure.py

def rebuild_index_from_blob_data(self, **kwargs):
    """
    Rebuilds the index Store from the data in Azure
    Relies on the index document being stores as the metadata for the file
    This can help recover lost databases.
    """
    objects = self.container.list_blobs(name_starts_with=self.sub_dir)
    for obj in objects:
        # handle the case where there are subdirs in the chosen container
        # but are below the level of the current subdir
        dir_name = os.path.dirname(obj.name)
        if dir_name != self.sub_dir:
            continue

        data = self.container.download_blob(obj.name).readall()

        if self.compress:
            data = zlib.decompress(data)
        unpacked_data = msgpack.unpackb(data, raw=False)
        # TODO maybe it can be avoided to reupload the data, since it is paid
        self.update(unpacked_data, **kwargs)

`rebuild_metadata_from_index(index_query=None)` ¶

Read data from the index store and populate the metadata of the Azure Blob. Force all of the keys to be lower case to be Minio compatible

Parameters:

Name	Type	Description	Default
`index_query`	`Optional[dict]`	query on the index store.	`None`

Source code in src/maggma/stores/azure.py

def rebuild_metadata_from_index(self, index_query: Optional[dict] = None):
    """
    Read data from the index store and populate the metadata of the Azure Blob.
    Force all of the keys to be lower case to be Minio compatible

    Args:
        index_query: query on the index store.
    """
    if self.container is None or self.service is None:
        raise RuntimeError("The store has not been connected")

    qq = {} if index_query is None else index_query
    for index_doc in self.index.query(qq):
        key_ = self.sub_dir + index_doc[self.key]
        blob = self.container.get_blob_client(key_)
        properties = blob.get_blob_properties()
        new_meta = {self._sanitize_key(k): v for k, v in properties.metadata.items()}
        for k, v in index_doc.items():
            new_meta[str(k).lower()] = v
        new_meta.pop("_id")
        if self.last_updated_field in new_meta:
            new_meta[self.last_updated_field] = str(to_isoformat_ceil_ms(new_meta[self.last_updated_field]))
        blob.set_blob_metadata(new_meta)

`remove_docs(criteria, remove_blob_object=False)` ¶

Remove docs matching the query dictionary.

Parameters:

Name	Type	Description	Default
`criteria`	`dict`	query dictionary to match	required
`remove_blob_object`	`bool`	whether to remove the actual blob Object or not	`False`

Source code in src/maggma/stores/azure.py

def remove_docs(self, criteria: dict, remove_blob_object: bool = False):
    """
    Remove docs matching the query dictionary.

    Args:
        criteria: query dictionary to match
        remove_blob_object: whether to remove the actual blob Object or not
    """
    if self.container is None or self.service is None:
        raise RuntimeError("The store has not been connected")

    if not remove_blob_object:
        self.index.remove_docs(criteria=criteria)
    else:
        to_remove = self.index.distinct(self.key, criteria=criteria)
        self.index.remove_docs(criteria=criteria)

        # Can remove up to 256 items at a time
        to_remove_chunks = list(grouper(to_remove, n=256))
        for chunk_to_remove in to_remove_chunks:
            objlist = [{"name": f"{self.sub_dir}{obj}"} for obj in chunk_to_remove]
            self.container.delete_blobs(*objlist)

`update(docs, key=None, additional_metadata=None)` ¶

Update documents into the Store.

Parameters:

Name	Type	Description	Default
`docs`	`Union[list[dict], dict]`	the document or list of documents to update	required
`key`	`Union[list, str, None]`	field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used	`None`
`additional_metadata`	`Union[str, list[str], None]`	field(s) to include in the blob store's metadata	`None`

Source code in src/maggma/stores/azure.py

def update(
    self,
    docs: Union[list[dict], dict],
    key: Union[list, str, None] = None,
    additional_metadata: Union[str, list[str], None] = None,
):
    """
    Update documents into the Store.

    Args:
        docs: the document or list of documents to update
        key: field name(s) to determine uniqueness for a
             document, can be a list of multiple fields,
             a single field, or None if the Store's key
             field is to be used
        additional_metadata: field(s) to include in the blob store's metadata
    """
    if self.container is None or self.service is None:
        raise RuntimeError("The store has not been connected")

    if not isinstance(docs, list):
        docs = [docs]

    if isinstance(key, str):
        key = [key]
    elif not key:
        key = [self.key]

    if additional_metadata is None:
        additional_metadata = []
    elif isinstance(additional_metadata, str):
        additional_metadata = [additional_metadata]
    else:
        additional_metadata = list(additional_metadata)

    with ThreadPoolExecutor(max_workers=self.workers) as pool:
        fs = {
            pool.submit(
                self.write_doc_to_blob,
                doc=itr_doc,
                search_keys=key + additional_metadata + self.searchable_fields,
            )
            for itr_doc in docs
        }
        fs, _ = wait(fs)

        search_docs = [sdoc.result() for sdoc in fs]

    # Use store's update to remove key clashes
    self.index.update(search_docs, key=self.key)

`write_doc_to_blob(doc, search_keys)` ¶

Write the data to an Azure blob and return the metadata to be inserted into the index db.

Parameters:

Name	Type	Description	Default
`doc`	`dict`	the document	required
`search_keys`	`list[str]`	list of keys to pull from the docs and be inserted into the	required

Source code in src/maggma/stores/azure.py

def write_doc_to_blob(self, doc: dict, search_keys: list[str]):
    """
    Write the data to an Azure blob and return the metadata to be inserted into the index db.

    Args:
        doc: the document
        search_keys: list of keys to pull from the docs and be inserted into the
        index db
    """
    container = self._get_container()
    if container is None:
        raise RuntimeError("The store has not been connected")

    search_doc = {k: doc[k] for k in search_keys}
    search_doc[self.key] = doc[self.key]  # Ensure key is in metadata
    if self.sub_dir != "":
        search_doc["sub_dir"] = self.sub_dir

    # Remove MongoDB _id from search
    if "_id" in search_doc:
        del search_doc["_id"]

    # to make hashing more meaningful, make sure last updated field is removed
    lu_info = doc.pop(self.last_updated_field, None)
    data = msgpack.packb(doc, default=monty_default)

    if self.compress:
        # Compress with zlib if chosen
        search_doc["compression"] = "zlib"
        data = zlib.compress(data)

    if self.last_updated_field in doc:
        # need this conversion for metadata insert
        search_doc[self.last_updated_field] = str(to_isoformat_ceil_ms(doc[self.last_updated_field]))

    # keep a record of original keys, in case these are important for the individual researcher
    # it is not expected that this information will be used except in disaster recovery
    blob_to_mongo_keys = {k: self._sanitize_key(k) for k in search_doc}
    blob_to_mongo_keys["blob_to_mongo_keys"] = "blob_to_mongo_keys"  # inception
    # encode dictionary since values have to be strings
    search_doc["blob_to_mongo_keys"] = dumps(blob_to_mongo_keys)

    container.upload_blob(
        name=self.sub_dir + str(doc[self.key]),
        data=data,
        metadata={blob_to_mongo_keys[k]: str(v) for k, v in search_doc.items()},
        overwrite=True,
    )

    if lu_info is not None:
        search_doc[self.last_updated_field] = lu_info

    if self.store_hash:
        hasher = sha1()
        hasher.update(data)
        obj_hash = hasher.hexdigest()
        search_doc["obj_hash"] = obj_hash
    return search_doc

Advanced Stores for behavior outside normal access patterns.

`AliasingStore` ¶

Bases: Store

Special Store that aliases for the primary accessors.

Source code in src/maggma/stores/advanced_stores.py

class AliasingStore(Store):
    """
    Special Store that aliases for the primary accessors.
    """

    def __init__(self, store: Store, aliases: dict, **kwargs):
        """
        Args:
            store: the store to wrap around
            aliases: dict of aliases of the form external key: internal key.
        """
        self.store = store
        # Given an external key tells what the internal key is
        self.aliases = aliases
        # Given the internal key tells us what the external key is
        self.reverse_aliases = {v: k for k, v in aliases.items()}
        self.kwargs = kwargs

        kwargs.update(
            {
                "last_updated_field": store.last_updated_field,
                "last_updated_type": store.last_updated_type,
            }
        )
        super().__init__(**kwargs)

    @property
    def name(self) -> str:
        """
        Return a string representing this data source.
        """
        return self.store.name

    def count(self, criteria: Optional[dict] = None) -> int:
        """
        Counts the number of documents matching the query criteria.

        Args:
            criteria: PyMongo filter for documents to count in
        """
        criteria = criteria if criteria else {}
        lazy_substitute(criteria, self.reverse_aliases)
        return self.store.count(criteria)

    def query(
        self,
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        skip: int = 0,
        limit: int = 0,
    ) -> Iterator[dict]:
        """
        Queries the Store for a set of documents.

        Args:
            criteria: PyMongo filter for documents to search in
            properties: properties to return in grouped documents
            sort: Dictionary of sort order for fields. Keys are field names and
                values are 1 for ascending or -1 for descending.
            skip: number documents to skip
            limit: limit on total number of documents returned
        """
        criteria = criteria if criteria else {}

        if properties is not None:
            if isinstance(properties, list):
                properties = {p: 1 for p in properties}
            substitute(properties, self.reverse_aliases)

        lazy_substitute(criteria, self.reverse_aliases)
        for d in self.store.query(properties=properties, criteria=criteria, sort=sort, limit=limit, skip=skip):
            substitute(d, self.aliases)
            yield d

    def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list:
        """
        Get all distinct values for a field.

        Args:
            field: the field(s) to get distinct values for
            criteria: PyMongo filter for documents to search in
        """
        criteria = criteria if criteria else {}
        lazy_substitute(criteria, self.reverse_aliases)

        # substitute forward
        return self.store.distinct(self.aliases[field], criteria=criteria)

    def groupby(
        self,
        keys: Union[list[str], str],
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        skip: int = 0,
        limit: int = 0,
    ) -> Iterator[tuple[dict, list[dict]]]:
        """
        Simple grouping function that will group documents
        by keys.

        Args:
            keys: fields to group documents
            criteria: PyMongo filter for documents to search in
            properties: properties to return in grouped documents
            sort: Dictionary of sort order for fields. Keys are field names and
                values are 1 for ascending or -1 for descending.
            skip: number documents to skip
            limit: limit on total number of documents returned

        Returns:
            generator returning tuples of (dict, list of docs)
        """
        # Convert to a list
        keys = keys if isinstance(keys, list) else [keys]

        # Make the aliasing transformations on keys
        keys = [self.aliases.get(k, k) for k in keys]

        # Update criteria and properties based on aliases
        criteria = criteria if criteria else {}

        if properties is not None:
            if isinstance(properties, list):
                properties = {p: 1 for p in properties}
            substitute(properties, self.reverse_aliases)

        lazy_substitute(criteria, self.reverse_aliases)

        return self.store.groupby(keys=keys, properties=properties, criteria=criteria, skip=skip, limit=limit)

    def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None):
        """
        Update documents into the Store.

        Args:
            docs: the document or list of documents to update
            key: field name(s) to determine uniqueness for a
                 document, can be a list of multiple fields,
                 a single field, or None if the Store's key
                 field is to be used
        """
        key = key if key else self.key

        for d in docs:
            substitute(d, self.reverse_aliases)

        if key in self.aliases:
            key = self.aliases[key]

        self.store.update(docs, key=key)

    def remove_docs(self, criteria: dict):
        """
        Remove docs matching the query dictionary.

        Args:
            criteria: query dictionary to match
        """
        # Update criteria and properties based on aliases
        lazy_substitute(criteria, self.reverse_aliases)
        self.store.remove_docs(criteria)

    def ensure_index(self, key, unique=False, **kwargs):
        if key in self.aliases:
            key = self.aliases
        return self.store.ensure_index(key, unique, **kwargs)

    def close(self):
        self.store.close()

    @property
    def _collection(self):
        return self.store._collection

    def connect(self, force_reset=False):
        self.store.connect(force_reset=force_reset)

    def __eq__(self, other: object) -> bool:
        """
        Check equality for AliasingStore.

        Args:
            other: other AliasingStore to compare with
        """
        if not isinstance(other, AliasingStore):
            return False

        fields = ["store", "aliases", "last_updated_field"]
        return all(getattr(self, f) == getattr(other, f) for f in fields)

`name: str` `property` ¶

Return a string representing this data source.

`eq(other)` ¶

Check equality for AliasingStore.

Parameters:

Name	Type	Description	Default
`other`	`object`	other AliasingStore to compare with	required

Source code in src/maggma/stores/advanced_stores.py

def __eq__(self, other: object) -> bool:
    """
    Check equality for AliasingStore.

    Args:
        other: other AliasingStore to compare with
    """
    if not isinstance(other, AliasingStore):
        return False

    fields = ["store", "aliases", "last_updated_field"]
    return all(getattr(self, f) == getattr(other, f) for f in fields)

`init(store, aliases, **kwargs)` ¶

Parameters:

Name	Type	Description	Default
`store`	`Store`	the store to wrap around	required
`aliases`	`dict`	dict of aliases of the form external key: internal key.	required

Source code in src/maggma/stores/advanced_stores.py

def __init__(self, store: Store, aliases: dict, **kwargs):
    """
    Args:
        store: the store to wrap around
        aliases: dict of aliases of the form external key: internal key.
    """
    self.store = store
    # Given an external key tells what the internal key is
    self.aliases = aliases
    # Given the internal key tells us what the external key is
    self.reverse_aliases = {v: k for k, v in aliases.items()}
    self.kwargs = kwargs

    kwargs.update(
        {
            "last_updated_field": store.last_updated_field,
            "last_updated_type": store.last_updated_type,
        }
    )
    super().__init__(**kwargs)

`count(criteria=None)` ¶

Counts the number of documents matching the query criteria.

Parameters:

Name	Type	Description	Default
`criteria`	`Optional[dict]`	PyMongo filter for documents to count in	`None`

Source code in src/maggma/stores/advanced_stores.py

def count(self, criteria: Optional[dict] = None) -> int:
    """
    Counts the number of documents matching the query criteria.

    Args:
        criteria: PyMongo filter for documents to count in
    """
    criteria = criteria if criteria else {}
    lazy_substitute(criteria, self.reverse_aliases)
    return self.store.count(criteria)

`distinct(field, criteria=None, all_exist=False)` ¶

Get all distinct values for a field.

Parameters:

Name	Type	Description	Default
`field`	`str`	the field(s) to get distinct values for	required
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`

Source code in src/maggma/stores/advanced_stores.py

def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list:
    """
    Get all distinct values for a field.

    Args:
        field: the field(s) to get distinct values for
        criteria: PyMongo filter for documents to search in
    """
    criteria = criteria if criteria else {}
    lazy_substitute(criteria, self.reverse_aliases)

    # substitute forward
    return self.store.distinct(self.aliases[field], criteria=criteria)

`groupby(keys, criteria=None, properties=None, sort=None, skip=0, limit=0)` ¶

Simple grouping function that will group documents by keys.

Parameters:

Name	Type	Description	Default
`keys`	`Union[list[str], str]`	fields to group documents	required
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`
`properties`	`Union[dict, list, None]`	properties to return in grouped documents	`None`
`sort`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending.	`None`
`skip`	`int`	number documents to skip	`0`
`limit`	`int`	limit on total number of documents returned	`0`

Returns:

Type	Description
`Iterator[tuple[dict, list[dict]]]`	generator returning tuples of (dict, list of docs)

Source code in src/maggma/stores/advanced_stores.py

def groupby(
    self,
    keys: Union[list[str], str],
    criteria: Optional[dict] = None,
    properties: Union[dict, list, None] = None,
    sort: Optional[dict[str, Union[Sort, int]]] = None,
    skip: int = 0,
    limit: int = 0,
) -> Iterator[tuple[dict, list[dict]]]:
    """
    Simple grouping function that will group documents
    by keys.

    Args:
        keys: fields to group documents
        criteria: PyMongo filter for documents to search in
        properties: properties to return in grouped documents
        sort: Dictionary of sort order for fields. Keys are field names and
            values are 1 for ascending or -1 for descending.
        skip: number documents to skip
        limit: limit on total number of documents returned

    Returns:
        generator returning tuples of (dict, list of docs)
    """
    # Convert to a list
    keys = keys if isinstance(keys, list) else [keys]

    # Make the aliasing transformations on keys
    keys = [self.aliases.get(k, k) for k in keys]

    # Update criteria and properties based on aliases
    criteria = criteria if criteria else {}

    if properties is not None:
        if isinstance(properties, list):
            properties = {p: 1 for p in properties}
        substitute(properties, self.reverse_aliases)

    lazy_substitute(criteria, self.reverse_aliases)

    return self.store.groupby(keys=keys, properties=properties, criteria=criteria, skip=skip, limit=limit)

`query(criteria=None, properties=None, sort=None, skip=0, limit=0)` ¶

Queries the Store for a set of documents.

Parameters:

Name	Type	Description	Default
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`
`properties`	`Union[dict, list, None]`	properties to return in grouped documents	`None`
`sort`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending.	`None`
`skip`	`int`	number documents to skip	`0`
`limit`	`int`	limit on total number of documents returned	`0`

Source code in src/maggma/stores/advanced_stores.py

def query(
    self,
    criteria: Optional[dict] = None,
    properties: Union[dict, list, None] = None,
    sort: Optional[dict[str, Union[Sort, int]]] = None,
    skip: int = 0,
    limit: int = 0,
) -> Iterator[dict]:
    """
    Queries the Store for a set of documents.

    Args:
        criteria: PyMongo filter for documents to search in
        properties: properties to return in grouped documents
        sort: Dictionary of sort order for fields. Keys are field names and
            values are 1 for ascending or -1 for descending.
        skip: number documents to skip
        limit: limit on total number of documents returned
    """
    criteria = criteria if criteria else {}

    if properties is not None:
        if isinstance(properties, list):
            properties = {p: 1 for p in properties}
        substitute(properties, self.reverse_aliases)

    lazy_substitute(criteria, self.reverse_aliases)
    for d in self.store.query(properties=properties, criteria=criteria, sort=sort, limit=limit, skip=skip):
        substitute(d, self.aliases)
        yield d

`remove_docs(criteria)` ¶

Remove docs matching the query dictionary.

Parameters:

Name	Type	Description	Default
`criteria`	`dict`	query dictionary to match	required

Source code in src/maggma/stores/advanced_stores.py

def remove_docs(self, criteria: dict):
    """
    Remove docs matching the query dictionary.

    Args:
        criteria: query dictionary to match
    """
    # Update criteria and properties based on aliases
    lazy_substitute(criteria, self.reverse_aliases)
    self.store.remove_docs(criteria)

`update(docs, key=None)` ¶

Update documents into the Store.

Parameters:

Name	Type	Description	Default
`docs`	`Union[list[dict], dict]`	the document or list of documents to update	required
`key`	`Union[list, str, None]`	field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used	`None`

Source code in src/maggma/stores/advanced_stores.py

def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None):
    """
    Update documents into the Store.

    Args:
        docs: the document or list of documents to update
        key: field name(s) to determine uniqueness for a
             document, can be a list of multiple fields,
             a single field, or None if the Store's key
             field is to be used
    """
    key = key if key else self.key

    for d in docs:
        substitute(d, self.reverse_aliases)

    if key in self.aliases:
        key = self.aliases[key]

    self.store.update(docs, key=key)

`MongograntStore` ¶

Bases: MongoStore

Initialize a Store with a mongogrant "<role>:<host>/<db>." spec.

Some class methods of MongoStore, e.g. from_db_file and from_collection, are not supported.

mongogrant documentation: https://github.com/materialsproject/mongogrant

Source code in src/maggma/stores/advanced_stores.py

@deprecated(MongoStore)
class MongograntStore(MongoStore):
    """Initialize a Store with a mongogrant "`<role>`:`<host>`/`<db>`." spec.

    Some class methods of MongoStore, e.g. from_db_file and from_collection,
    are not supported.

    mongogrant documentation: https://github.com/materialsproject/mongogrant
    """

    @requires(
        Client is not None,
        "mongogrant is required to use MongoGrantStore. Please run `pip install maggma[mongogrant]",
    )
    def __init__(
        self,
        mongogrant_spec: str,
        collection_name: str,
        mgclient_config_path: Optional[str] = None,
        **kwargs,
    ):
        """
        Args:
            mongogrant_spec: of the form `<role>`:`<host>`/`<db>`, where
                role is one of {"read", "readWrite"} or aliases {"ro", "rw"};
                host is a db host (w/ optional port) or alias; and db is a db
                on that host, or alias. See mongogrant documentation.
            collection_name: name of mongo collection
            mgclient_config_path: Path to mongogrant client config file,
               or None if default path (`mongogrant.client.path`).
        """
        self.mongogrant_spec = mongogrant_spec
        self.collection_name = collection_name
        self.mgclient_config_path = mgclient_config_path
        self._coll = None

        if self.mgclient_config_path:
            config = Config(check=check, path=self.mgclient_config_path)
            client = Client(config)
        else:
            client = Client()

        if {"username", "password", "database", "host"} & set(kwargs):
            raise StoreError(
                "MongograntStore does not accept "
                "username, password, database, or host "
                "arguments. Use `mongogrant_spec`."
            )

        self.kwargs = kwargs
        _auth_info = client.get_db_auth_from_spec(self.mongogrant_spec)
        super().__init__(
            host=_auth_info["host"],
            database=_auth_info["authSource"],
            username=_auth_info["username"],
            password=_auth_info["password"],
            collection_name=self.collection_name,
            **kwargs,
        )

    @property
    def name(self):
        return f"mgrant://{self.mongogrant_spec}/{self.collection_name}"

    def __hash__(self):
        return hash((self.mongogrant_spec, self.collection_name, self.last_updated_field))

    @classmethod
    def from_db_file(cls, file):
        """
        Raises ValueError since MongograntStores can't be initialized from a file.
        """
        raise ValueError("MongograntStore doesn't implement from_db_file")

    @classmethod
    def from_collection(cls, collection):
        """
        Raises ValueError since MongograntStores can't be initialized from a PyMongo collection.
        """
        raise ValueError("MongograntStore doesn't implement from_collection")

    def __eq__(self, other: object) -> bool:
        """
        Check equality for MongograntStore.

        Args:
            other: other MongograntStore to compare with
        """
        if not isinstance(other, MongograntStore):
            return False

        fields = [
            "mongogrant_spec",
            "collection_name",
            "mgclient_config_path",
            "last_updated_field",
        ]
        return all(getattr(self, f) == getattr(other, f) for f in fields)

`eq(other)` ¶

Check equality for MongograntStore.

Parameters:

Name	Type	Description	Default
`other`	`object`	other MongograntStore to compare with	required

Source code in src/maggma/stores/advanced_stores.py

def __eq__(self, other: object) -> bool:
    """
    Check equality for MongograntStore.

    Args:
        other: other MongograntStore to compare with
    """
    if not isinstance(other, MongograntStore):
        return False

    fields = [
        "mongogrant_spec",
        "collection_name",
        "mgclient_config_path",
        "last_updated_field",
    ]
    return all(getattr(self, f) == getattr(other, f) for f in fields)

`init(mongogrant_spec, collection_name, mgclient_config_path=None, **kwargs)` ¶

Parameters:

Name	Type	Description	Default
`mongogrant_spec`	`str`	of the form `<role>`:`<host>`/`<db>`, where role is one of {"read", "readWrite"} or aliases {"ro", "rw"}; host is a db host (w/ optional port) or alias; and db is a db on that host, or alias. See mongogrant documentation.	required
`collection_name`	`str`	name of mongo collection	required
`mgclient_config_path`	`Optional[str]`	Path to mongogrant client config file, or None if default path (`mongogrant.client.path`).	`None`

Source code in src/maggma/stores/advanced_stores.py

@requires(
    Client is not None,
    "mongogrant is required to use MongoGrantStore. Please run `pip install maggma[mongogrant]",
)
def __init__(
    self,
    mongogrant_spec: str,
    collection_name: str,
    mgclient_config_path: Optional[str] = None,
    **kwargs,
):
    """
    Args:
        mongogrant_spec: of the form `<role>`:`<host>`/`<db>`, where
            role is one of {"read", "readWrite"} or aliases {"ro", "rw"};
            host is a db host (w/ optional port) or alias; and db is a db
            on that host, or alias. See mongogrant documentation.
        collection_name: name of mongo collection
        mgclient_config_path: Path to mongogrant client config file,
           or None if default path (`mongogrant.client.path`).
    """
    self.mongogrant_spec = mongogrant_spec
    self.collection_name = collection_name
    self.mgclient_config_path = mgclient_config_path
    self._coll = None

    if self.mgclient_config_path:
        config = Config(check=check, path=self.mgclient_config_path)
        client = Client(config)
    else:
        client = Client()

    if {"username", "password", "database", "host"} & set(kwargs):
        raise StoreError(
            "MongograntStore does not accept "
            "username, password, database, or host "
            "arguments. Use `mongogrant_spec`."
        )

    self.kwargs = kwargs
    _auth_info = client.get_db_auth_from_spec(self.mongogrant_spec)
    super().__init__(
        host=_auth_info["host"],
        database=_auth_info["authSource"],
        username=_auth_info["username"],
        password=_auth_info["password"],
        collection_name=self.collection_name,
        **kwargs,
    )

`from_collection(collection)` `classmethod` ¶

Raises ValueError since MongograntStores can't be initialized from a PyMongo collection.

Source code in src/maggma/stores/advanced_stores.py

@classmethod
def from_collection(cls, collection):
    """
    Raises ValueError since MongograntStores can't be initialized from a PyMongo collection.
    """
    raise ValueError("MongograntStore doesn't implement from_collection")

`from_db_file(file)` `classmethod` ¶

Raises ValueError since MongograntStores can't be initialized from a file.

Source code in src/maggma/stores/advanced_stores.py

@classmethod
def from_db_file(cls, file):
    """
    Raises ValueError since MongograntStores can't be initialized from a file.
    """
    raise ValueError("MongograntStore doesn't implement from_db_file")

`SandboxStore` ¶

Bases: Store

Provides a sandboxed view to another store.

Source code in src/maggma/stores/advanced_stores.py

class SandboxStore(Store):
    """
    Provides a sandboxed view to another store.
    """

    def __init__(self, store: Store, sandbox: str, exclusive: bool = False):
        """
        Args:
            store: store to wrap sandboxing around
            sandbox: the corresponding sandbox
            exclusive: whether to be exclusively in this sandbox or include global items.
        """
        self.store = store
        self.sandbox = sandbox
        self.exclusive = exclusive
        super().__init__(
            key=self.store.key,
            last_updated_field=self.store.last_updated_field,
            last_updated_type=self.store.last_updated_type,
            validator=self.store.validator,
        )

    @property
    def name(self) -> str:
        """
        Returns:
            a string representing this data source.
        """
        return f"Sandbox[{self.store.name}][{self.sandbox}]"

    @property
    def sbx_criteria(self) -> dict:
        """
        Returns:
            the sandbox criteria dict used to filter the source store.
        """
        if self.exclusive:
            return {"sbxn": self.sandbox}
        return {"$or": [{"sbxn": {"$in": [self.sandbox]}}, {"sbxn": {"$exists": False}}]}

    def count(self, criteria: Optional[dict] = None) -> int:
        """
        Counts the number of documents matching the query criteria.

        Args:
            criteria: PyMongo filter for documents to count in
        """
        criteria = dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
        return self.store.count(criteria=criteria)

    def query(
        self,
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        skip: int = 0,
        limit: int = 0,
    ) -> Iterator[dict]:
        """
        Queries the Store for a set of documents.

        Args:
            criteria: PyMongo filter for documents to search in
            properties: properties to return in grouped documents
            sort: Dictionary of sort order for fields. Keys are field names and
                values are 1 for ascending or -1 for descending.
            skip: number documents to skip
            limit: limit on total number of documents returned
        """
        criteria = dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
        return self.store.query(properties=properties, criteria=criteria, sort=sort, limit=limit, skip=skip)

    def groupby(
        self,
        keys: Union[list[str], str],
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        skip: int = 0,
        limit: int = 0,
    ) -> Iterator[tuple[dict, list[dict]]]:
        """
        Simple grouping function that will group documents
        by keys.

        Args:
            keys: fields to group documents
            criteria: PyMongo filter for documents to search in
            properties: properties to return in grouped documents
            sort: Dictionary of sort order for fields. Keys are field names and
                values are 1 for ascending or -1 for descending.
            skip: number documents to skip
            limit: limit on total number of documents returned

        Returns:
            generator returning tuples of (dict, list of docs)
        """
        criteria = dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria

        return self.store.groupby(keys=keys, properties=properties, criteria=criteria, skip=skip, limit=limit)

    def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None):
        """
        Update documents into the Store.

        Args:
            docs: the document or list of documents to update
            key: field name(s) to determine uniqueness for a
                 document, can be a list of multiple fields,
                 a single field, or None if the Store's key
                 field is to be used
        """
        for d in docs:
            if "sbxn" in d:
                d["sbxn"] = list(set(d["sbxn"] + [self.sandbox]))
            else:
                d["sbxn"] = [self.sandbox]

        self.store.update(docs, key=key)

    def remove_docs(self, criteria: dict):
        """
        Remove docs matching the query dictionary.

        Args:
            criteria: query dictionary to match
        """
        # Update criteria and properties based on aliases
        criteria = dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
        self.store.remove_docs(criteria)

    def ensure_index(self, key, unique=False, **kwargs):
        return self.store.ensure_index(key, unique, **kwargs)

    def close(self):
        self.store.close()

    @property
    def _collection(self):
        return self.store._collection

    def connect(self, force_reset=False):
        self.store.connect(force_reset=force_reset)

    def __eq__(self, other: object) -> bool:
        """
        Check equality for SandboxStore.

        Args:
            other: other SandboxStore to compare with
        """
        if not isinstance(other, SandboxStore):
            return False

        fields = ["store", "sandbox", "last_updated_field"]
        return all(getattr(self, f) == getattr(other, f) for f in fields)

`name: str` `property` ¶

Returns:

Type	Description
`str`	a string representing this data source.

`sbx_criteria: dict` `property` ¶

Returns:

Type	Description
`dict`	the sandbox criteria dict used to filter the source store.

`eq(other)` ¶

Check equality for SandboxStore.

Parameters:

Name	Type	Description	Default
`other`	`object`	other SandboxStore to compare with	required

Source code in src/maggma/stores/advanced_stores.py

def __eq__(self, other: object) -> bool:
    """
    Check equality for SandboxStore.

    Args:
        other: other SandboxStore to compare with
    """
    if not isinstance(other, SandboxStore):
        return False

    fields = ["store", "sandbox", "last_updated_field"]
    return all(getattr(self, f) == getattr(other, f) for f in fields)

`init(store, sandbox, exclusive=False)` ¶

Parameters:

Name	Type	Description	Default
`store`	`Store`	store to wrap sandboxing around	required
`sandbox`	`str`	the corresponding sandbox	required
`exclusive`	`bool`	whether to be exclusively in this sandbox or include global items.	`False`

Source code in src/maggma/stores/advanced_stores.py

def __init__(self, store: Store, sandbox: str, exclusive: bool = False):
    """
    Args:
        store: store to wrap sandboxing around
        sandbox: the corresponding sandbox
        exclusive: whether to be exclusively in this sandbox or include global items.
    """
    self.store = store
    self.sandbox = sandbox
    self.exclusive = exclusive
    super().__init__(
        key=self.store.key,
        last_updated_field=self.store.last_updated_field,
        last_updated_type=self.store.last_updated_type,
        validator=self.store.validator,
    )

`count(criteria=None)` ¶

Counts the number of documents matching the query criteria.

Parameters:

Name	Type	Description	Default
`criteria`	`Optional[dict]`	PyMongo filter for documents to count in	`None`

Source code in src/maggma/stores/advanced_stores.py

def count(self, criteria: Optional[dict] = None) -> int:
    """
    Counts the number of documents matching the query criteria.

    Args:
        criteria: PyMongo filter for documents to count in
    """
    criteria = dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
    return self.store.count(criteria=criteria)

`groupby(keys, criteria=None, properties=None, sort=None, skip=0, limit=0)` ¶

Simple grouping function that will group documents by keys.

Parameters:

Name	Type	Description	Default
`keys`	`Union[list[str], str]`	fields to group documents	required
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`
`properties`	`Union[dict, list, None]`	properties to return in grouped documents	`None`
`sort`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending.	`None`
`skip`	`int`	number documents to skip	`0`
`limit`	`int`	limit on total number of documents returned	`0`

Returns:

Type	Description
`Iterator[tuple[dict, list[dict]]]`	generator returning tuples of (dict, list of docs)

Source code in src/maggma/stores/advanced_stores.py

def groupby(
    self,
    keys: Union[list[str], str],
    criteria: Optional[dict] = None,
    properties: Union[dict, list, None] = None,
    sort: Optional[dict[str, Union[Sort, int]]] = None,
    skip: int = 0,
    limit: int = 0,
) -> Iterator[tuple[dict, list[dict]]]:
    """
    Simple grouping function that will group documents
    by keys.

    Args:
        keys: fields to group documents
        criteria: PyMongo filter for documents to search in
        properties: properties to return in grouped documents
        sort: Dictionary of sort order for fields. Keys are field names and
            values are 1 for ascending or -1 for descending.
        skip: number documents to skip
        limit: limit on total number of documents returned

    Returns:
        generator returning tuples of (dict, list of docs)
    """
    criteria = dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria

    return self.store.groupby(keys=keys, properties=properties, criteria=criteria, skip=skip, limit=limit)

`query(criteria=None, properties=None, sort=None, skip=0, limit=0)` ¶

Queries the Store for a set of documents.

Parameters:

Name	Type	Description	Default
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`
`properties`	`Union[dict, list, None]`	properties to return in grouped documents	`None`
`sort`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending.	`None`
`skip`	`int`	number documents to skip	`0`
`limit`	`int`	limit on total number of documents returned	`0`

Source code in src/maggma/stores/advanced_stores.py

def query(
    self,
    criteria: Optional[dict] = None,
    properties: Union[dict, list, None] = None,
    sort: Optional[dict[str, Union[Sort, int]]] = None,
    skip: int = 0,
    limit: int = 0,
) -> Iterator[dict]:
    """
    Queries the Store for a set of documents.

    Args:
        criteria: PyMongo filter for documents to search in
        properties: properties to return in grouped documents
        sort: Dictionary of sort order for fields. Keys are field names and
            values are 1 for ascending or -1 for descending.
        skip: number documents to skip
        limit: limit on total number of documents returned
    """
    criteria = dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
    return self.store.query(properties=properties, criteria=criteria, sort=sort, limit=limit, skip=skip)

`remove_docs(criteria)` ¶

Remove docs matching the query dictionary.

Parameters:

Name	Type	Description	Default
`criteria`	`dict`	query dictionary to match	required

Source code in src/maggma/stores/advanced_stores.py

def remove_docs(self, criteria: dict):
    """
    Remove docs matching the query dictionary.

    Args:
        criteria: query dictionary to match
    """
    # Update criteria and properties based on aliases
    criteria = dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
    self.store.remove_docs(criteria)

`update(docs, key=None)` ¶

Update documents into the Store.

Parameters:

Name	Type	Description	Default
`docs`	`Union[list[dict], dict]`	the document or list of documents to update	required
`key`	`Union[list, str, None]`	field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used	`None`

Source code in src/maggma/stores/advanced_stores.py

def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None):
    """
    Update documents into the Store.

    Args:
        docs: the document or list of documents to update
        key: field name(s) to determine uniqueness for a
             document, can be a list of multiple fields,
             a single field, or None if the Store's key
             field is to be used
    """
    for d in docs:
        if "sbxn" in d:
            d["sbxn"] = list(set(d["sbxn"] + [self.sandbox]))
        else:
            d["sbxn"] = [self.sandbox]

    self.store.update(docs, key=key)

`VaultStore` ¶

Bases: MongoStore

Extends MongoStore to read credentials out of Vault server and uses these values to initialize MongoStore instance.

Source code in src/maggma/stores/advanced_stores.py

class VaultStore(MongoStore):
    """
    Extends MongoStore to read credentials out of Vault server
    and uses these values to initialize MongoStore instance.
    """

    @requires(hvac is not None, "hvac is required to use VaultStore")
    def __init__(self, collection_name: str, vault_secret_path: str):
        """
        Args:
            collection_name: name of mongo collection
            vault_secret_path: path on vault server with mongo creds object.

        Important:
            Environment variables that must be set prior to invocation
            VAULT_ADDR - URL of vault server (eg. https://matgen8.lbl.gov:8200)
            VAULT_TOKEN or GITHUB_TOKEN - token used to authenticate to vault
        """
        self.collection_name = collection_name
        self.vault_secret_path = vault_secret_path

        # TODO: Switch this over to Pydantic ConfigSettings
        vault_addr = os.getenv("VAULT_ADDR")

        if not vault_addr:
            raise RuntimeError("VAULT_ADDR not set")

        client = hvac.Client(vault_addr)

        # If we have a vault token use this
        token = os.getenv("VAULT_TOKEN")

        # Look for a github token instead
        if not token:
            github_token = os.getenv("GITHUB_TOKEN")

            if github_token:
                client.auth_github(github_token)
            else:
                raise RuntimeError("VAULT_TOKEN or GITHUB_TOKEN not set")
        else:
            client.token = token
            if not client.is_authenticated():
                raise RuntimeError("Bad token")

        # Read the vault secret
        json_db_creds = client.read(vault_secret_path)
        db_creds = json.loads(json_db_creds["data"]["value"])

        database = db_creds.get("db")
        host = db_creds.get("host", "localhost")
        port = db_creds.get("port", 27017)
        username = db_creds.get("username", "")
        password = db_creds.get("password", "")

        super().__init__(database, collection_name, host, port, username, password)

    def __eq__(self, other: object) -> bool:
        """
        Check equality for VaultStore.

        Args:
            other: other VaultStore to compare with
        """
        if not isinstance(other, VaultStore):
            return False

        fields = ["vault_secret_path", "collection_name", "last_updated_field"]
        return all(getattr(self, f) == getattr(other, f) for f in fields)

`eq(other)` ¶

Check equality for VaultStore.

Parameters:

Name	Type	Description	Default
`other`	`object`	other VaultStore to compare with	required

Source code in src/maggma/stores/advanced_stores.py

def __eq__(self, other: object) -> bool:
    """
    Check equality for VaultStore.

    Args:
        other: other VaultStore to compare with
    """
    if not isinstance(other, VaultStore):
        return False

    fields = ["vault_secret_path", "collection_name", "last_updated_field"]
    return all(getattr(self, f) == getattr(other, f) for f in fields)

`init(collection_name, vault_secret_path)` ¶

Parameters:

Name	Type	Description	Default
`collection_name`	`str`	name of mongo collection	required
`vault_secret_path`	`str`	path on vault server with mongo creds object.	required

Important

Environment variables that must be set prior to invocation VAULT_ADDR - URL of vault server (eg. https://matgen8.lbl.gov:8200) VAULT_TOKEN or GITHUB_TOKEN - token used to authenticate to vault

Source code in src/maggma/stores/advanced_stores.py

@requires(hvac is not None, "hvac is required to use VaultStore")
def __init__(self, collection_name: str, vault_secret_path: str):
    """
    Args:
        collection_name: name of mongo collection
        vault_secret_path: path on vault server with mongo creds object.

    Important:
        Environment variables that must be set prior to invocation
        VAULT_ADDR - URL of vault server (eg. https://matgen8.lbl.gov:8200)
        VAULT_TOKEN or GITHUB_TOKEN - token used to authenticate to vault
    """
    self.collection_name = collection_name
    self.vault_secret_path = vault_secret_path

    # TODO: Switch this over to Pydantic ConfigSettings
    vault_addr = os.getenv("VAULT_ADDR")

    if not vault_addr:
        raise RuntimeError("VAULT_ADDR not set")

    client = hvac.Client(vault_addr)

    # If we have a vault token use this
    token = os.getenv("VAULT_TOKEN")

    # Look for a github token instead
    if not token:
        github_token = os.getenv("GITHUB_TOKEN")

        if github_token:
            client.auth_github(github_token)
        else:
            raise RuntimeError("VAULT_TOKEN or GITHUB_TOKEN not set")
    else:
        client.token = token
        if not client.is_authenticated():
            raise RuntimeError("Bad token")

    # Read the vault secret
    json_db_creds = client.read(vault_secret_path)
    db_creds = json.loads(json_db_creds["data"]["value"])

    database = db_creds.get("db")
    host = db_creds.get("host", "localhost")
    port = db_creds.get("port", 27017)
    username = db_creds.get("username", "")
    password = db_creds.get("password", "")

    super().__init__(database, collection_name, host, port, username, password)

Special stores that combine underlying Stores together.

`ConcatStore` ¶

Bases: Store

Store concatting multiple stores.

Source code in src/maggma/stores/compound_stores.py

class ConcatStore(Store):
    """Store concatting multiple stores."""

    def __init__(self, stores: list[Store], **kwargs):
        """
        Initialize a ConcatStore that concatenates multiple stores together
        to appear as one store.

        Args:
            stores: list of stores to concatenate together
        """
        self.stores = stores
        self.kwargs = kwargs
        super().__init__(**kwargs)

    @property
    def name(self) -> str:
        """
        A string representing this data source.
        """
        compound_name = ",".join([store.name for store in self.stores])
        return f"Concat[{compound_name}]"

    def connect(self, force_reset: bool = False):
        """
        Connect all stores in this ConcatStore.

        Args:
            force_reset: Whether to forcibly reset the connection for all stores
        """
        for store in self.stores:
            store.connect(force_reset)

    def close(self):
        """
        Close all connections in this ConcatStore.
        """
        for store in self.stores:
            store.close()

    @property
    def _collection(self):
        raise NotImplementedError("No collection property for ConcatStore")

    @property
    def last_updated(self) -> datetime:
        """
        Finds the most recent last_updated across all the stores.
        This might not be the most useful way to do this for this type of Store
        since it could very easily over-estimate the last_updated based on what stores
        are used.
        """
        lus = []
        for store in self.stores:
            lu = store.last_updated
            lus.append(lu)
        return max(lus)

    def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None):
        """
        Update documents into the Store
        Not implemented in ConcatStore.

        Args:
            docs: the document or list of documents to update
            key: field name(s) to determine uniqueness for a
                 document, can be a list of multiple fields,
                 a single field, or None if the Store's key
                 field is to be used
        """
        raise NotImplementedError("No update method for ConcatStore")

    def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list:
        """
        Get all distinct values for a field.

        Args:
            field: the field(s) to get distinct values for
            criteria: PyMongo filter for documents to search in
        """
        distincts = []
        for store in self.stores:
            distincts.extend(store.distinct(field=field, criteria=criteria))

        return list(set(distincts))

    def ensure_index(self, key: str, unique: bool = False) -> bool:
        """
        Ensure an index is properly set. Returns whether all stores support this index or not.

        Args:
            key: single key to index
            unique: Whether or not this index contains only unique keys

        Returns:
            bool indicating if the index exists/was created on all stores
        """
        return all(store.ensure_index(key, unique) for store in self.stores)

    def count(self, criteria: Optional[dict] = None) -> int:
        """
        Counts the number of documents matching the query criteria.

        Args:
            criteria: PyMongo filter for documents to count in
        """
        counts = [store.count(criteria) for store in self.stores]

        return sum(counts)

    def query(
        self,
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        skip: int = 0,
        limit: int = 0,
    ) -> Iterator[dict]:
        """
        Queries across all Store for a set of documents.

        Args:
            criteria: PyMongo filter for documents to search in
            properties: properties to return in grouped documents
            sort: Dictionary of sort order for fields. Keys are field names and
                values are 1 for ascending or -1 for descending.
            skip: number documents to skip
            limit: limit on total number of documents returned
        """
        # TODO: skip, sort and limit are broken. implement properly
        for store in self.stores:
            yield from store.query(criteria=criteria, properties=properties)

    def groupby(
        self,
        keys: Union[list[str], str],
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        skip: int = 0,
        limit: int = 0,
    ) -> Iterator[tuple[dict, list[dict]]]:
        """
        Simple grouping function that will group documents
        by keys.

        Args:
            keys: fields to group documents
            criteria: PyMongo filter for documents to search in
            properties: properties to return in grouped documents
            sort: Dictionary of sort order for fields. Keys are field names and
                values are 1 for ascending or -1 for descending.
            skip: number documents to skip
            limit: limit on total number of documents returned

        Returns:
            generator returning tuples of (dict, list of docs)
        """
        if isinstance(keys, str):
            keys = [keys]

        docs = []
        for store in self.stores:
            temp_docs = list(
                store.groupby(
                    keys=keys,
                    criteria=criteria,
                    properties=properties,
                    sort=sort,
                    skip=skip,
                    limit=limit,
                )
            )
            for _key, group in temp_docs:
                docs.extend(group)

        def key_set(d: dict) -> tuple:
            """Index function based on passed in keys."""
            return tuple(d.get(k) for k in keys)

        sorted_docs = sorted(docs, key=key_set)
        for vals, group_iter in groupby(sorted_docs, key=key_set):
            id_dict = dict(zip(keys, vals))
            yield id_dict, list(group_iter)

    def remove_docs(self, criteria: dict):
        """
        Remove docs matching the query dictionary.

        Args:
            criteria: query dictionary to match
        """
        raise NotImplementedError("No remove_docs method for JointStore")

    def __eq__(self, other: object) -> bool:
        """
        Check equality for ConcatStore.

        Args:
            other: other JointStore to compare with
        """
        if not isinstance(other, ConcatStore):
            return False

        fields = ["stores"]
        return all(getattr(self, f) == getattr(other, f) for f in fields)

`last_updated: datetime` `property` ¶

Finds the most recent last_updated across all the stores. This might not be the most useful way to do this for this type of Store since it could very easily over-estimate the last_updated based on what stores are used.

`name: str` `property` ¶

A string representing this data source.

`eq(other)` ¶

Check equality for ConcatStore.

Parameters:

Name	Type	Description	Default
`other`	`object`	other JointStore to compare with	required

Source code in src/maggma/stores/compound_stores.py

def __eq__(self, other: object) -> bool:
    """
    Check equality for ConcatStore.

    Args:
        other: other JointStore to compare with
    """
    if not isinstance(other, ConcatStore):
        return False

    fields = ["stores"]
    return all(getattr(self, f) == getattr(other, f) for f in fields)

`init(stores, **kwargs)` ¶

Initialize a ConcatStore that concatenates multiple stores together to appear as one store.

Parameters:

Name	Type	Description	Default
`stores`	`list[Store]`	list of stores to concatenate together	required

Source code in src/maggma/stores/compound_stores.py

def __init__(self, stores: list[Store], **kwargs):
    """
    Initialize a ConcatStore that concatenates multiple stores together
    to appear as one store.

    Args:
        stores: list of stores to concatenate together
    """
    self.stores = stores
    self.kwargs = kwargs
    super().__init__(**kwargs)

`close()` ¶

Close all connections in this ConcatStore.

Source code in src/maggma/stores/compound_stores.py

def close(self):
    """
    Close all connections in this ConcatStore.
    """
    for store in self.stores:
        store.close()

`connect(force_reset=False)` ¶

Connect all stores in this ConcatStore.

Parameters:

Name	Type	Description	Default
`force_reset`	`bool`	Whether to forcibly reset the connection for all stores	`False`

Source code in src/maggma/stores/compound_stores.py

def connect(self, force_reset: bool = False):
    """
    Connect all stores in this ConcatStore.

    Args:
        force_reset: Whether to forcibly reset the connection for all stores
    """
    for store in self.stores:
        store.connect(force_reset)

`count(criteria=None)` ¶

Counts the number of documents matching the query criteria.

Parameters:

Name	Type	Description	Default
`criteria`	`Optional[dict]`	PyMongo filter for documents to count in	`None`

Source code in src/maggma/stores/compound_stores.py

def count(self, criteria: Optional[dict] = None) -> int:
    """
    Counts the number of documents matching the query criteria.

    Args:
        criteria: PyMongo filter for documents to count in
    """
    counts = [store.count(criteria) for store in self.stores]

    return sum(counts)

`distinct(field, criteria=None, all_exist=False)` ¶

Get all distinct values for a field.

Parameters:

Name	Type	Description	Default
`field`	`str`	the field(s) to get distinct values for	required
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`

Source code in src/maggma/stores/compound_stores.py

def distinct(self, field: str, criteria: Optional[dict] = None, all_exist: bool = False) -> list:
    """
    Get all distinct values for a field.

    Args:
        field: the field(s) to get distinct values for
        criteria: PyMongo filter for documents to search in
    """
    distincts = []
    for store in self.stores:
        distincts.extend(store.distinct(field=field, criteria=criteria))

    return list(set(distincts))

`ensure_index(key, unique=False)` ¶

Ensure an index is properly set. Returns whether all stores support this index or not.

Parameters:

Name	Type	Description	Default
`key`	`str`	single key to index	required
`unique`	`bool`	Whether or not this index contains only unique keys	`False`

Returns:

Type	Description
`bool`	bool indicating if the index exists/was created on all stores

Source code in src/maggma/stores/compound_stores.py

def ensure_index(self, key: str, unique: bool = False) -> bool:
    """
    Ensure an index is properly set. Returns whether all stores support this index or not.

    Args:
        key: single key to index
        unique: Whether or not this index contains only unique keys

    Returns:
        bool indicating if the index exists/was created on all stores
    """
    return all(store.ensure_index(key, unique) for store in self.stores)

`groupby(keys, criteria=None, properties=None, sort=None, skip=0, limit=0)` ¶

Simple grouping function that will group documents by keys.

Parameters:

Name	Type	Description	Default
`keys`	`Union[list[str], str]`	fields to group documents	required
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`
`properties`	`Union[dict, list, None]`	properties to return in grouped documents	`None`
`sort`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending.	`None`
`skip`	`int`	number documents to skip	`0`
`limit`	`int`	limit on total number of documents returned	`0`

Returns:

Type	Description
`Iterator[tuple[dict, list[dict]]]`	generator returning tuples of (dict, list of docs)

Source code in src/maggma/stores/compound_stores.py

def groupby(
    self,
    keys: Union[list[str], str],
    criteria: Optional[dict] = None,
    properties: Union[dict, list, None] = None,
    sort: Optional[dict[str, Union[Sort, int]]] = None,
    skip: int = 0,
    limit: int = 0,
) -> Iterator[tuple[dict, list[dict]]]:
    """
    Simple grouping function that will group documents
    by keys.

    Args:
        keys: fields to group documents
        criteria: PyMongo filter for documents to search in
        properties: properties to return in grouped documents
        sort: Dictionary of sort order for fields. Keys are field names and
            values are 1 for ascending or -1 for descending.
        skip: number documents to skip
        limit: limit on total number of documents returned

    Returns:
        generator returning tuples of (dict, list of docs)
    """
    if isinstance(keys, str):
        keys = [keys]

    docs = []
    for store in self.stores:
        temp_docs = list(
            store.groupby(
                keys=keys,
                criteria=criteria,
                properties=properties,
                sort=sort,
                skip=skip,
                limit=limit,
            )
        )
        for _key, group in temp_docs:
            docs.extend(group)

    def key_set(d: dict) -> tuple:
        """Index function based on passed in keys."""
        return tuple(d.get(k) for k in keys)

    sorted_docs = sorted(docs, key=key_set)
    for vals, group_iter in groupby(sorted_docs, key=key_set):
        id_dict = dict(zip(keys, vals))
        yield id_dict, list(group_iter)

`query(criteria=None, properties=None, sort=None, skip=0, limit=0)` ¶

Queries across all Store for a set of documents.

Parameters:

Name	Type	Description	Default
`criteria`	`Optional[dict]`	PyMongo filter for documents to search in	`None`
`properties`	`Union[dict, list, None]`	properties to return in grouped documents	`None`
`sort`	`Optional[dict[str, Union[Sort, int]]]`	Dictionary of sort order for fields. Keys are field names and values are 1 for ascending or -1 for descending.	`None`
`skip`	`int`	number documents to skip	`0`
`limit`	`int`	limit on total number of documents returned	`0`

Source code in src/maggma/stores/compound_stores.py

def query(
    self,
    criteria: Optional[dict] = None,
    properties: Union[dict, list, None] = None,
    sort: Optional[dict[str, Union[Sort, int]]] = None,
    skip: int = 0,
    limit: int = 0,
) -> Iterator[dict]:
    """
    Queries across all Store for a set of documents.

    Args:
        criteria: PyMongo filter for documents to search in
        properties: properties to return in grouped documents
        sort: Dictionary of sort order for fields. Keys are field names and
            values are 1 for ascending or -1 for descending.
        skip: number documents to skip
        limit: limit on total number of documents returned
    """
    # TODO: skip, sort and limit are broken. implement properly
    for store in self.stores:
        yield from store.query(criteria=criteria, properties=properties)

`remove_docs(criteria)` ¶

Remove docs matching the query dictionary.

Parameters:

Name	Type	Description	Default
`criteria`	`dict`	query dictionary to match	required

Source code in src/maggma/stores/compound_stores.py

def remove_docs(self, criteria: dict):
    """
    Remove docs matching the query dictionary.

    Args:
        criteria: query dictionary to match
    """
    raise NotImplementedError("No remove_docs method for JointStore")

`update(docs, key=None)` ¶

Update documents into the Store Not implemented in ConcatStore.

Parameters:

Name	Type	Description	Default
`docs`	`Union[list[dict], dict]`	the document or list of documents to update	required
`key`	`Union[list, str, None]`	field name(s) to determine uniqueness for a document, can be a list of multiple fields, a single field, or None if the Store's key field is to be used	`None`

Source code in src/maggma/stores/compound_stores.py

def update(self, docs: Union[list[dict], dict], key: Union[list, str, None] = None):
    """
    Update documents into the Store
    Not implemented in ConcatStore.

    Args:
        docs: the document or list of documents to update
        key: field name(s) to determine uniqueness for a
             document, can be a list of multiple fields,
             a single field, or None if the Store's key
             field is to be used
    """
    raise NotImplementedError("No update method for ConcatStore")

`JointStore` ¶

Bases: Store

Store that implements a on-the-fly join across multiple collections all in the same MongoDB database. This is a Read-Only Store designed to combine data from multiple collections.

Source code in src/maggma/stores/compound_stores.py

class JointStore(Store):
    """
    Store that implements a on-the-fly join across multiple collections all in the same MongoDB database.
    This is a Read-Only Store designed to combine data from multiple collections.
    """

    def __init__(
        self,
        database: str,
        collection_names: list[str],
        host: str = "localhost",
        port: int = 27017,
        username: str = "",
        password: str = "",
        main: Optional[str] = None,
        merge_at_root: bool = False,
        mongoclient_kwargs: Optional[dict] = None,
        **kwargs,
    ):
        """
        Args:
            database: The database name
            collection_names: list of all collections
                to join
            host: Hostname for the database
            port: TCP port to connect to
            username: Username for the collection
            password: Password to connect with
            main: name for the main collection
                if not specified this defaults to the first
                in collection_names list.
        """
        self.database = database
        self.collection_names = collection_names
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self._coll = None  # type: Any
        self.main = main or collection_names[0]
        self.merge_at_root = merge_at_root
        self.mongoclient_kwargs = mongoclient_kwargs or {}
        self.kwargs = kwargs

        super().__init__(**kwargs)

    @property
    def name(self) -> str:
        """
        Return a string representing this data source.
        """
        compound_name = ",".join(self.collection_names)
        return f"Compound[{self.host}/{self.database}][{compound_name}]"

    def connect(self, force_reset: bool = False):
        """
        Connects the underlying Mongo database and all collection connections.

        Args:
            force_reset: whether to reset the connection or not when the Store is
                already connected.
        """
        if not self._coll or force_reset:
            conn: MongoClient = (
                MongoClient(
                    host=self.host,
                    port=self.port,
                    username=self.username,
                    password=self.password,
                    **self.mongoclient_kwargs,
                )
                if self.username != ""
                else MongoClient(self.host, self.port, **self.mongoclient_kwargs)
            )
            db = conn[self.database]
            self._coll = db[self.main]
            self._has_merge_objects = self._collection.database.client.server_info()["version"] > "3.6"

    def close(self):
        """
        Closes underlying database connections.
        """
        self._collection.database.client.close()

    @property
    def _collection(self):
        """Property referring to the root pymongo collection."""
        if self._coll is None:
            raise StoreError("Must connect Mongo-like store before attempting to use it")
        return self._coll

    @property
    def nonmain_names(self) -> list:
        """
        all non-main collection names.
        """
        return list(set(self.collection_names) - {self.main})

    @property
    def last_updated(self) -> datetime:
        """
        Special last_updated for this JointStore
        that checks all underlying collections.
        """
        lus = []
        for cname in self.collection_names:
            store = MongoStore.from_collection(self._collection.database[cname])
            store.last_updated_field = self.last_updated_field
            lu = store.last_updated
            lus.append(lu)
        return max(lus)

    # TODO: implement update?
    def update(self, docs, update_lu=True, key=None, **kwargs):
        """
        Update documents into the underlying collections
        Not Implemented for JointStore.
        """
        raise NotImplementedError("JointStore is a read-only store")

    def _get_store_by_name(self, name) -> MongoStore:
        """
        Gets an underlying collection as a mongoStore.
        """
        if name not in self.collection_names:
            raise ValueError("Asking for collection not referenced in this Store")
        return MongoStore.from_collection(self._collection.database[name])

    def ensure_index(self, key, unique=False, **kwargs):
        """
        Can't ensure index for JointStore.
        """
        raise NotImplementedError("No ensure_index method for JointStore")

    def _get_pipeline(self, criteria=None, properties=None, skip=0, limit=0):
        """
        Gets the aggregation pipeline for query and query_one.

        Args:
            properties: properties to be returned
            criteria: criteria to filter by
            skip: docs to skip
            limit: limit results to N docs
        Returns:
            list of aggregation operators
        """
        pipeline = []
        collection_names = list(set(self.collection_names) - set(self.main))
        for cname in collection_names:
            pipeline.append(
                {
                    "$lookup": {
                        "from": cname,
                        "localField": self.key,
                        "foreignField": self.key,
                        "as": cname,
                    }
                }
            )

            if self.merge_at_root:
                if not self._has_merge_objects:
                    raise Exception("MongoDB server version too low to use $mergeObjects.")

                pipeline.append(
                    {
                        "$replaceRoot": {
                            "newRoot": {
                                "$mergeObjects": [
                                    {"$arrayElemAt": [f"${cname}", 0]},
                                    "$$ROOT",
                                ]
                            }
                        }
                    }
                )
            else:
                pipeline.append(
                    {
                        "$unwind": {
                            "path": f"${cname}",
                            "preserveNullAndEmptyArrays": True,
                        }
                    }
                )

        # Do projection for max last_updated
        lu_max_fields = [f"${self.last_updated_field}"]
        lu_max_fields.extend([f"${cname}.{self.last_updated_field}" for cname in self.collection_names])
        lu_proj = {self.last_updated_field: {"$max": lu_max_fields}}
        pipeline.append({"$addFields": lu_proj})

        if criteria:
            pipeline.append({"$match": criteria})
        if isinstance(properties, list):
            properties = {k: 1 for k in properties}
        if properties:
            pipeline.append({"$project": properties})

        if skip > 0:
            pipeline.append({"$skip": skip})

        if limit > 0:
            pipeline.append({"$limit": limit})
        return pipeline

    def count(self, criteria: Optional[dict] = None) -> int:
        """
        Counts the number of documents matching the query criteria.

        Args:
            criteria: PyMongo filter for documents to count in
        """
        pipeline = self._get_pipeline(criteria=criteria)
        pipeline.append({"$count": "count"})
        agg = list(self._collection.aggregate(pipeline))
        return agg[0].get("count", 0) if len(agg) > 0 else 0

    def query(
        self,
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        skip: int = 0,
        limit: int = 0,
    ) -> Iterator[dict]:
        pipeline = self._get_pipeline(criteria=criteria, properties=properties, skip=skip, limit=limit)
        agg = self._collection.aggregate(pipeline)
        yield from agg

    def groupby(
        self,
        keys: Union[list[str], str],
        criteria: Optional[dict] = None,
        properties: Union[dict, list, None] = None,
        sort: Optional[dict[str, Union[Sort, int]]] = None,
        skip: int = 0,
        limit: int = 0,
    ) -> Iterator[tuple[dict, list[dict]]]:
        pipeline = self._get_pipeline(criteria=criteria, properties=properties, skip=skip, limit=limit)
        if not isinstance(keys, list):
            keys = [keys]
        group_id = {}  # type: Dict[str,Any]
        for key in keys:
            set_(group_id, key, f"${key}")
        pipeline.append({"$group": {"_id": group_id, "docs": {"$push": "$$ROOT"}}})

        agg = self._collection.aggregate(pipeline)

        for d in agg:
            yield d["_id"], d["docs"]

    def query_one(self, criteria=None, properties=None, **kwargs):
        """
        Get one document.

        Args:
            properties: properties to return in query
            criteria: filter for matching
            kwargs: kwargs for collection.aggregate

        Returns:
            single document
        """
        # TODO: maybe adding explicit limit in agg pipeline is better as below?
        # pipeline = self._get_pipeline(properties, criteria)
        # pipeline.append({"$limit": 1})
        query = self.query(criteria=criteria, properties=properties, **kwargs)
        try:
            return next(query)
        except StopIteration:
            return None

    def remove_docs(self, criteria: dict):
        """
        Remove docs matching the query dictionary.

        Args:
            criteria: query dictionary to match
        """
        raise NotImplementedError("No remove_docs method for JointStore")

    def __eq__(self, other: object) -> bool:
        """
        Check equality for JointStore

        Args:
            other: other JointStore to compare with.
        """
        if not isinstance(other, JointStore):
            return False

        fields = [
            "database",
            "collection_names",
            "host",
            "port",
            "main",
            "merge_at_root",
        ]
        return all(getattr(self, f) == getattr(other, f) for f in fields)

`last_updated: datetime` `property` ¶

Special last_updated for this JointStore that checks all underlying collections.

`name: str` `property` ¶

Return a string representing this data source.

`nonmain_names: list` `property` ¶

all non-main collection names.

`eq(other)` ¶

Check equality for JointStore

Parameters:

Name	Type	Description	Default
`other`	`object`	other JointStore to compare with.	required

Source code in src/maggma/stores/compound_stores.py

def __eq__(self, other: object) -> bool:
    """
    Check equality for JointStore

    Args:
        other: other JointStore to compare with.
    """
    if not isinstance(other, JointStore):
        return False

    fields = [
        "database",
        "collection_names",
        "host",
        "port",
        "main",
        "merge_at_root",
    ]
    return all(getattr(self, f) == getattr(other, f) for f in fields)

`init(database, collection_names, host='localhost', port=27017, username='', password='', main=None, merge_at_root=False, mongoclient_kwargs=None, **kwargs)` ¶

Parameters:

Name	Type	Description	Default
`database`	`str`	The database name	required
`collection_names`	`list[str]`	list of all collections to join	required
`host`	`str`	Hostname for the database	`'localhost'`
`port`	`int`	TCP port to connect to	`27017`
`username`	`str`	Username for the collection	`''`
`password`	`str`	Password to connect with	`''`
`main`	`Optional[str]`	name for the main collection if not specified this defaults to the first in collection_names list.	`None`

Source code in src/maggma/stores/compound_stores.py

def __init__(
    self,
    database: str,
    collection_names: list[str],
    host: str = "localhost",
    port: int = 27017,
    username: str = "",
    password: str = "",
    main: Optional[str] = None,
    merge_at_root: bool = False,
    mongoclient_kwargs: Optional[dict] = None,
    **kwargs,
):
    """
    Args:
        database: The database name
        collection_names: list of all collections
            to join
        host: Hostname for the database
        port: TCP port to connect to
        username: Username for the collection
        password: Password to connect with
        main: name for the main collection
            if not specified this defaults to the first
            in collection_names list.
    """
    self.database = database
    self.collection_names = collection_names
    self.host = host
    self.port = port
    self.username = username
    self.password = password
    self._coll = None  # type: Any
    self.main = main or collection_names[0]
    self.merge_at_root = merge_at_root
    self.mongoclient_kwargs = mongoclient_kwargs or {}
    self.kwargs = kwargs

    super().__init__(**kwargs)

`close()` ¶

Closes underlying database connections.

Source code in src/maggma/stores/compound_stores.py

def close(self):
    """
    Closes underlying database connections.
    """
    self._collection.database.client.close()

`connect(force_reset=False)` ¶

Connects the underlying Mongo database and all collection connections.

Parameters:

Name	Type	Description	Default
`force_reset`	`bool`	whether to reset the connection or not when the Store is already connected.	`False`

Source code in src/maggma/stores/compound_stores.py

def connect(self, force_reset: bool = False):
    """
    Connects the underlying Mongo database and all collection connections.

    Args:
        force_reset: whether to reset the connection or not when the Store is
            already connected.
    """
    if not self._coll or force_reset:
        conn: MongoClient = (
            MongoClient(
                host=self.host,
                port=self.port,
                username=self.username,
                password=self.password,
                **self.mongoclient_kwargs,
            )
            if self.username != ""
            else MongoClient(self.host, self.port, **self.mongoclient_kwargs)
        )
        db = conn[self.database]
        self._coll = db[self.main]
        self._has_merge_objects = self._collection.database.client.server_info()["version"] > "3.6"

`count(criteria=None)` ¶

Counts the number of documents matching the query criteria.

Parameters:

Name	Type	Description	Default
`criteria`	`Optional[dict]`	PyMongo filter for documents to count in	`None`

Source code in src/maggma/stores/compound_stores.py

def count(self, criteria: Optional[dict] = None) -> int:
    """
    Counts the number of documents matching the query criteria.

    Args:
        criteria: PyMongo filter for documents to count in
    """
    pipeline = self._get_pipeline(criteria=criteria)
    pipeline.append({"$count": "count"})
    agg = list(self._collection.aggregate(pipeline))
    return agg[0].get("count", 0) if len(agg) > 0 else 0

`ensure_index(key, unique=False, **kwargs)` ¶

Can't ensure index for JointStore.

Source code in src/maggma/stores/compound_stores.py

def ensure_index(self, key, unique=False, **kwargs):
    """
    Can't ensure index for JointStore.
    """
    raise NotImplementedError("No ensure_index method for JointStore")

`query_one(criteria=None, properties=None, **kwargs)` ¶

Get one document.

Parameters:

Name	Description	Default
`properties`	properties to return in query	`None`
`criteria`	filter for matching	`None`
`kwargs`	kwargs for collection.aggregate	`{}`

Returns:

Type	Description
	single document

Source code in src/maggma/stores/compound_stores.py

def query_one(self, criteria=None, properties=None, **kwargs):
    """
    Get one document.

    Args:
        properties: properties to return in query
        criteria: filter for matching
        kwargs: kwargs for collection.aggregate

    Returns:
        single document
    """
    # TODO: maybe adding explicit limit in agg pipeline is better as below?
    # pipeline = self._get_pipeline(properties, criteria)
    # pipeline.append({"$limit": 1})
    query = self.query(criteria=criteria, properties=properties, **kwargs)
    try:
        return next(query)
    except StopIteration:
        return None

`remove_docs(criteria)` ¶

Remove docs matching the query dictionary.

Parameters:

Name	Type	Description	Default
`criteria`	`dict`	query dictionary to match	required

Source code in src/maggma/stores/compound_stores.py

def remove_docs(self, criteria: dict):
    """
    Remove docs matching the query dictionary.

    Args:
        criteria: query dictionary to match
    """
    raise NotImplementedError("No remove_docs method for JointStore")

`update(docs, update_lu=True, key=None, **kwargs)` ¶

Update documents into the underlying collections Not Implemented for JointStore.

Source code in src/maggma/stores/compound_stores.py

def update(self, docs, update_lu=True, key=None, **kwargs):
    """
    Update documents into the underlying collections
    Not Implemented for JointStore.
    """
    raise NotImplementedError("JointStore is a read-only store")

`SSHTunnel` ¶

Bases: MSONable

SSH tunnel to remote server.

Source code in src/maggma/stores/ssh_tunnel.py

class SSHTunnel(MSONable):
    """SSH tunnel to remote server."""

    __TUNNELS: dict[str, SSHTunnelForwarder] = {}

    def __init__(
        self,
        tunnel_server_address: str,
        remote_server_address: str,
        local_port: Optional[int] = None,
        username: Optional[str] = None,
        password: Optional[str] = None,
        private_key: Optional[str] = None,
        **kwargs,
    ):
        """
        Args:
            tunnel_server_address: string address with port for the SSH tunnel server
            remote_server_address: string address with port for the server to connect to
            local_port: optional port to use for the local address (127.0.0.1);
                if `None`, a random open port will be automatically selected
            username: optional username for the ssh tunnel server
            password: optional password for the ssh tunnel server; If a private_key is
                supplied this password is assumed to be the private key password
            private_key: ssh private key to authenticate to the tunnel server
            kwargs: any extra args passed to the SSHTunnelForwarder.
        """
        self.tunnel_server_address = tunnel_server_address
        self.remote_server_address = remote_server_address
        self.local_port = local_port
        self.username = username
        self.password = password
        self.private_key = private_key
        self.kwargs = kwargs

        if remote_server_address in SSHTunnel.__TUNNELS:
            self.tunnel = SSHTunnel.__TUNNELS[remote_server_address]
        else:
            if local_port is None:
                local_port = _find_free_port("127.0.0.1")
            local_bind_address = ("127.0.0.1", local_port)

            ssh_address, ssh_port = tunnel_server_address.split(":")
            ssh_port = int(ssh_port)  # type: ignore

            remote_bind_address, remote_bind_port = remote_server_address.split(":")
            remote_bind_port = int(remote_bind_port)  # type: ignore

            if private_key is not None:
                ssh_password = None
                ssh_private_key_password = password
            else:
                ssh_password = password
                ssh_private_key_password = None

            self.tunnel = SSHTunnelForwarder(
                ssh_address_or_host=(ssh_address, ssh_port),
                local_bind_address=local_bind_address,
                remote_bind_address=(remote_bind_address, remote_bind_port),
                ssh_username=username,
                ssh_password=ssh_password,
                ssh_private_key_password=ssh_private_key_password,
                ssh_pkey=private_key,
                **kwargs,
            )

    def start(self):
        if not self.tunnel.is_active:
            self.tunnel.start()

    def stop(self):
        if self.tunnel.tunnel_is_up:
            self.tunnel.stop()

    @property
    def local_address(self) -> tuple[str, int]:
        return self.tunnel.local_bind_address

`init(tunnel_server_address, remote_server_address, local_port=None, username=None, password=None, private_key=None, **kwargs)` ¶

Parameters:

Name	Type	Description	Default
`tunnel_server_address`	`str`	string address with port for the SSH tunnel server	required
`remote_server_address`	`str`	string address with port for the server to connect to	required
`local_port`	`Optional[int]`	optional port to use for the local address (127.0.0.1); if `None`, a random open port will be automatically selected	`None`
`username`	`Optional[str]`	optional username for the ssh tunnel server	`None`
`password`	`Optional[str]`	optional password for the ssh tunnel server; If a private_key is supplied this password is assumed to be the private key password	`None`
`private_key`	`Optional[str]`	ssh private key to authenticate to the tunnel server	`None`
`kwargs`		any extra args passed to the SSHTunnelForwarder.	`{}`

Source code in src/maggma/stores/ssh_tunnel.py

def __init__(
    self,
    tunnel_server_address: str,
    remote_server_address: str,
    local_port: Optional[int] = None,
    username: Optional[str] = None,
    password: Optional[str] = None,
    private_key: Optional[str] = None,
    **kwargs,
):
    """
    Args:
        tunnel_server_address: string address with port for the SSH tunnel server
        remote_server_address: string address with port for the server to connect to
        local_port: optional port to use for the local address (127.0.0.1);
            if `None`, a random open port will be automatically selected
        username: optional username for the ssh tunnel server
        password: optional password for the ssh tunnel server; If a private_key is
            supplied this password is assumed to be the private key password
        private_key: ssh private key to authenticate to the tunnel server
        kwargs: any extra args passed to the SSHTunnelForwarder.
    """
    self.tunnel_server_address = tunnel_server_address
    self.remote_server_address = remote_server_address
    self.local_port = local_port
    self.username = username
    self.password = password
    self.private_key = private_key
    self.kwargs = kwargs

    if remote_server_address in SSHTunnel.__TUNNELS:
        self.tunnel = SSHTunnel.__TUNNELS[remote_server_address]
    else:
        if local_port is None:
            local_port = _find_free_port("127.0.0.1")
        local_bind_address = ("127.0.0.1", local_port)

        ssh_address, ssh_port = tunnel_server_address.split(":")
        ssh_port = int(ssh_port)  # type: ignore

        remote_bind_address, remote_bind_port = remote_server_address.split(":")
        remote_bind_port = int(remote_bind_port)  # type: ignore

        if private_key is not None:
            ssh_password = None
            ssh_private_key_password = password
        else:
            ssh_password = password
            ssh_private_key_password = None

        self.tunnel = SSHTunnelForwarder(
            ssh_address_or_host=(ssh_address, ssh_port),
            local_bind_address=local_bind_address,
            remote_bind_address=(remote_bind_address, remote_bind_port),
            ssh_username=username,
            ssh_password=ssh_password,
            ssh_private_key_password=ssh_private_key_password,
            ssh_pkey=private_key,
            **kwargs,
        )

Stores

JSONStore ¶

__eq__(other) ¶

__init__(paths, read_only=True, serialization_option=None, serialization_default=None, encoding=None, **kwargs) ¶

connect(force_reset=False) ¶

read_json_file(path) ¶

remove_docs(criteria) ¶

update(docs, key=None) ¶

update_json_file() ¶

MemoryStore ¶

name property ¶

__eq__(other) ¶

__hash__() ¶

__init__(collection_name='memory_db', **kwargs) ¶

close() ¶

connect(force_reset=False) ¶

groupby(keys, criteria=None, properties=None, sort=None, skip=0, limit=0) ¶

MongoStore ¶

name: str property ¶

__eq__(other) ¶

__hash__() ¶

__init__(database, collection_name, host='localhost', port=27017, username='', password='', ssh_tunnel=None, safe_update=False, auth_source=None, mongoclient_kwargs=None, default_sort=None, **kwargs) ¶

close() ¶

connect(force_reset=False) ¶

count(criteria=None, hint=None) ¶

distinct(field, criteria=None, all_exist=False) ¶

ensure_index(key, unique=False) ¶

from_collection(collection) classmethod ¶

from_db_file(filename, **kwargs) classmethod ¶

from_launchpad_file(lp_file, collection_name, **kwargs) classmethod ¶

groupby(keys, criteria=None, properties=None, sort=None, skip=0, limit=0) ¶

query(criteria=None, properties=None, sort=None, hint=None, skip=0, limit=0, **kwargs) ¶

remove_docs(criteria) ¶

update(docs, key=None) ¶

MongoURIStore ¶

name: str property ¶

__init__(uri, collection_name, database=None, ssh_tunnel=None, mongoclient_kwargs=None, default_sort=None, **kwargs) ¶

connect(force_reset=False) ¶

MontyStore ¶

name: str property ¶

__init__(collection_name, database_path=None, database_name='db', storage='sqlite', storage_kwargs=None, client_kwargs=None, **kwargs) ¶

connect(force_reset=False) ¶

count(criteria=None, hint=None) ¶

update(docs, key=None) ¶

FileStore ¶

name: str property ¶

__init__(path, file_filters=None, max_depth=None, read_only=True, include_orphans=False, json_name='FileStore.json', encoding=None, **kwargs) ¶

add_metadata(metadata=None, query=None, auto_data=None, **kwargs) ¶

connect(force_reset=False) ¶

query(criteria=None, properties=None, sort=None, hint=None, skip=0, limit=0, contents_size_limit=0) ¶

query_one(criteria=None, properties=None, sort=None, contents_size_limit=None) ¶

read() ¶

remove_docs(criteria, confirm=False) ¶

update(docs, key=None) ¶

GridFSStore ¶

last_updated: datetime property ¶

name: str property ¶

__eq__(other) ¶

__init__(database, collection_name, host='localhost', port=27017, username='', password='', compression=False, ensure_metadata=False, searchable_fields=None, auth_source=None, mongoclient_kwargs=None, ssh_tunnel=None, **kwargs) ¶

connect(force_reset=False) ¶

count(criteria=None) ¶

distinct(field, criteria=None, all_exist=False) ¶

ensure_index(key, unique=False) ¶

from_launchpad_file(lp_file, collection_name, **kwargs) classmethod ¶

groupby(keys, criteria=None, properties=None, sort=None, skip=0, limit=0) ¶

query(criteria=None, properties=None, sort=None, skip=0, limit=0) ¶

remove_docs(criteria) ¶

transform_criteria(criteria) classmethod ¶

update(docs, key=None, additional_metadata=None) ¶

GridFSURIStore ¶

__init__(uri, collection_name, database=None, compression=False, ensure_metadata=False, searchable_fields=None, mongoclient_kwargs=None, **kwargs) ¶

connect(force_reset=False) ¶

S3Store ¶

name: str property ¶

__eq__(other) ¶

__init__(index, bucket, s3_profile=None, compress=False, endpoint_url=None, sub_dir=None, s3_workers=1, s3_resource_kwargs=None, ssh_tunnel=None, key='fs_id', store_hash=True, unpack_data=True, searchable_fields=None, index_store_kwargs=None, **kwargs) ¶

close() ¶

connect(force_reset=False) ¶

count(criteria=None) ¶

distinct(field, criteria=None, all_exist=False) ¶

`JSONStore` ¶

`eq(other)` ¶

`init(paths, read_only=True, serialization_option=None, serialization_default=None, encoding=None, **kwargs)` ¶

`connect(force_reset=False)` ¶

`read_json_file(path)` ¶

`remove_docs(criteria)` ¶

`update(docs, key=None)` ¶

`update_json_file()` ¶

`MemoryStore` ¶

`name` `property` ¶

`eq(other)` ¶

`hash()` ¶

`init(collection_name='memory_db', **kwargs)` ¶

`close()` ¶

`connect(force_reset=False)` ¶

`groupby(keys, criteria=None, properties=None, sort=None, skip=0, limit=0)` ¶

`MongoStore` ¶

`name: str` `property` ¶

`eq(other)` ¶

`hash()` ¶

`init(database, collection_name, host='localhost', port=27017, username='', password='', ssh_tunnel=None, safe_update=False, auth_source=None, mongoclient_kwargs=None, default_sort=None, **kwargs)` ¶

`close()` ¶

`connect(force_reset=False)` ¶

`count(criteria=None, hint=None)` ¶

`distinct(field, criteria=None, all_exist=False)` ¶

`ensure_index(key, unique=False)` ¶

`from_collection(collection)` `classmethod` ¶

`from_db_file(filename, **kwargs)` `classmethod` ¶

`from_launchpad_file(lp_file, collection_name, **kwargs)` `classmethod` ¶

`groupby(keys, criteria=None, properties=None, sort=None, skip=0, limit=0)` ¶

`query(criteria=None, properties=None, sort=None, hint=None, skip=0, limit=0, **kwargs)` ¶

`remove_docs(criteria)` ¶

`update(docs, key=None)` ¶

`MongoURIStore` ¶

`name: str` `property` ¶

`init(uri, collection_name, database=None, ssh_tunnel=None, mongoclient_kwargs=None, default_sort=None, **kwargs)` ¶

`connect(force_reset=False)` ¶

`MontyStore` ¶

`name: str` `property` ¶

`init(collection_name, database_path=None, database_name='db', storage='sqlite', storage_kwargs=None, client_kwargs=None, **kwargs)` ¶

`connect(force_reset=False)` ¶

`count(criteria=None, hint=None)` ¶

`update(docs, key=None)` ¶

`FileStore` ¶

`name: str` `property` ¶

`init(path, file_filters=None, max_depth=None, read_only=True, include_orphans=False, json_name='FileStore.json', encoding=None, **kwargs)` ¶

`add_metadata(metadata=None, query=None, auto_data=None, **kwargs)` ¶

`connect(force_reset=False)` ¶

`query(criteria=None, properties=None, sort=None, hint=None, skip=0, limit=0, contents_size_limit=0)` ¶

`query_one(criteria=None, properties=None, sort=None, contents_size_limit=None)` ¶

`read()` ¶

`remove_docs(criteria, confirm=False)` ¶

`update(docs, key=None)` ¶

`GridFSStore` ¶

`last_updated: datetime` `property` ¶

`name: str` `property` ¶

`eq(other)` ¶

`init(database, collection_name, host='localhost', port=27017, username='', password='', compression=False, ensure_metadata=False, searchable_fields=None, auth_source=None, mongoclient_kwargs=None, ssh_tunnel=None, **kwargs)` ¶

`connect(force_reset=False)` ¶

`count(criteria=None)` ¶

`distinct(field, criteria=None, all_exist=False)` ¶

`ensure_index(key, unique=False)` ¶

`from_launchpad_file(lp_file, collection_name, **kwargs)` `classmethod` ¶

`groupby(keys, criteria=None, properties=None, sort=None, skip=0, limit=0)` ¶

`query(criteria=None, properties=None, sort=None, skip=0, limit=0)` ¶

`remove_docs(criteria)` ¶

`transform_criteria(criteria)` `classmethod` ¶

`update(docs, key=None, additional_metadata=None)` ¶

`GridFSURIStore` ¶

`init(uri, collection_name, database=None, compression=False, ensure_metadata=False, searchable_fields=None, mongoclient_kwargs=None, **kwargs)` ¶

`connect(force_reset=False)` ¶

`S3Store` ¶

`name: str` `property` ¶

`eq(other)` ¶

`init(index, bucket, s3_profile=None, compress=False, endpoint_url=None, sub_dir=None, s3_workers=1, s3_resource_kwargs=None, ssh_tunnel=None, key='fs_id', store_hash=True, unpack_data=True, searchable_fields=None, index_store_kwargs=None, **kwargs)` ¶

`close()` ¶

`connect(force_reset=False)` ¶

`count(criteria=None)` ¶

`distinct(field, criteria=None, all_exist=False)` ¶

`ensure_index(key, unique=False)` ¶