Source code for vibeqc.gcp

"""Geometric counterpoise (gCP) correction — Kruse & Grimme 2012.

The gCP correction is a per-atom-pair, density-free additive correction
that approximates basis-set superposition error (BSSE) in the Boys–
Bernardi counterpoise sense without running the multiple ghost-atom
calculations the rigorous CP requires. It is the "missing third leg" of
the Grimme-school **composite 3c methods** alongside dispersion (D3 / D4)
and the per-functional modified short-range correction.

The functional form (Kruse & Grimme, *J. Chem. Phys.* **136**, 154101
(2012), eq. 4 + 5)::

    E_gCP = σ · Σ_a Σ_{b≠a}  e^(−α · R_ab^β)  ·  e_a^MIS  /  √(N_b^virt · S_ab)

where

* ``σ, α, β`` are the three basis-set-specific fit parameters (Table 2
  of the paper);
* ``e_a^MIS`` is the per-atom "missing basis-set incompleteness" energy
  — the difference between an isolated-atom HF energy in the target
  small basis and in a much larger reference basis (Table 3 + SI of the
  paper);
* ``N_b^virt`` is the number of virtual orbitals of atom *b* in the
  target basis (basis-dependent count);
* ``S_ab`` is the 1s-1s Slater-type-orbital overlap between atoms *a*
  and *b* with effective per-element exponents ``ζ_a``.

The correction is **structure-only**: no density, no orbitals, no SCF —
just geometry + the per-basis parameter set. This is what makes it
cheap to add to the 3c composites at zero SCF cost.

vibe-qc carries the published ``(σ, α, β)`` fit parameters for the
basis sets the 3c composites use, plus the ``e_a^MIS``, ``N_b^virt``,
and Slater ``ζ_a`` tables for the H / C / N / O / F first-row organic
chemistry slice — the most-published reference data, cross-verified
against the Kruse-Grimme 2012 paper and the canonical Grimme-group
``mctc-gcp`` reference implementation. Coverage beyond first-row is
not yet bundled; ``compute_gcp`` raises :class:`GCPDataMissing` with
a contribute-via-PR pointer when called on an unsupported element /
basis combination.

See :mod:`vibeqc.composites` for the keyword shortcuts (``hf-3c``,
``pbeh-3c``, ``b97-3c``, etc.) that wire gCP into a single-call
turnkey 3c run via :func:`vibeqc.run_job`.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

import numpy as np

try:
    # Python 3.11+: tomllib is stdlib. vibe-qc requires Python ≥ 3.11
    # per pyproject.toml so the import is unconditional in practice.
    import tomllib as _tomllib
except ImportError:  # pragma: no cover
    import tomli as _tomllib   # type: ignore[no-redef]

from ._vibeqc_core import Molecule
from ._gcp_slater import (
    PER_ELEMENT_SLATER_SHELL,
    effective_zeta,
    slater_overlap,
)


__all__ = [
    "GCPParams",
    "GCPResult",
    "GCPDataMissing",
    "compute_gcp",
    "gcp_params_for",
    "available_basis_sets",
]


class GCPDataMissing(ValueError):
    """Raised by :func:`compute_gcp` when an element or basis set falls
    outside the bundled gCP parameter coverage. Lists the missing
    (element, basis) pairs and points at the contribution pathway.
    """


@dataclass(frozen=True)
class GCPParams:
    """Geometric-counterpoise correction parameters for one basis set.

    Holds the four Kruse-Grimme fit constants (σ, η, α, β) plus the
    per-element (e_mis, n_virt) tables. Per-element keys are atomic
    numbers (1 = H, 6 = C, …). The per-element Slater-overlap exponent
    is *not* stored per-element — it is computed from the universal
    Clementi-Raimondi single-zeta table in :mod:`vibeqc._gcp_slater`
    scaled by η, matching the canonical mctc-gcp Fortran reference.

    Attributes
    ----------
    basis_name : str
        Lowercase canonical name of the target basis set (e.g.
        ``"def2-svp"``, ``"minix"``, ``"def2-mtzvp"``).
    sigma, eta, alpha, beta : float
        The four basis-set-specific fit constants from the per-recipe
        ``case`` block in mctc-gcp ``src/gcp.f90``::

            E_gCP = σ · Σ_a Σ_{b≠a}
                        exp(-α · R_ab^β) · e_a^MIS /
                        sqrt(N_b^virt · S_ab(η))

        where ``S_ab(η)`` is the shell-aware Slater overlap with
        per-element effective exponents ζ_X = η · (universal
        Clementi-Raimondi single-zeta). Dimensions: σ dimensionless,
        η dimensionless (a multiplicative Slater-exponent scaling),
        α has units of bohr^(−β), β dimensionless.
    e_mis : dict[int, float]
        Per-element "missing basis-set incompleteness" energy
        (Hartree). Positive in the stored convention; the gCP
        formula applies the negative sign internally.
    n_virt : dict[int, float]
        Per-element virtual-orbital count in the target basis. The
        denominator under the square root in the gCP formula; floating
        point so half-integer "fractional" definitions remain
        expressible.
    etaspec : float
        Extra heavier-element (Z ≥ 11) Slater-exponent scaling factor.
        Default ``1.0``; r²SCAN-3c sets this to ``1.15`` per the
        Grimme et al. 2021 recipe. Mirrors mctc-gcp's ``etaspec``
        parameter.
    citation : str
        One-line bibliographic pointer to the original publication
        introducing this parameter set. Surfaced in the SCF log under
        "Parameter provenance".
    """

    basis_name: str
    sigma: float
    alpha: float
    beta: float
    eta: float = 1.0
    e_mis: dict[int, float] = field(default_factory=dict)
    n_virt: dict[int, float] = field(default_factory=dict)
    etaspec: float = 1.0
    citation: str = ""

    def covers(self, atomic_numbers) -> tuple[list[int], list[int]]:
        """Return ``(supported, missing)`` element lists for the given
        atomic-number sequence. Used by :func:`compute_gcp` to surface
        coverage gaps before evaluating.

        An element is "supported" iff it has rows registered in **both**
        ``e_mis`` and ``n_virt`` (key existence, *not* nonzero values —
        the canonical mctc-gcp tables encode "this element-pair has
        zero gCP contribution" as ``e_mis = 0.0`` while still having
        the row registered; we honour that convention).
        """
        supported, missing = [], []
        for z in atomic_numbers:
            zi = int(z)
            if zi in self.e_mis and zi in self.n_virt:
                supported.append(zi)
            else:
                missing.append(zi)
        return supported, missing


[docs] @dataclass class GCPResult: """Return value of :func:`compute_gcp`. Attributes ---------- energy : float gCP correction energy (Hartree). Always non-negative — gCP opposes the BSSE over-binding of incomplete basis sets, so a positive correction is added to the SCF total. gradient : np.ndarray, optional ``(n_atoms, 3)`` nuclear gradient ∂E_gCP/∂R_A (Hartree/bohr). Only present when ``compute_gcp`` was called with ``with_gradient=True``; otherwise ``None``. basis_name : str Echo of the target basis set the correction was evaluated for. params : GCPParams The parameter set that was used. Recorded so callers can dump provenance into a log without a second lookup. """ energy: float basis_name: str params: GCPParams gradient: Optional[np.ndarray] = None def __repr__(self) -> str: return ( f"GCPResult(basis={self.basis_name!r}, " f"energy={self.energy:+.8e}" + (", gradient=<...>)" if self.gradient is not None else ")") )
# --------------------------------------------------------------------------- # Slater-type-orbital overlap is now implemented in :mod:`vibeqc._gcp_slater` # (shell-aware analytic form ported from the canonical mctc-gcp Fortran # reference at https://github.com/grimme-lab/gcp). The simpler 1s-1s # geometric-mean form used in the v0.9.0-prep first-cut has been # retired — it didn't match the parameter fit conventions Grimme et al. # used, so the bundled (σ, η, α, β) constants weren't usable with it. # Callers should use :func:`vibeqc._gcp_slater.slater_overlap` directly # if they need the overlap surface from outside this module. # # A 1s-1s convenience kept under the original name for downstream # tests / debugging — defers to the canonical implementation with # both shells pinned to 1. def _slater_overlap_1s_1s(zeta_a: float, zeta_b: float, R: float) -> float: """Compatibility wrapper for the original 1s-1s overlap surface. Delegates to :func:`vibeqc._gcp_slater.slater_overlap` with both shells = 1 (Roothaan 1951 ⟨1s|1s⟩ analytic form). Symmetric in (ζ_a, ζ_b), positive on (0, ∞), reduces to 1 at R = 0. """ if R < 1e-10: return 1.0 return slater_overlap(R, 1, 1, zeta_a, zeta_b) # --------------------------------------------------------------------------- # Parameter registry. Loaded from the TOML files in # ``python/vibeqc/data_library/gcp/`` — one file per basis, with # per-publication citation + DOI + license metadata in the file's # ``[metadata]`` block. See ``data_library/README.md`` for the full # storage standard, ``data_library/_schema.toml`` for the per-kind # schema, and ``data_library/gcp/_template.toml`` for a worked # template. # # The registry is built lazily at module-import time so the file I/O # only runs once per Python process; the dict comprehension below is # the entire loader. Adding a new basis is a single TOML file in # ``data_library/gcp/`` — no code change. # --------------------------------------------------------------------------- _GCP_DATA_DIR = Path(__file__).resolve().parent / "data_library" / "gcp" _ELEMENT_SYMBOLS_TO_Z = { "h": 1, "he": 2, "li": 3, "be": 4, "b": 5, "c": 6, "n": 7, "o": 8, "f": 9, "ne": 10, "na": 11, "mg": 12, "al": 13, "si": 14, "p": 15, "s": 16, "cl": 17, "ar": 18, "k": 19, "ca": 20, "sc": 21, "ti": 22, "v": 23, "cr": 24, "mn": 25, "fe": 26, "co": 27, "ni": 28, "cu": 29, "zn": 30, "ga": 31, "ge": 32, "as": 33, "se": 34, "br": 35, "kr": 36, "rb": 37, "sr": 38, "y": 39, "zr": 40, "nb": 41, "mo": 42, "tc": 43, "ru": 44, "rh": 45, "pd": 46, "ag": 47, "cd": 48, "in": 49, "sn": 50, "sb": 51, "te": 52, "i": 53, "xe": 54, "cs": 55, "ba": 56, "la": 57, "ce": 58, "pr": 59, "nd": 60, "pm": 61, "sm": 62, "eu": 63, "gd": 64, "tb": 65, "dy": 66, "ho": 67, "er": 68, "tm": 69, "yb": 70, "lu": 71, "hf": 72, "ta": 73, "w": 74, "re": 75, "os": 76, "ir": 77, "pt": 78, "au": 79, "hg": 80, "tl": 81, "pb": 82, "bi": 83, "po": 84, "at": 85, "rn": 86, } def _parse_gcp_toml(path: Path) -> GCPParams: """Parse one ``data_library/gcp/<name>.toml`` file into a :class:`GCPParams` instance. The schema is documented in ``data_library/README.md`` and ``data_library/_schema.toml``. The loader validates the ``metadata.kind`` discriminator and propagates the citation string into the returned ``GCPParams.citation`` field. Files with ``metadata.status = "pending"`` (no [elements.*] bodies populated yet) load cleanly into a ``GCPParams`` with empty per-element dicts; the eventual ``compute_gcp(..., basis)`` call raises :class:`GCPDataMissing` with the right contribute- via-PR pointer. """ with path.open("rb") as fh: doc = _tomllib.load(fh) meta = doc.get("metadata", {}) if meta.get("kind") != "gcp_parameters": raise ValueError( f"{path.name}: metadata.kind is {meta.get('kind')!r}, " f"expected 'gcp_parameters' (see data_library/_schema.toml)." ) basis_name = str(meta["name"]).lower() if path.stem.lower() != basis_name: raise ValueError( f"{path.name}: metadata.name={basis_name!r} does not match " f"file stem {path.stem!r}." ) params = doc.get("parameters", {}) sigma = float(params.get("sigma", 0.0)) alpha = float(params.get("alpha", 0.0)) beta = float(params.get("beta", 0.0)) # ``eta`` is optional for back-compat with v0.9.0-prep TOMLs that # only carried (sigma, alpha, beta); a missing eta defaults to 1.0 # which makes the gCP attenuator use un-scaled Clementi-Raimondi # Slater exponents. Bundled TOMLs from extract_gcp_data.py carry # the canonical η. eta = float(params.get("eta", 1.0)) # ``etaspec`` is the extra heavier-element scaling factor; only # set for r²SCAN-3c / def2-mTZVPP (where it = 1.15 per Grimme # 2021); default 1.0 elsewhere. etaspec = float(params.get("etaspec", 1.0)) e_mis: dict[int, float] = {} n_virt: dict[int, float] = {} for sym, block in doc.get("elements", {}).items(): z = _ELEMENT_SYMBOLS_TO_Z.get(sym.lower()) if z is None: raise ValueError( f"{path.name}: unrecognised element symbol {sym!r} in " f"[elements.*]." ) e_mis[z] = float(block["e_mis"]) n_virt[z] = float(block["n_virt"]) citation = str(meta.get("citation", "")) doi = str(meta.get("doi", "")) if doi: citation = f"{citation} (DOI: {doi})" if citation else f"DOI: {doi}" return GCPParams( basis_name=basis_name, sigma=sigma, eta=eta, alpha=alpha, beta=beta, e_mis=e_mis, n_virt=n_virt, etaspec=etaspec, citation=citation, ) def _load_gcp_registry() -> dict[str, GCPParams]: """Discover and parse every ``data_library/gcp/*.toml`` file. Files prefixed with ``_`` are treated as templates / schema and skipped. The result is cached at module-import time; new files require a process restart to register. """ out: dict[str, GCPParams] = {} if not _GCP_DATA_DIR.is_dir(): # Build-time / source-tree mismatch — surface clearly rather # than letting compute_gcp fail much later with an unhelpful # "no parameters registered" error. raise RuntimeError( f"vibeqc.gcp: data_library/gcp directory not found at " f"{_GCP_DATA_DIR}. Reinstall vibe-qc (`pip install -e .`) " f"to repopulate the parameter directory." ) for path in sorted(_GCP_DATA_DIR.glob("*.toml")): if path.name.startswith("_"): continue # _template.toml, _schema.toml, etc. params = _parse_gcp_toml(path) out[params.basis_name] = params return out _BASIS_PARAMS: dict[str, GCPParams] = _load_gcp_registry() def available_basis_sets() -> list[str]: """Return the lowercase basis-set names known to the gCP registry, sorted alphabetically. Useful for surface introspection (e.g. the ``vq.list_methods()`` companion of the composites surface). """ return sorted(_BASIS_PARAMS.keys()) def gcp_params_for(basis_name: str) -> Optional[GCPParams]: """Look up the gCP parameter set for a target basis. Case-insensitive, canonicalises ``"def2-SVP"`` / ``"DEF2-SVP"`` / ``"def2-svp"`` to the same registry entry. Returns ``None`` when the basis is not registered — callers decide whether that's an error. """ return _BASIS_PARAMS.get(basis_name.lower().strip()) def _ensure_full_data(params: GCPParams, atomic_numbers) -> None: """Raise GCPDataMissing if any atom's element is not covered by the parameter set's per-element tables. Run before evaluating to fail fast with a clear message rather than silently producing a wrong correction. """ if params.beta == 0.0 and params.alpha == 0.0 and params.sigma == 0.0: # Recipe-side intentional "no gCP" marker (r²SCAN-3c sets all # three to zero — gCP is absorbed into the def2-mTZVPP basis # contraction by construction). Treat as a no-op. return _, missing = params.covers(atomic_numbers) if missing: unique_missing = sorted(set(missing)) raise GCPDataMissing( f"compute_gcp(basis={params.basis_name!r}): per-element " f"gCP parameters not yet bundled for Z={unique_missing}.\n" f" Bundled coverage for this basis: " f"Z={sorted(params.e_mis.keys())} (e_mis), " f"Z={sorted(params.n_virt.keys())} (n_virt).\n" f" Reference parameter source: {params.citation or '(not recorded)'}.\n" f" Contribute the missing per-element data via PR — see " f"docs/user_guide/composites.md § 'Extending the gCP " f"coverage', or supply your own table via " f"vibeqc.GCPParams(...) and call compute_gcp(..., " f"params=<your_params>)." ) if not params.e_mis or not params.n_virt: raise GCPDataMissing( f"compute_gcp(basis={params.basis_name!r}): the (σ, η, α, β) " f"fit constants for this basis are registered, but the " f"per-element e_mis / n_virt tables have not yet " f"been bundled in this first-cut release.\n" f" Reference parameter source: {params.citation or '(not recorded)'}.\n" f" Workaround: pass an explicit ``params=GCPParams(...)`` " f"with the per-element tables filled in. See " f"docs/user_guide/composites.md § 'Extending the gCP " f"coverage'." )
[docs] def compute_gcp( mol: Molecule, basis_name: Optional[str] = None, *, params: Optional[GCPParams] = None, with_gradient: bool = False, ) -> GCPResult: """Geometric counterpoise correction (Kruse & Grimme 2012). Parameters ---------- mol :class:`vibeqc.Molecule` (atomic positions in bohr; the gCP evaluator works in bohr natively). basis_name Lowercase canonical name of the target basis set (e.g. ``"def2-svp"``, ``"minis"``, ``"def2-mtzvpp"``). Looked up in the bundled parameter registry. Mutually exclusive with ``params``. params Explicit :class:`GCPParams` instance — bypasses the registry lookup. Use this to plug in published per-element data for a basis that's not yet bundled, or to evaluate gCP with a custom parameter set. with_gradient If True, the returned :class:`GCPResult` carries an ``(n_atoms, 3)`` array of ∂E_gCP/∂R_A in Hartree/bohr. The gCP energy is a closed-form sum over atom pairs; the gradient is the analytic derivative of the same expression with respect to nuclear positions. Returns ------- GCPResult Raises ------ ValueError If neither ``basis_name`` nor ``params`` is given, or if both are. GCPDataMissing If the basis-set / element combination falls outside the bundled parameter coverage. The message lists the missing elements and points at the contribute-via-PR pathway. """ if (basis_name is None) == (params is None): raise ValueError( "compute_gcp: pass exactly one of ``basis_name`` (registry " "lookup) or ``params`` (explicit parameter set)." ) if params is None: params = gcp_params_for(basis_name) # type: ignore[arg-type] if params is None: known = ", ".join(available_basis_sets()) raise GCPDataMissing( f"compute_gcp(basis_name={basis_name!r}): no gCP " f"parameter set registered for this basis. Bundled " f"registry: [{known}]. Pass an explicit " f"``params=GCPParams(...)`` to evaluate with a custom " f"parameter set." ) atoms = list(mol.atoms) n = len(atoms) if n < 2: # Single-atom or empty system — no pairs, gCP is zero. return GCPResult( energy=0.0, basis_name=params.basis_name, params=params, gradient=(np.zeros((n, 3)) if with_gradient else None), ) atomic_numbers = [int(a.Z) for a in atoms] _ensure_full_data(params, atomic_numbers) # r²SCAN-3c case: the recipe sets (σ, α, β) all to zero to signal # "no gCP" — the basis contraction is tuned to absorb the BSSE. if params.sigma == 0.0 and params.alpha == 0.0 and params.beta == 0.0: return GCPResult( energy=0.0, basis_name=params.basis_name, params=params, gradient=(np.zeros((n, 3)) if with_gradient else None), ) positions = np.array([list(a.xyz) for a in atoms], dtype=float) # Pre-compute the per-element effective Slater exponents and shell # quantum numbers; these are O(n_atoms) and used in the O(n²) pair # loop below, so the precomputation is mandatory for performance on # larger systems. zeta = [ effective_zeta(z, eta=params.eta, etaspec=params.etaspec) for z in atomic_numbers ] n_shell = [PER_ELEMENT_SLATER_SHELL[z] for z in atomic_numbers] energy = 0.0 gradient = np.zeros((n, 3)) if with_gradient else None eps = 1e-12 # Double-sum over ordered pairs (a, b) with a ≠ b. The Kruse-Grimme # form is asymmetric in (a, b) — e_a^MIS appears with a's identity # in the numerator while N_b^virt and S_ab appear with b's. for a in range(n): za = atomic_numbers[a] e_mis_a = params.e_mis[za] if e_mis_a == 0.0: # Atom a contributes nothing — short-circuit the inner loop. continue zeta_a = zeta[a] shell_a = n_shell[a] for b in range(n): if b == a: continue zb = atomic_numbers[b] n_virt_b = params.n_virt[zb] if n_virt_b == 0.0: continue # avoid 1/0 — and a zero-virt atom adds nothing zeta_b = zeta[b] shell_b = n_shell[b] dvec = positions[a] - positions[b] R = float(np.linalg.norm(dvec)) if R < eps: # Atoms on top of each other — geometry pathology; skip # the pair rather than dividing by zero. The SCF would # also crash on this geometry; this just keeps gCP from # being the first complaint. continue S_ab = slater_overlap(R, shell_a, shell_b, zeta_a, zeta_b) S_safe = max(abs(S_ab), eps) denom = np.sqrt(n_virt_b * S_safe) attenuation = np.exp(-params.alpha * R ** params.beta) pair_contrib = ( params.sigma * attenuation * e_mis_a / denom ) energy += pair_contrib if with_gradient: # ∂/∂R_a of one (a, b) term: # pair = σ · e_a · g(R) / √(N_b · S(R)) # with g(R) = e^(-α R^β), and R = |R_a - R_b|. # The (σ · e_a / √N_b) prefactor is R-independent. The # radial derivative collects the two R-dependent pieces: # # d(pair)/dR = (σ · e_a / √N_b) · # [g'(R) · S^(-1/2) # - ½ · g(R) · S^(-3/2) · S'(R)] # # dS/dR comes from a central finite difference on the # shell-aware Slater overlap — the analytic derivative # branches by (n_a, n_b) pair so a single central FD is # both cleaner and accurate to ~1e-9 at h = 1e-4 R. dR = max(1e-5, 1e-4 * R) S_plus = slater_overlap(R + dR, shell_a, shell_b, zeta_a, zeta_b) S_minus = slater_overlap(max(R - dR, 1e-6), shell_a, shell_b, zeta_a, zeta_b) dS_dR = (S_plus - S_minus) / (2.0 * dR) dg_dR = (-params.alpha * params.beta * R ** (params.beta - 1.0) * attenuation) d_pair_dR = ( params.sigma * e_mis_a / np.sqrt(n_virt_b) * ( dg_dR / np.sqrt(S_safe) - 0.5 * attenuation * S_safe ** (-1.5) * dS_dR ) ) # Project radial derivative onto Cartesian: dR/dR_a = # (R_a − R_b)/R = dvec/R; dR/dR_b is the negative. d_pair_d_R_a = d_pair_dR * dvec / R gradient[a] += d_pair_d_R_a gradient[b] -= d_pair_d_R_a return GCPResult( energy=float(energy), basis_name=params.basis_name, params=params, gradient=gradient, )