Source code for vibeqc.ecp_metadata

"""Auto-populate :class:`ECPCenter` lists from a basis name + a Molecule.

Phase 14e of the libecpint integration: given a basis like ``lanl2dz``,
``dhf-tzvp``, etc. — one of the 13 ECP-bearing bases that
``scripts/basisset_dev/split_ecp_g94.py`` produced ``<name>.ecp``
sidecar files for — read the sidecar, extract per-element ``ncore``,
and produce a ready-to-go ``opts.ecp_centers`` + ``opts.ecp_library``
without making the user do the bookkeeping by hand.

Public surface:

* :func:`parse_sidecar_path(path)` — read one ``<name>.ecp`` file,
  return a list of :class:`EcpHeader` (one per element block).
* :func:`sidecar_path_for(basis_name)` — locate ``<name>.ecp`` next
  to ``<name>.g94`` under ``$LIBINT_DATA_PATH/basis/``. Returns
  ``None`` if the basis has no ECP sidecar (i.e. is all-electron).
* :func:`library_for(basis_name, ncore)` — pick the right libecpint
  XML library for a given basis-name + ncore combination. Returns
  ``None`` for non-standard ncores like vDZP's per-element customs.
* :func:`auto_ecp_centers(mol, basis_name, library_name=None)` — the
  one-call helper. Returns ``(ecp_centers, library_name)`` ready to
  drop into ``RHFOptions`` / ``UHFOptions`` / ``RKSOptions`` /
  ``UKSOptions``.

Caveats and known gaps (from `HANDOVER_DOCS_LIBECPINT.md`):

* **Mixed-library molecules unsupported in v0.7.x.** The C++ SCF
  drivers take a single ``ecp_library`` string; auto-population
  raises ``ValueError`` when the molecule's atoms span more than
  one libecpint XML library. Phase 14g (inline-primitive feed via
  ``set_ecp_basis(...)`` instead of ``set_ecp_basis_from_library``)
  removes this restriction.
* **Non-standard ncore values** (vDZP's B/C/N/etc with ``ncore=2``,
  ``ncore=3``, etc.) match no standard libecpint library; auto-
  population raises ``NotImplementedError`` with a Phase 14g
  pointer. vDZP's inline ECPs are valid data, just not yet
  consumable.
"""

from __future__ import annotations

import os
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Optional


# ---------- Element-symbol lookup -------------------------------------------
#
# BSE-style sidecars come in mixed-case (Na, Mg) AND all-caps (NA, MG)
# depending on the era of the source data (LANL files are all-caps,
# dhf is mixed). Normalise to the canonical title-case symbol on
# parse.

_SYMBOL_TO_Z: dict[str, int] = {
    sym: z for z, sym in enumerate([
        "",   "H",  "He",
        "Li", "Be", "B",  "C",  "N",  "O",  "F",  "Ne",
        "Na", "Mg", "Al", "Si", "P",  "S",  "Cl", "Ar",
        "K",  "Ca", "Sc", "Ti", "V",  "Cr", "Mn", "Fe",
        "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se",
        "Br", "Kr", "Rb", "Sr", "Y",  "Zr", "Nb", "Mo",
        "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn",
        "Sb", "Te", "I",  "Xe", "Cs", "Ba",
        "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd",
        "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf",
        "Ta", "W",  "Re", "Os", "Ir", "Pt", "Au", "Hg",
        "Tl", "Pb", "Bi", "Po", "At", "Rn",
        # Actinides (Z=87..103) covered for LANL08 / lanl2dz heavy
        # elements. Without these, lanl2dz.ecp's "U-ECP" line fails
        # the parse.
        "Fr", "Ra",
        "Ac", "Th", "Pa", "U",  "Np", "Pu", "Am", "Cm",
        "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr",
    ])
}


def _normalise_symbol(raw: str) -> str:
    """Map ``"NA"`` / ``"Na"`` / ``"na"`` to canonical ``"Na"``."""
    s = raw.strip()
    if not s:
        return s
    return s[0].upper() + s[1:].lower()


# ---------- Sidecar parser --------------------------------------------------

# BSE / NWChem ``.g94`` ECP-block convention:
#   <Sym>-ECP <lmax> <ncore>
# Confirmed against LANL2DZ sodium (NA-ECP 2 10 → lmax=2 d-projection,
# ncore=10 = [Ne]) and dhf-TZVP rubidium (RB-ECP 4 28 → lmax=4,
# ncore=28 = [Ar]3d¹⁰).
_ECP_HEADER_RE = re.compile(
    r"^\s*([A-Z][A-Za-z]?)-ECP\s+(\d+)\s+(\d+)\s*$"
)


@dataclass(frozen=True)
class EcpHeader:
    """One ``<Sym>-ECP <lmax> <ncore>`` row from a sidecar.

    Attributes
    ----------
    symbol : str
        Canonical title-case element symbol (``"Na"``, ``"Rb"``).
    Z : int
        Atomic number derived from ``symbol``.
    lmax : int
        Maximum angular momentum used in the ECP expansion.
    ncore : int
        Number of replaced core electrons. Subtract from atomic ``Z``
        to get the valence-electron count for the SCF.
    """
    symbol: str
    Z: int
    lmax: int
    ncore: int


def parse_sidecar_path(path: Path) -> list[EcpHeader]:
    """Parse a ``.ecp`` sidecar file. Returns one :class:`EcpHeader`
    per element block, in source order. Skips comments and blank
    lines; raises :class:`ValueError` on malformed headers.
    """
    text = Path(path).read_text(errors="replace")
    out: list[EcpHeader] = []
    for line in text.splitlines():
        m = _ECP_HEADER_RE.match(line)
        if not m:
            continue
        sym = _normalise_symbol(m.group(1))
        if sym not in _SYMBOL_TO_Z:
            raise ValueError(
                f"{path}: ECP header references unknown element "
                f"{m.group(1)!r}"
            )
        out.append(EcpHeader(
            symbol=sym, Z=_SYMBOL_TO_Z[sym],
            lmax=int(m.group(2)), ncore=int(m.group(3)),
        ))
    return out


def sidecar_path_for(basis_name: str) -> Optional[Path]:
    """Locate ``<basis_name>.ecp`` under ``$LIBINT_DATA_PATH/basis/``.

    Returns ``None`` when no sidecar exists (i.e. the basis is
    all-electron). The path resolution mirrors libint's: at vibe-qc
    import time ``__init__.py`` points ``$LIBINT_DATA_PATH`` at the
    bundled ``basis_library/`` directory; basis names resolve under
    ``<that>/basis/<name>.g94`` and we look for ``.ecp`` alongside.
    """
    root = os.environ.get("LIBINT_DATA_PATH")
    if not root:
        return None
    candidate = Path(root) / "basis" / f"{basis_name.lower()}.ecp"
    return candidate if candidate.is_file() else None


# ---------- Library-name resolution ----------------------------------------
#
# libecpint ships a fixed set of XML ECP libraries under
# ``python/vibeqc/ecp_library/xml/``: ecp10mdf, ecp28mdf, ecp46mdf,
# ecp60mdf, ecp78mdf, lanl2dz. The basis name + ncore tell us which
# one to point at.

# Standard ncore-keyed mapping for the Stuttgart-Köln MDF series.
# Used when the basis name itself isn't a libecpint library
# (e.g. dhf-tzvp + Rb has ncore=28 → ecp28mdf).
_NCORE_TO_MDF: dict[int, str] = {
    10: "ecp10mdf",
    28: "ecp28mdf",
    46: "ecp46mdf",
    60: "ecp60mdf",
    78: "ecp78mdf",
}

# Bases whose name maps directly to a libecpint XML library —
# ncore-irrelevant since the same XML covers every element in the
# basis. Add to this set as new ECP basis families ship XML.
_BASIS_TO_LIBRARY: dict[str, str] = {
    "lanl2dz":   "lanl2dz",
    "lanl2dzdp": "lanl2dz",
    "lanl2tz":   "lanl2dz",
    "lanl08":    "lanl2dz",
    "lanl08(d)": "lanl2dz",
    "lanl08(f)": "lanl2dz",
}


def library_for(basis_name: str, ncore: int) -> Optional[str]:
    """Pick the libecpint XML library name for ``(basis_name, ncore)``.

    Resolution order:

    1. If the basis name is one of the LANL family, use
       ``"lanl2dz"`` (every LANL element lives in that single XML).
    2. Otherwise consult :data:`_NCORE_TO_MDF` keyed on ``ncore``
       — covers def2-TZVP/QZVP heavy elements, dhf-*, x2c-*, and
       any other Stuttgart-Köln MDF-derived basis.
    3. Return ``None`` for non-standard ncores (vDZP's
       per-element customs, ``ncore=2``, ``ncore=3``, etc.).

    The ``None`` return is the auto-populator's "no standard
    library matches; user must provide an inline-primitive feed
    (Phase 14g)" signal.
    """
    name = basis_name.lower()
    if name in _BASIS_TO_LIBRARY:
        return _BASIS_TO_LIBRARY[name]
    return _NCORE_TO_MDF.get(ncore)


# ---------- The one-call helper --------------------------------------------


[docs] def auto_ecp_centers(mol, basis_name: str, library_name: Optional[str] = None) -> tuple: """Build ``(ecp_centers, library_name)`` from a Molecule + basis name. For each atom in ``mol`` whose Z appears in ``<basis_name>.ecp``, emits an ``ECPCenter(Z, xyz)`` and resolves the right libecpint XML library. The two returned values can be assigned directly to ``opts.ecp_centers`` and ``opts.ecp_library`` on any of the four molecular SCF Options classes. Parameters ---------- mol : vibeqc.Molecule The molecule whose heavy atoms need ECP centres built. basis_name : str Basis-set name as you'd pass to ``vq.BasisSet(mol, name)``. library_name : Optional[str] Override the auto-resolved libecpint XML library. Useful when a basis ships a custom XML the user dropped into ``$VIBEQC_ECP_SHARE_DIR/xml/``. Returns ------- (centers, library_name) : tuple[list[ECPCenter], str] Empty list and the input ``library_name`` (or ``""``) if the basis is all-electron; otherwise a populated list. Raises ------ ValueError When the molecule's heavy atoms would need MORE THAN ONE libecpint XML library (mixed-row case — common for dhf with a Rb–Cs span). Calls out Phase 14g as the unblocker. NotImplementedError When the basis has at least one ECP atom whose ncore doesn't match any standard libecpint library (vDZP-style customs). """ from . import ECPCenter # local: avoid hard-import for doc builds sidecar = sidecar_path_for(basis_name) if sidecar is None: # All-electron basis. Nothing to build; honour any explicit # library_name the caller already provided. return [], (library_name or "") headers = parse_sidecar_path(sidecar) by_z = {h.Z: h for h in headers} if not by_z: return [], (library_name or "") # Walk mol's atoms; emit centres for every ECP-bearing match. # ``Molecule.atoms`` is a property returning a list, not a method. centres: list = [] libraries_seen: set[str] = set() for atom in mol.atoms: h = by_z.get(int(atom.Z)) if h is None: continue # all-electron atom, skip ec = ECPCenter() ec.Z = h.Z ec.xyz = list(atom.xyz) centres.append(ec) if library_name is None: lib = library_for(basis_name, h.ncore) if lib is None: raise NotImplementedError( f"Basis {basis_name!r} has a non-standard ECP for " f"{h.symbol} (ncore={h.ncore}, lmax={h.lmax}) — no " "matching libecpint XML library is bundled. This " "is the vDZP / inline-primitive case; needs Phase " "14g (libecpint set_ecp_basis(...) direct feed) to " "use end-to-end. As a workaround, build " "ecp_centers manually and switch to a basis with a " "standard ECP (e.g. def2-TZVP + def2-ECP)." ) libraries_seen.add(lib) if library_name is None: if len(libraries_seen) == 0: # No mol atom matched a sidecar element — basis carries # ECPs but the molecule doesn't need any. Fine; return # empty list + empty library. return [], "" if len(libraries_seen) > 1: raise ValueError( f"Basis {basis_name!r} would need {sorted(libraries_seen)} " "libecpint libraries to cover this molecule (mixed-row " "ECP), but vibe-qc's SCF drivers accept exactly one " "ecp_library per call. Split the molecule into " "single-library subsets, or wait for Phase 14g (one " "library per atom)." ) library_name = next(iter(libraries_seen)) return centres, library_name