Source code for vibeqc.crash_dump

"""Crash / abort dump for vibe-qc SCF failures.

When an SCF blows up (raised exception, NaN in the density, linear
dependence, max-iter exceeded), the user's job has already done a lot
of work that's about to be thrown away. Without a dump, the only
record is the exception text the driver re-raises. Re-running from
scratch to debug — recompute the basis, redo five iterations of SCF,
re-trigger the failure — wastes 90% of the original wall time.

The crash dump is the equivalent of Gaussian's "link death" output:
on failure, snapshot the last known SCF state to disk so the user can
attach two files (``output.dump`` + ``output.dump.density.npy``) plus
their input script to a bug report and the maintainer can reconstruct
the exact failing state without rerunning.

Output layout
-------------

For ``out_stem = "output-h2o"`` we write up to four files:

* ``output-h2o.dump`` — TOML-formatted snapshot with sections
  ``[crash]`` (when, phase, exception text, n_iters_completed),
  ``[scf.last_iter]`` (iter, energy, dE, grad_norm, diis_subspace),
  ``[geometry]`` (atom-by-atom Z + xyz + charge + multiplicity),
  ``[options]`` (the SCF options struct), ``[hint]`` (heuristic
  ``likely_cause`` based on the exception type).

* ``output-h2o.dump.density.npy`` — last-iteration density matrix
  (only when present in ``state``).

* ``output-h2o.dump.fock.npy`` — last-iteration Fock matrix
  (only when present in ``state``).

* ``output-h2o.dump.mo.npy`` — current MO coefficients
  (only when present in ``state``).

The TOML is hand-rolled (matches the convention in
:mod:`vibeqc.system_info`); round-tripping is verified in the test
suite via stdlib :mod:`tomllib`.

Public surface
--------------

.. autofunction:: dump_on_failure
.. autofunction:: load_dump
.. autofunction:: classify_failure
"""

from __future__ import annotations

import contextvars
import datetime as _dt
import math
import os
import sys
from contextlib import contextmanager
from pathlib import Path
from typing import Any, Iterator, Optional, Union


__all__ = [
    "active_crash_dump_stem",
    "classify_failure",
    "crash_dump_context",
    "dump_on_failure",
    "load_dump",
]


# ---------------------------------------------------------------------------
# Heuristic: map an exception to a one-line "likely_cause" hint.
# ---------------------------------------------------------------------------

_HINTS: tuple[tuple[str, str], ...] = (
    ("nan in density",
     "DIIS instability — try damping=0.5, level_shift=0.5, "
     "or DIIS=False to fall back to plain Roothaan iterations."),
    ("nan",
     "Numerical blow-up — likely DIIS or quasi-Newton step ran into "
     "an indefinite Hessian. Try a smaller initial step or disable "
     "the quadratic fallback."),
    ("linear depend",
     "Severe basis-set linear dependence — switch to canonical "
     "orthogonalisation or remove diffuse functions. See "
     "vibeqc.linear_dependence.check_linear_dependence."),
    ("singular",
     "Singular Fock or overlap matrix — same root cause as linear "
     "dependence. Try canonical_orthog=True in the options struct."),
    ("max_iter",
     "SCF did not converge in max_iter steps — try damping=0.3, "
     "level_shift=0.5, or increase max_iter. Run again with "
     "structured_log=True to inspect convergence behavior."),
    ("converge",
     "SCF did not converge — try damping=0.3, level_shift=0.5, or "
     "increase max_iter."),
    ("memory",
     "Out-of-memory during SCF — try a smaller basis, fewer threads, "
     "or set memory_override=False to abort earlier with a clearer "
     "estimate."),
    ("ecp",
     "ECP backend failure — verify the ECP library matches the basis "
     "(libecpint XML library 'ecp10mdf' / 'ecp28mdf' / ...)."),
)


[docs] def classify_failure(exc: BaseException, *, phase: Optional[str] = None) -> str: """Return a one-line ``likely_cause`` string for the dump's ``[hint]`` section. Best-effort: matches keywords in the exception message and type name; falls back to a generic hint when no keyword fires. """ try: text = str(exc).lower() except Exception: text = "" type_name = type(exc).__name__.lower() haystack = f"{type_name} {text} {(phase or '').lower()}" for needle, hint in _HINTS: if needle in haystack: return hint return ( "No specific hint matched. Inspect the [scf.last_iter] block " "and the .density.npy attachment. Re-run with " "structured_log=True for an iter-by-iter trace." )
# --------------------------------------------------------------------------- # State-collection helpers. # --------------------------------------------------------------------------- def _atom_geometry(molecule: Any) -> list[dict[str, Any]]: """Per-atom geometry dump as a list of inline-table-ready dicts.""" rows: list[dict[str, Any]] = [] if molecule is None: return rows for atom in getattr(molecule, "atoms", []) or []: xyz = list(getattr(atom, "xyz", (0.0, 0.0, 0.0))) rows.append({ "Z": int(getattr(atom, "Z", 0)), "x": float(xyz[0]) if len(xyz) > 0 else 0.0, "y": float(xyz[1]) if len(xyz) > 1 else 0.0, "z": float(xyz[2]) if len(xyz) > 2 else 0.0, }) return rows def _options_dict(options: Any) -> dict[str, Any]: """Best-effort dump of an options struct. The C++ option classes (``RHFOptions`` etc.) are ``def_readwrite`` pybind11 wrappers — we walk public attributes and keep only primitive values that round-trip through TOML cleanly. """ if options is None: return {} out: dict[str, Any] = {} for name in dir(options): if name.startswith("_"): continue try: value = getattr(options, name) except Exception: continue if callable(value): continue if isinstance(value, (bool, int, float, str)): out[name] = value return out def _last_iter_dict(scf_trace: Any) -> dict[str, Any]: """Extract the final SCFIteration record (or whatever shape the trace carries) into a plain dict. Empty dict when the trace is empty or absent.""" try: trace = list(scf_trace) if scf_trace is not None else [] except TypeError: return {} if not trace: return {} last = trace[-1] fields = ("iter", "energy", "delta_e", "grad_norm", "diis_subspace") out: dict[str, Any] = {} for f in fields: if hasattr(last, f): out[f] = getattr(last, f) elif isinstance(last, dict) and f in last: out[f] = last[f] # Normalize numeric fields. if "iter" in out: out["iter"] = int(out["iter"]) if "diis_subspace" in out: out["diis_subspace"] = int(out["diis_subspace"]) for k in ("energy", "delta_e", "grad_norm"): if k in out: try: out[k] = float(out[k]) except (TypeError, ValueError): out.pop(k, None) return out # --------------------------------------------------------------------------- # TOML emit — same hand-rolled pattern as system_info. # --------------------------------------------------------------------------- def _toml_str(s: str) -> str: """Quote a string as a TOML basic string. Escapes ``\\`` and ``"`` plus control chars per the TOML 1.0 spec.""" out = [] for ch in s: if ch == "\\": out.append("\\\\") elif ch == '"': out.append('\\"') elif ch == "\n": out.append("\\n") elif ch == "\r": out.append("\\r") elif ch == "\t": out.append("\\t") elif ord(ch) < 0x20: out.append(f"\\u{ord(ch):04X}") else: out.append(ch) return '"' + "".join(out) + '"' def _toml_value(v: Any) -> str: if isinstance(v, bool): return "true" if v else "false" if isinstance(v, int): return str(v) if isinstance(v, float): # TOML 1.0 has no NaN/Inf in tables — round-trip safety: emit # them as ``nan`` / ``inf`` literals (which are valid TOML floats # per spec), but ``tomllib.loads`` returns Python floats with # the right finiteness. ``-inf`` is valid too. if math.isnan(v): return "nan" if math.isinf(v): return "inf" if v > 0 else "-inf" s = repr(v) if "." not in s and "e" not in s and "E" not in s: s += ".0" return s return _toml_str(str(v)) def _format_simple_section(title: str, data: dict[str, Any]) -> list[str]: """Render a single ``[title]`` section with key = value lines.""" lines = [f"[{title}]"] for key, value in data.items(): lines.append(f"{key} = {_toml_value(value)}") lines.append("") return lines def _format_atoms_table(rows: list[dict[str, Any]]) -> list[str]: """Render the geometry as a TOML array of inline tables.""" lines = ["[geometry]"] if not rows: lines.append("atoms = []") lines.append("") return lines lines.append("atoms = [") for r in rows: lines.append( f" {{ Z = {int(r['Z'])}, " f"x = {_toml_value(r['x'])}, " f"y = {_toml_value(r['y'])}, " f"z = {_toml_value(r['z'])} }}," ) lines.append("]") lines.append("") return lines # --------------------------------------------------------------------------- # dump_on_failure — the entry point. # ---------------------------------------------------------------------------
[docs] def dump_on_failure( out_stem: Union[str, os.PathLike], exc: Optional[BaseException], state: dict[str, Any], *, phase: Optional[str] = None, options: Any = None, molecule: Any = None, ) -> Path: """Write ``{out_stem}.dump`` (TOML) plus binary attachments capturing the last known SCF state. Parameters ---------- out_stem Path stem identifying the failing job — typically the same stem ``run_job`` was called with. ``.dump`` and friends are appended. exc The exception that triggered the dump. May be ``None`` — used when the dump fires from a non-exceptional max-iter path. state Dictionary carrying as much of the last SCF state as the caller has on hand. Recognized keys (all optional): * ``"scf_trace"`` — list of SCFIteration-like records; the last entry is rendered into ``[scf.last_iter]``. * ``"density"`` / ``"density_alpha"`` / ``"density_beta"`` — numpy arrays, side-loaded as ``.dump.density.npy`` / ``.dump.density_alpha.npy`` / ``.dump.density_beta.npy``. * ``"fock"`` / ``"fock_alpha"`` / ``"fock_beta"`` — numpy arrays, side-loaded similarly. * ``"mo_coeffs"`` / ``"mo_coeffs_alpha"`` / ``"mo_coeffs_beta"`` — numpy arrays, side-loaded as ``.dump.mo.npy`` / ``.dump.mo_alpha.npy`` / ``.dump.mo_beta.npy``. * ``"phase"`` — overridden by the ``phase`` kwarg when set. * ``"n_iters_completed"`` — explicit override; otherwise inferred from the trace length. phase Short string identifying where the failure happened (e.g. ``"scf_iteration_5"``, ``"basis_set_construction"``). When ``None``, derived from the last trace entry or set to ``"unknown"``. options SCF options struct. Walked via ``dir()`` and only primitive-typed attributes survive. molecule :class:`Molecule` (or shape-compatible) — used to dump the ``[geometry]`` section. Returns ------- pathlib.Path Path to the written ``.dump`` file (the binary attachments live next to it as ``.dump.density.npy``, ``.dump.fock.npy``, …). Notes ----- Never raises. A dump-write failure prints a one-line warning to stderr and returns the (would-be) path so the caller's re-raise path is unchanged. """ stem = Path(os.fspath(out_stem)) dump_path = stem.with_suffix(".dump") # --- Render the TOML body ------------------------------------------- trace = state.get("scf_trace") last = _last_iter_dict(trace) n_iters_completed = state.get("n_iters_completed") if n_iters_completed is None: try: n_iters_completed = len(list(trace)) if trace is not None else 0 except TypeError: n_iters_completed = 0 resolved_phase = phase or state.get("phase") or ( f"scf_iteration_{int(last['iter'])}" if "iter" in last else "unknown" ) crash_section = { "when": _dt.datetime.now().astimezone().isoformat(timespec="seconds"), "phase": str(resolved_phase), "exception_type": type(exc).__name__ if exc is not None else "", "exception": str(exc) if exc is not None else "", "n_iters_completed": int(n_iters_completed), } lines: list[str] = [ "# vibe-qc crash dump — written when an SCF fails ungracefully.", "# Pair this file with the .dump.density.npy attachment and your", "# input script to reproduce the exact failing state via", "# vibeqc.load_dump('output.dump').", "", ] lines.extend(_format_simple_section("crash", crash_section)) if last: lines.extend(_format_simple_section("scf.last_iter", last)) lines.extend(_format_atoms_table(_atom_geometry(molecule))) if molecule is not None: lines.extend(_format_simple_section("molecule", { "charge": int(getattr(molecule, "charge", 0)), "multiplicity": int(getattr(molecule, "multiplicity", 1)), "n_atoms": int(len(list(getattr(molecule, "atoms", []) or []))), })) opts_dict = _options_dict(options) if opts_dict: lines.extend(_format_simple_section("options", opts_dict)) hint_text = classify_failure(exc, phase=resolved_phase) if exc is not None \ else "SCF reached max_iter without converging — try damping or " \ "level_shift adjustments." lines.extend(_format_simple_section("hint", {"likely_cause": hint_text})) # --- Side-load binary attachments ----------------------------------- attachments: list[tuple[str, Any]] = [] for key, suffix in ( ("density", "density"), ("density_alpha", "density_alpha"), ("density_beta", "density_beta"), ("fock", "fock"), ("fock_alpha", "fock_alpha"), ("fock_beta", "fock_beta"), ("mo_coeffs", "mo"), ("mo_coeffs_alpha", "mo_alpha"), ("mo_coeffs_beta", "mo_beta"), ): arr = state.get(key) if arr is None: continue attachments.append((suffix, arr)) written_paths: list[str] = [] for suffix, arr in attachments: side_path = dump_path.with_name(f"{dump_path.name}.{suffix}.npy") try: import numpy as np np.save(side_path, np.asarray(arr)) written_paths.append(side_path.name) except Exception as exc2: print( f"vibeqc.crash_dump: warning — could not write " f"{side_path}: {exc2}", file=sys.stderr, ) if written_paths: lines.extend(_format_simple_section("attachments", { "files": ", ".join(written_paths), })) # --- Write the .dump itself ----------------------------------------- try: dump_path.parent.mkdir(parents=True, exist_ok=True) dump_path.write_text("\n".join(lines), encoding="utf-8") except OSError as exc2: print( f"vibeqc.crash_dump: warning — could not write {dump_path}: " f"{exc2}", file=sys.stderr, ) return dump_path
# --------------------------------------------------------------------------- # load_dump — read a dump file back, re-attaching numpy arrays. # ---------------------------------------------------------------------------
[docs] def load_dump(path: Union[str, os.PathLike]) -> dict[str, Any]: """Parse a ``.dump`` TOML file and load its sibling ``.npy`` attachments. Returns a nested dict matching the TOML sections, with an extra ``"arrays"`` key carrying a ``{name: numpy.ndarray}`` map of every sibling ``.dump.<name>.npy`` file present. """ try: import tomllib except ImportError: # pragma: no cover — Python <3.11 import tomli as tomllib # type: ignore[import-not-found] p = Path(os.fspath(path)) with open(p, "rb") as fh: parsed: dict[str, Any] = tomllib.load(fh) arrays: dict[str, Any] = {} try: import numpy as np prefix = p.name + "." for sibling in p.parent.iterdir(): if not sibling.name.startswith(prefix): continue if not sibling.name.endswith(".npy"): continue label = sibling.name[len(prefix):-len(".npy")] try: arrays[label] = np.load(sibling) except Exception: continue except ImportError: pass parsed["arrays"] = arrays return parsed
# --------------------------------------------------------------------------- # Crash-dump context (contextvar plumbing) — let downstream entry points # fire a dump on max-iter without taking an explicit kwarg. # --------------------------------------------------------------------------- _active_stem: contextvars.ContextVar[Optional[Path]] = ( contextvars.ContextVar("vibeqc_crash_dump_stem", default=None) ) def active_crash_dump_stem() -> Optional[Path]: """The currently registered crash-dump stem, or ``None`` if no :func:`crash_dump_context` block is open. Lower-level SCF entry points consult this on max-iter to decide whether to side-load a dump.""" return _active_stem.get()
[docs] @contextmanager def crash_dump_context( out_stem: Union[str, os.PathLike, None], *, enabled: bool = True, ) -> Iterator[Optional[Path]]: """Register a crash-dump stem for the duration of the block. Parameters ---------- out_stem Path stem to use for any dump emitted from inside the block. ``None`` disables — the contextvar is left unset. enabled When ``False``, behaves the same as ``out_stem=None``. The companion env var ``VIBEQC_NO_CRASH_DUMP=1`` *also* disables even when an explicit stem is passed — matches the ``VIBEQC_NO_HOSTNAME`` opt-out spelling. ``run_job`` calls :func:`crash_dump_context` automatically with the resolved opt-out semantics; advanced callers using this directly should likewise honour the env var, or let :func:`dump_on_failure` callers do so. """ env_off = os.environ.get("VIBEQC_NO_CRASH_DUMP", "").strip().lower() if env_off in ("1", "true", "yes", "on"): enabled = False target: Optional[Path] = None if enabled and out_stem is not None: target = Path(os.fspath(out_stem)) token = _active_stem.set(target) try: yield target finally: _active_stem.reset(token)