# Source code for vibeqc.neb

"""Nudged Elastic Band (NEB) -- minimum energy path finder.

Public surface
==============

Path construction (Increment 1):

* :class:`NEBImage`, :class:`NEBPath`
* :func:`interpolate_linear`, :func:`interpolate_idpp`

Driver:

* :func:`run_neb` -- improved-tangent NEB with parallel per-image
  SCFs and a quick-min outer loop on the concatenated NEB force.
  Supports molecular and periodic endpoints, climbing image
  (``climbing_image=True``), density warm-start across outer
  iterations (all four SCF methods, molecular + periodic), and
  DFT+U.
* :class:`NEBResult` -- converged path + energies + transition-state
  index + iteration count; ``write_qvf`` for vibe-view rendering.

Both interpolators accept either :class:`vibeqc.Molecule` or
:class:`vibeqc.PeriodicSystem`. For periodic systems, IDPP pair
distances and the NEB force (tangent + spring) use the minimum-image
convention, so a band whose images straddle a cell boundary -- e.g.
an adatom hopping across the PBC in surface self-diffusion --
interpolates and relaxes along the short, through-the-boundary path
rather than being dragged across the cell. (``interpolate_linear``
is a plain Cartesian straight line; use IDPP for cross-boundary
hops.) The single-round minimum image is exact for orthorhombic
cells and the standard close approximation for mildly skewed ones.

Periodic per-image gradients are computed by central differences
(6N + 1 BIPOLE SCFs per image per outer iteration): the J^LR
(reciprocal-Ewald) contribution is still missing from the analytic
BIPOLE gradient, so the FD fallback is used to keep saddle-point
forces honest. Switching to the analytic gradient once J^LR lands
is the headline periodic-NEB optimisation -- see
``docs/user_guide/neb.md``.

References
==========
* Henkelman & Jónsson, "Improved tangent estimate in the nudged
  elastic band method for finding minimum energy paths and saddle
  points", J. Chem. Phys. 113, 9978 (2000). doi:10.1063/1.1323224.
* Smidstrup, Pedersen, Stokbro, Jónsson,
  "Improved initial guess for minimum energy path calculations",
  J. Chem. Phys. 140, 214106 (2014). doi:10.1063/1.4878664.
"""

from __future__ import annotations

import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, List, Optional, Sequence, Union

import numpy as np
from scipy.optimize import minimize

from ._vibeqc_core import Atom, BasisSet, Molecule, PeriodicSystem
from .output import (
    OutputPlan,
    dry_run_manifest,
    is_dry_run_estimate_requested,
    is_dry_run_requested,
)

System = Union[Molecule, PeriodicSystem]


class NEBImageSCFError(RuntimeError):
    """Signal that one image's SCF failed to converge during a NEB run.

    A non-converged image cannot be used to advance the band: its energy
    carries no meaning and its gradient is ill-defined (the C++ gradient
    builders refuse a non-converged density outright). The NEB driver
    raises this exception so the user sees a clear, actionable message
    that names the offending image and its geometry, instead of a cryptic
    ``RuntimeError: ... not converged`` bubbling up from deep inside the
    gradient code (2026-05-31 audit, F2).
    """


def _nonconverged_image_error(
    method: str,
    scf_result: Any,
    image_index: Optional[int],
    positions: np.ndarray,
) -> "NEBImageSCFError":
    """Assemble the :class:`NEBImageSCFError` for an image whose SCF failed.

    Parameters
    ----------
    method
        SCF method name; it is upper-cased for the message.
    scf_result
        The SCF result object. Its ``n_iter`` attribute, when present, is
        quoted in the message.
    image_index
        Index of the failing image, or ``None`` when it is unknown.
    positions
        Geometry of the failing image; printed with 4-digit precision
        under a "Geometry (bohr)" heading.

    Returns
    -------
    NEBImageSCFError
        The exception instance (returned, not raised).
    """
    iteration_count = getattr(scf_result, "n_iter", None)
    location = "an image" if image_index is None else f"image {image_index}"
    iteration_note = (
        "" if iteration_count is None
        else f" after {iteration_count} iterations"
    )
    geometry_text = np.array2string(
        np.asarray(positions, dtype=float), precision=4
    )
    label = method.upper()
    message = (
        f"NEB {location}: the {label} SCF did not converge{iteration_note}. "
        "A non-converged density yields no valid energy or gradient, so "
        "the band evaluation was aborted at this image. Raise the per-image "
        f"SCF iteration limit ({label}Options(max_iter=...), default "
        "100) or improve the initial interpolation (more images / IDPP). "
        f"Geometry (bohr):\n{geometry_text}"
    )
    return NEBImageSCFError(message)


# ---------------------------------------------------------------------------
# Dataclasses
# ---------------------------------------------------------------------------


@dataclass
class NEBImage:
    """A single geometry (image) along a NEB path.

    The NEB driver fills in ``energy``, ``gradient`` and ``tangent`` while
    it optimises the band. Images that come straight out of an
    interpolator (Increment 1) carry ``None`` for all three.
    """

    # The geometry itself (molecular or periodic).
    system: System
    # Driver-populated quantities; ``None`` until the image is evaluated.
    energy: Optional[float] = None
    gradient: Optional[np.ndarray] = None  # Ha/bohr, shape (n_atoms, 3)
    tangent: Optional[np.ndarray] = None  # unit-norm, shape (n_atoms, 3)


@dataclass
class NEBPath:
    """An ordered sequence of NEB images from reactant to product.

    The first image is the reactant and the last is the product (both
    held fixed by default during optimisation). ``spring_constant`` is
    expressed in Ha/bohr^2. ``climbing_image_index`` stays ``None`` for
    standard NEB; the CI-NEB driver points it at the highest-energy
    intermediate once the warm-up phase is over.
    """

    images: List[NEBImage]
    spring_constant: float = 0.1
    climbing_image_index: Optional[int] = None

    @property
    def n_images(self) -> int:
        """Total number of images, endpoints included."""
        return len(self.images)

    @property
    def n_intermediate(self) -> int:
        """Number of images strictly between the endpoints (never negative)."""
        interior = len(self.images) - 2
        return interior if interior > 0 else 0

    def energies(self) -> np.ndarray:
        """Return the per-image energies; unset energies appear as NaN."""
        values = np.full(len(self.images), np.nan, dtype=float)
        for slot, image in enumerate(self.images):
            if image.energy is not None:
                values[slot] = image.energy
        return values


# ---------------------------------------------------------------------------
# Geometry helpers
# ---------------------------------------------------------------------------


def _atom_iter(system: System):
    """Return the atom container: ``atoms`` for a Molecule, else ``unit_cell``."""
    if isinstance(system, Molecule):
        return system.atoms
    return system.unit_cell


def _positions_of(system: System) -> np.ndarray:
    """Collect every atom's ``xyz`` into one float array, one row per atom."""
    rows = [list(atom.xyz) for atom in _atom_iter(system)]
    return np.array(rows, dtype=float)


def _atomic_numbers_of(system: System) -> np.ndarray:
    """Collect every atom's ``Z`` into an integer array, in atom order."""
    charges = [int(atom.Z) for atom in _atom_iter(system)]
    return np.array(charges, dtype=int)


def _rebuild_with_positions(template: System, positions: np.ndarray) -> System:
    """Clone ``template`` with its atoms moved to Cartesian ``positions`` (bohr).

    Atomic numbers, charge and multiplicity are taken from ``template``;
    a periodic template additionally passes on its ``dim`` and lattice.
    Atoms and position rows are paired in order.
    """
    molecular = isinstance(template, Molecule)
    source_atoms = template.atoms if molecular else template.unit_cell
    moved_atoms = [
        Atom(int(atom.Z), list(xyz))
        for atom, xyz in zip(source_atoms, positions)
    ]
    if molecular:
        return Molecule(moved_atoms, template.charge, template.multiplicity)
    return PeriodicSystem(
        template.dim,
        np.asarray(template.lattice, dtype=float),
        moved_atoms,
        charge=template.charge,
        multiplicity=template.multiplicity,
    )


def _check_compatible(reactant: System, product: System) -> None:
    if type(reactant) is not type(product):
        raise ValueError(
            "reactant and product must be the same system type; got "
            f"{type(reactant).__name__} and {type(product).__name__}"
        )
    zr = _atomic_numbers_of(reactant)
    zp = _atomic_numbers_of(product)
    if zr.shape != zp.shape or not np.array_equal(zr, zp):
        raise ValueError(
            "reactant and product must have matching atomic-number "
            "sequences in the same order (NEB does not reorder atoms; "
            "pre-align if needed)."
        )
    if isinstance(reactant, PeriodicSystem):
        lr = np.asarray(reactant.lattice, dtype=float)
        lp = np.asarray(product.lattice, dtype=float)
        if not np.allclose(lr, lp):
            raise ValueError(
                "reactant and product must share the same lattice -- "
                "variable-cell NEB is out of scope. Fix the cell to the "
                "reactant's lattice before constructing endpoints."
            )


# ---------------------------------------------------------------------------
# Linear interpolation
# ---------------------------------------------------------------------------


def interpolate_linear(
    reactant: System,
    product: System,
    n_images: int,
) -> List[System]:
    """Build a straight-line Cartesian path between two endpoints.

    Parameters
    ----------
    reactant, product
        Endpoints of the same type (Molecule or PeriodicSystem) with
        identical atom ordering; periodic endpoints must share a lattice.
    n_images
        How many *intermediate* images to create. The result holds
        ``n_images + 2`` entries because both endpoints are included.

    Returns
    -------
    list[System]
        ``[reactant, img_1, ..., img_n, product]``. The endpoints are the
        very objects passed in, not copies.

    Raises
    ------
    ValueError
        If ``n_images`` is negative or the endpoints are incompatible.
    """
    if n_images < 0:
        raise ValueError(f"n_images must be >= 0; got {n_images}")
    _check_compatible(reactant, product)

    start = _positions_of(reactant)
    end = _positions_of(product)
    # Image k sits at fraction k / (n_images + 1) of the way to the product.
    fractions = (k / (n_images + 1) for k in range(1, n_images + 1))
    interior = [
        _rebuild_with_positions(reactant, (1.0 - t) * start + t * end)
        for t in fractions
    ]
    return [reactant, *interior, product]


# ---------------------------------------------------------------------------
# IDPP interpolation -- Smidstrup et al. 2014
# ---------------------------------------------------------------------------
#
# For each intermediate image i = 1..n_images we define a target
# pair-distance matrix
#
#     d^(i)_{jk} = (1 - t_i) * d^(R)_{jk} + t_i * d^(P)_{jk}
#
# (linear in pair-distance space between reactant and product). We
# then minimise the image-dependent pair-potential objective
#
#     S^(i)(R) = sum_{j<k}  (d^(i)_{jk} - r_{jk}(R))^2 / r_{jk}(R)^4
#
# over each image's Cartesian coordinates independently. The 1/r^4
# weighting drives images strongly to relieve close atom-atom
# contacts while still tracking the interpolated distance manifold.
# Analytic gradient is provided to L-BFGS-B for speed.


def _minimum_image_diff(
    diff: np.ndarray, lattice: np.ndarray, dim: int
) -> np.ndarray:
    """Wrap Cartesian displacements ``diff`` (..., 3) to the minimum image.

    The first ``dim`` rows of ``lattice`` are the periodic lattice
    vectors. Each displacement is reduced by the integer combination of
    those vectors nearest to it (single-round convention: exact for
    orthorhombic cells, the standard close approximation for mildly
    skewed ones -- adequate for the slab + adsorbate target). Directions
    outside the periodic subspace (e.g. the vacuum axis of a ``dim < 3``
    slab) are left untouched because they lie outside the row space of
    ``lattice[:dim]``.
    """
    Lp = np.asarray(lattice, dtype=float)[:dim]      # (dim, 3)
    frac = diff @ np.linalg.pinv(Lp)                 # (..., dim)
    return diff - np.round(frac) @ Lp                # (..., 3)


def _pair_distance_matrix(
    positions: np.ndarray,
    lattice: Optional[np.ndarray] = None,
    dim: int = 3,
) -> np.ndarray:
    diff = positions[:, None, :] - positions[None, :, :]
    if lattice is not None:
        diff = _minimum_image_diff(diff, lattice, dim)
    return np.sqrt(np.einsum("ijc,ijc->ij", diff, diff))


def _idpp_value_and_grad(
    flat: np.ndarray,
    target: np.ndarray,
    n_atoms: int,
    lattice: Optional[np.ndarray] = None,
    dim: int = 3,
) -> tuple[float, np.ndarray]:
    positions = flat.reshape(n_atoms, 3)
    diff = positions[:, None, :] - positions[None, :, :]
    if lattice is not None:
        # Minimum-image displacements so a pair interacting across the PBC
        # tracks its nearest image, not the in-cell Cartesian vector.
        # round() is locally constant => d(mic diff)/dR = ddiff/dR, so the
        # analytic gradient below is unchanged apart from operating on the
        # wrapped displacement.
        diff = _minimum_image_diff(diff, lattice, dim)
    r2 = np.einsum("ijc,ijc->ij", diff, diff)
    # Off-diagonal mask; we never read the diagonal because it's masked
    # out of every aggregation below.
    mask = ~np.eye(n_atoms, dtype=bool)
    # Replace diagonal r2=0 with 1.0 so divisions are finite -- masked
    # back to zero before any sum. Also floor off-diagonal r2 at a
    # small positive value so an L-BFGS-B step that briefly drives two
    # atoms onto each other produces a finite (very large, repulsive)
    # gradient instead of NaN -- the optimiser can then escape.
    _R_FLOOR2 = 1e-6  # bohr^2
    r2_off = np.maximum(r2, _R_FLOOR2)
    safe_r2 = np.where(mask, r2_off, 1.0)
    r = np.sqrt(safe_r2)
    inv_r2 = 1.0 / safe_r2
    inv_r4 = inv_r2 * inv_r2
    inv_r5 = inv_r4 / r
    delta = target - r  # (d - r)
    # Energy: 0.5 * sum over all (i,j) of (d-r)^2 / r^4 (symmetric ->
    # the 1/2 converts to the j<k sum).
    pair_energy = delta * delta * inv_r4
    energy = 0.5 * float(np.sum(np.where(mask, pair_energy, 0.0)))
    # d/dr [(d-r)^2 / r^4] = -2 (d-r) / r^4 - 4 (d-r)^2 / r^5
    dS_dr = -2.0 * delta * inv_r4 - 4.0 * delta * delta * inv_r5
    dS_dr = np.where(mask, dS_dr, 0.0)
    # Chain rule: dr_{jk}/dR_j = diff[j,k] / r[j,k], dr/dR_k = - of that.
    # grad[k] = sum_j dS_dr[k,j] / r[k,j] * diff[k,j]
    inv_r = np.where(mask, 1.0 / r, 0.0)
    factor = dS_dr * inv_r
    grad = np.einsum("ij,ijc->ic", factor, diff)
    return energy, grad.ravel()


def interpolate_idpp(
    reactant: System,
    product: System,
    n_images: int,
    *,
    max_iter: int = 1000,
    tol: float = 1e-5,
) -> List[System]:
    """Image-Dependent Pair Potential interpolation (Smidstrup 2014).

    Starts from the linear-Cartesian path and, image by image, minimises
    ``S^(i)(R) = sum_{j<k} (d^(i)_{jk} - r_{jk}(R))^2 / r_{jk}(R)^4``
    with L-BFGS-B. The target distances ``d^(i)`` are a linear blend of
    the reactant and product pair-distance matrices. The endpoints are
    passed through untouched.

    Parameters
    ----------
    reactant, product
        Compatible endpoints (see :func:`_check_compatible`).
    n_images
        Number of intermediate images; ``0`` returns just the endpoints.
    max_iter
        Forwarded to the optimiser as ``maxiter``.
    tol
        Forwarded to the optimiser as both ``gtol`` and ``ftol``.

    Raises
    ------
    ValueError
        If ``n_images`` is negative or the endpoints are incompatible.

    Cites: Smidstrup, Pedersen, Stokbro, Jónsson,
    J. Chem. Phys. 140, 214106 (2014). doi:10.1063/1.4878664.
    """
    if n_images < 0:
        raise ValueError(f"n_images must be >= 0; got {n_images}")
    _check_compatible(reactant, product)
    if n_images == 0:
        return [reactant, product]

    n_atoms = len(_positions_of(reactant))
    # Periodic endpoints use minimum-image pair distances. The reactant's
    # lattice is used; _check_compatible has already enforced a shared cell.
    if isinstance(reactant, PeriodicSystem):
        lattice = np.asarray(reactant.lattice, dtype=float)
        dim = int(reactant.dim)
    else:
        lattice = None
        dim = 3
    dist_reactant = _pair_distance_matrix(_positions_of(reactant), lattice, dim)
    dist_product = _pair_distance_matrix(_positions_of(product), lattice, dim)

    seeds = interpolate_linear(reactant, product, n_images)
    relaxed: List[System] = []
    for k, seed in enumerate(seeds[1:-1], start=1):
        t = k / (n_images + 1)
        target = (1.0 - t) * dist_reactant + t * dist_product
        fit = minimize(
            _idpp_value_and_grad,
            _positions_of(seed).ravel(),
            args=(target, n_atoms, lattice, dim),
            jac=True,
            method="L-BFGS-B",
            options={"maxiter": max_iter, "gtol": tol, "ftol": tol},
        )
        relaxed.append(
            _rebuild_with_positions(reactant, fit.x.reshape(n_atoms, 3))
        )
    return [reactant, *relaxed, product]


# ===========================================================================
# Driver (Increment 2) -- improved-tangent NEB (molecular + periodic).
# ===========================================================================
#
# The driver below implements the textbook formulation of NEB:
#
#   F_total_i = F_spring_∥_i  +  F_true_⊥_i
#
# with the improved-tangent estimator of Henkelman & Jónsson 2000
# replacing the original "central-difference" tangent of Mills/
# Jónsson/Schenter 1995 -- energy ordering of neighbours decides
# whether t_i points uphill or downhill, with a transitional weighted
# mix when the central image is the local extremum (eqs. 8-11 of JCP
# 113, 9978, 2000). Spring forces are projected onto t_i; true
# nuclear gradients are projected *off* t_i. The result is a
# discretised elastic band that relaxes to the minimum energy path.
#
# Outer loop: damped quick-min (also Henkelman+Jónsson). Each image
# carries a velocity; per step it is projected onto F_total before
# advancing -- the projection keeps the band from drifting away from
# the MEP when v has accumulated tangential momentum. L-BFGS-B on
# the concatenated coordinate vector would *also* work, but only if
# the NEB force were a true gradient. It isn't: F_spring_∥ depends on
# the parallel projection of the difference vector, which is not the
# derivative of a scalar potential. Quick-min is the standard choice
# in ASE / VASP / Quantum ESPRESSO for the same reason.
#
# Parallelism: joblib.Parallel over images per outer iteration.
# Endpoints are evaluated once and cached. n_jobs=0 selects a bounded
# auto default; pass n_jobs=-1 for joblib's all-core behavior or
# n_jobs=1 for serial.
#
# Shipped on top of the Increment-2 core below:
#   * Climbing image ("climbing_image=True").
#   * Periodic dispatch (PeriodicSystem endpoints; per-image BIPOLE
#     SCF + finite-difference gradient, k-mesh via "kpoints=").
#   * SCF density warm-start across outer iterations ("warm_start=True",
#     all four methods, molecular + periodic).
#   * DFT+U ("dft_plus_u=[HubbardSite(...)]", molecular + periodic).
#
# References:
#   Henkelman, Jónsson, J. Chem. Phys. 113, 9978 (2000).
#   doi:10.1063/1.1323224.


def _positive_int_env(name: str) -> int | None:
    raw = os.environ.get(name)
    if raw is None or raw.strip() == "":
        return None
    try:
        value = int(raw)
    except ValueError:
        return None
    return value if value > 0 else None


def _resolve_neb_n_jobs(
    n_jobs: int,
    *,
    n_images: int,
    is_periodic: bool,
) -> int:
    """Resolve the process count for per-image NEB evaluation."""
    if n_jobs != 0:
        return n_jobs

    default_cap = 1 if is_periodic else 4
    cap = _positive_int_env("VIBEQC_NEB_MAX_JOBS") or default_cap
    cpu_count = os.cpu_count() or 1
    return max(1, min(n_images, cpu_count, cap))



@dataclass
class NEBResult:
    """Outcome of a :func:`run_neb` run.

    Attributes
    ----------
    path
        The converged (or last-evaluated) :class:`NEBPath` -- endpoints
        plus intermediate images with their final geometry, energy,
        gradient, and tangent.
    energies
        Per-image energy array (Hartree), length ``n_images + 2``.
        Endpoints are at indices 0 and -1.
    converged
        True iff the max-norm of the NEB force fell below
        ``conv_tol_force`` within ``max_iter`` outer iterations.
    transition_state_index
        Index into ``path.images`` of the highest-energy image -- the
        best non-climbing estimate of the saddle. ``None`` if energies
        are unset.
    n_iter
        Number of completed outer iterations.
    max_force
        Final maximum-norm NEB force (Ha/bohr) over intermediate
        images. Compare against ``conv_tol_force`` to gauge how close
        to converged a non-converged run was.
    """

    path: NEBPath
    energies: np.ndarray
    converged: bool
    transition_state_index: Optional[int]
    n_iter: int
    max_force: float
    # Captured at run_neb call time so write_qvf can build the
    # citation surface + manifest provenance without the user
    # having to repeat the SCF flavour. These provenance fields keep
    # their None / default values if NEBResult is constructed by hand
    # (e.g. in tests).
    method: Optional[str] = None
    basis: Optional[str] = None
    functional: Optional[str] = None
    is_periodic: bool = False
    # Truthy => write_qvf fires the ``routes.methods.dft_plus_u``
    # citation route (Dudarev 1998 + Cococcioni-Gironcoli 2005).
    # We store a bool rather than the HubbardSite list because
    # ``write_qvf`` only needs the on/off bit for citation
    # assembly; the actual sites have already done their job in
    # each per-image SCF.
    used_dft_plus_u: bool = False
    # For ``method="mace"``: the per-model foundation-model citation key
    # (e.g. ``batatia_mace_mp_2024``) so write_qvf can fire the MACE
    # references (the method paper via the static ``routes.methods.mace``
    # route, this key as an extra entry). Empty for the SCF methods.
    mace_model_citation: Optional[str] = None

    @staticmethod
    def _reaction_coordinate(
        positions: Sequence[np.ndarray],
        lattice: Optional[np.ndarray] = None,
        dim: int = 3,
    ) -> list[float]:
        """Cumulative arc length over ``positions``, normalised to [0, 1].

        Each step is the Frobenius norm of the displacement between
        consecutive frames. When ``lattice`` is given the displacement is
        first wrapped to its minimum image, so a band whose images
        straddle a cell boundary is measured along the short,
        through-the-boundary path -- the same convention the module uses
        for tangents and IDPP distances. If the total length is zero,
        every entry is ``0.0``.
        """
        arc = [0.0]
        for previous, current in zip(positions, positions[1:]):
            step = current - previous
            if lattice is not None:
                step = _minimum_image_diff(step, lattice, dim)
            arc.append(arc[-1] + float(np.linalg.norm(step)))
        total = arc[-1]
        if total > 0.0:
            return [a / total for a in arc]
        return [0.0] * len(positions)

    def write_qvf(
        self,
        stem: Any,
        *,
        compression: Optional[int] = None,
    ) -> Any:
        """Emit a vibe-view ``reaction.path`` QVF archive.

        Assembles the waypoints (reactant, product and, when one is
        identified, the transition state), a normalised reaction
        coordinate and the citation flags from this result, then
        delegates to
        :func:`vibeqc.output.formats.qvf.write_reaction_path_qvf`.
        Citations are assembled with ``uses_neb=True`` -- plus
        ``uses_ci_neb=True`` when this result came from a climbing-image
        run.

        For periodic NEB results the archive ships as QVF v2 -- the
        ``reaction.path`` section additionally carries the per-frame
        lattice + dim (see ``docs/user_guide/vibe_view.md`` Sec.
        "Periodic reaction paths"). The writer detects periodic
        frames automatically.

        Parameters
        ----------
        stem
            Path stem; the writer appends ``.qvf``.
        compression
            Optional ``zipfile`` compression constant; if ``None``
            the writer uses its default (``ZIP_DEFLATED`` or
            ``ZIP_ZSTANDARD`` when ``zipfile-zstd`` is installed).

        Returns
        -------
        pathlib.Path
            The written archive path.
        """
        from .output.formats.qvf import write_reaction_path_qvf

        frames = [img.system for img in self.path.images]
        n_total = len(frames)

        # Waypoints: reactant, product, TS (climbing image when
        # available, else the highest-energy intermediate).
        waypoints: list[dict[str, Any]] = [
            {
                "frame_index": 0,
                "label": "reactant",
                "kind": "reactant",
                "energy_eh": float(self.energies[0]),
            },
            {
                "frame_index": n_total - 1,
                "label": "product",
                "kind": "product",
                "energy_eh": float(self.energies[-1]),
            },
        ]
        ts_idx = (
            self.path.climbing_image_index
            if self.path.climbing_image_index is not None
            else self.transition_state_index
        )
        # Only an interior frame is reported as the TS; an endpoint index
        # would duplicate the reactant / product waypoint.
        if ts_idx is not None and 0 < ts_idx < n_total - 1:
            waypoints.append(
                {
                    "frame_index": int(ts_idx),
                    "label": "TS",
                    "kind": "transition_state",
                    "energy_eh": float(self.energies[ts_idx]),
                }
            )

        # Reaction coordinate: cumulative arc length over frame
        # geometries, normalised to [0, 1]. Periodic frames are measured
        # with minimum-image steps so a hop across the cell boundary is
        # not counted as a cell-length jump.
        positions = [_positions_of(frame) for frame in frames]
        if frames and isinstance(frames[0], PeriodicSystem):
            lattice = np.asarray(frames[0].lattice, dtype=float)
            dim = int(frames[0].dim)
        else:
            lattice = None
            dim = 3
        rc = self._reaction_coordinate(positions, lattice, dim)

        is_mace = (self.method or "").lower() == "mace"
        is_msindo = (self.method or "").lower() == "msindo"
        extra: dict[str, Any] = {
            "uses_neb": True,
            "uses_ci_neb": self.path.climbing_image_index is not None,
            "dft_plus_u": bool(self.used_dft_plus_u),
        }
        if is_mace:
            # MACE evaluates no Gaussian integrals and runs no SCF -- suppress
            # the always-on libint + DIIS routes so the references reflect
            # what ran. The MACE *method* paper fires via routes.methods.mace
            # (method="mace"); the per-model foundation paper is an extra.
            extra["uses_integrals"] = False
            extra["uses_scf"] = False
            if self.mace_model_citation:
                extra["extra_entries"] = [self.mace_model_citation]
        elif is_msindo:
            # MSINDO is INDO over Slater orbitals -- it evaluates NO Gaussian
            # integrals (no libint; CLAUDE.md Sec.10), so suppress the always-on
            # libint route. It *does* run a Pulay-DIIS SCF, so ``uses_scf``
            # stays on (the DIIS route is honest here). The MSINDO method paper
            # (Ahlswede & Jug 1999, Parts I + II) fires via routes.methods.msindo
            # (method="msindo").
            extra["uses_integrals"] = False

        return write_reaction_path_qvf(
            stem,
            frames=frames,
            energies=[float(e) for e in self.energies],
            waypoints=waypoints,
            reaction_coordinate=rc,
            method=(self.method or "RHF").upper(),
            # MACE / MSINDO have no Gaussian basis; the name is a non-citing
            # placeholder (the SCF fallback "sto-3g" would wrongly cite STO-3G).
            basis=(
                "mace" if is_mace
                else "msindo" if is_msindo
                else (self.basis or "sto-3g")
            ),
            # No XC functional for HF-like INDO (MSINDO) or the MLIP (MACE);
            # the run_neb default functional="pbe" must not leak into the
            # references.
            functional=None if (is_mace or is_msindo) else self.functional,
            extra_assemble_kwargs=extra,
            compression=compression,
        )




# --- improved tangent (Henkelman + Jónsson 2000) ---------------------------


def _improved_tangent(
    R_prev: np.ndarray,
    R_curr: np.ndarray,
    R_next: np.ndarray,
    E_prev: float,
    E_curr: float,
    E_next: float,
    lattice: Optional[np.ndarray] = None,
    dim: int = 3,
) -> np.ndarray:
    """Improved tangent t_i per Henkelman+Jónsson 2000 eq. 8-11.

    Returns a unit-norm Cartesian vector with the same shape as the
    input positions. Geometry-only fallback (no energy data) is the
    central-difference tangent ``(R_next - R_curr) + (R_curr - R_prev)``.
    For periodic systems (``lattice`` given) the two inter-image
    half-steps use the minimum-image convention, so a band whose images
    straddle a cell boundary still gets a short, sensible tangent
    instead of one that points the long way across the cell.
    """
    tau_plus = R_next - R_curr
    tau_minus = R_curr - R_prev
    if lattice is not None:
        tau_plus = _minimum_image_diff(tau_plus, lattice, dim)
        tau_minus = _minimum_image_diff(tau_minus, lattice, dim)
    if E_next > E_curr > E_prev:
        tau = tau_plus
    elif E_next < E_curr < E_prev:
        tau = tau_minus
    else:
        dE_max = max(abs(E_next - E_curr), abs(E_prev - E_curr))
        dE_min = min(abs(E_next - E_curr), abs(E_prev - E_curr))
        if E_next > E_prev:
            tau = tau_plus * dE_max + tau_minus * dE_min
        else:
            tau = tau_plus * dE_min + tau_minus * dE_max
    norm = float(np.linalg.norm(tau))
    if norm < 1e-12:
        # Degenerate band (all three images coincident); fall back to
        # the central-difference tangent (minimum-image half-steps, so
        # equal to R_next - R_prev for the molecular case). If that's
        # also zero, return zero -- the outer loop is at a fixed point.
        tau = tau_plus + tau_minus
        norm = float(np.linalg.norm(tau))
        if norm < 1e-12:
            return np.zeros_like(tau)
    return tau / norm


# --- per-image SCF + gradient (the worker called inside joblib) ------------


def _nuclear_repulsion_molecular(mol: Molecule) -> float:
    """Sum Z_i Z_j / r_ij for a molecule (bohr in, Ha out).

    The high-level SCF entry points compute this internally; the
    low-level ``run_*_scf_with_jk`` path wants ``E_nuc`` as a scalar
    so the NEB driver's warm-start helper computes it here.
    """
    atoms = list(mol.atoms)
    n = len(atoms)
    e = 0.0
    for i in range(n):
        zi = int(atoms[i].Z)
        xi = np.asarray(atoms[i].xyz, dtype=float)
        for j in range(i + 1, n):
            zj = int(atoms[j].Z)
            xj = np.asarray(atoms[j].xyz, dtype=float)
            r = float(np.linalg.norm(xi - xj))
            if r > 0:
                e += zi * zj / r
    return e


def _build_scf_common_pieces(
    mol: Molecule,
    basis: Any,
) -> tuple[np.ndarray, np.ndarray, float, Any]:
    """Assemble the ingredients every warm-start SCF call needs.

    All four methods (RHF / UHF / RKS / UKS) require the same pieces
    before their low-level ``run_*_scf_with_jk`` entry point can be
    called.

    Returns
    -------
    tuple
        ``(S, Hcore, E_nuc, jk)``: the overlap matrix, the core
        Hamiltonian (kinetic + nuclear attraction), the nuclear
        repulsion energy and a direct J/K builder for ``basis``.
    """
    from ._vibeqc_core import (
        compute_kinetic,
        compute_nuclear,
        compute_overlap,
        make_direct_jk_builder,
    )

    overlap = compute_overlap(basis)
    core_hamiltonian = compute_kinetic(basis) + compute_nuclear(basis, mol)
    nuclear_repulsion = _nuclear_repulsion_molecular(mol)
    jk_builder = make_direct_jk_builder(basis)
    return overlap, core_hamiltonian, nuclear_repulsion, jk_builder


def _empty_density() -> np.ndarray:
    return np.zeros((0, 0), dtype=np.float64)


def _sad_cold_start_closed(mol: Molecule, basis: Any) -> np.ndarray:
    """Closed-shell SAD starting density for a cold-start NEB image (F3).

    When given an empty density, the low-level ``run_*_scf_with_jk``
    entry points fall back to an Hcore guess. That guess needs far more
    SCF iterations than the public ``run_rhf`` / ``run_rks`` driver
    (AUTO->SAD) and may fail at the default ``max_iter`` where a single
    point would converge. Seeding the cold start with SAD lets the first
    outer iteration of the band converge like the equivalent single
    point. Should building the SAD density raise, the empty matrix (Hcore
    guess) is returned, which keeps the previous behaviour as a worst
    case.
    """
    from ._vibeqc_core import sad_density

    try:
        seed = np.asarray(sad_density(mol, basis), dtype=np.float64)
    except Exception:
        # Deliberate best-effort: any failure degrades to the Hcore guess.
        seed = _empty_density()
    return seed


def _sad_cold_start_open(
    mol: Molecule, basis: Any, n_alpha: int, n_beta: int
) -> tuple[np.ndarray, np.ndarray]:
    """Per-spin SAD starting densities for an open-shell cold-start NEB
    image (F3).

    The motivation is the same as for :func:`_sad_cold_start_closed`.
    Returns ``(alpha, beta)`` float64 matrices; if the guess comes back
    as ``None`` or building it raises, both entries are empty matrices
    (i.e. the Hcore fallback for each spin).
    """
    from ._vibeqc_core import InitialGuess, _guess_open_shell_density

    spin_densities = None
    try:
        guess = _guess_open_shell_density(
            mol, basis, n_alpha, n_beta, InitialGuess.SAD, False
        )
        if guess is not None:
            alpha, beta = guess
            spin_densities = (
                np.asarray(alpha, dtype=np.float64),
                np.asarray(beta, dtype=np.float64),
            )
    except Exception:
        # Deliberate best-effort: any failure degrades to the Hcore guess.
        spin_densities = None
    if spin_densities is None:
        return _empty_density(), _empty_density()
    return spin_densities


def _run_rhf_warm_start(
    mol: Molecule,
    basis: Any,
    options: Any,
    initial_density: Optional[np.ndarray],
) -> Any:
    """Run an RHF SCF through the low-level ``run_rhf_scf_with_jk`` entry.

    :func:`_evaluate_image` uses this to seed the SCF from the density the
    same image converged to on the previous outer iteration (within-image
    warm-start). The high-level ``run_rhf`` doesn't expose
    ``initial_density``, so the NEB driver goes through the lower-level
    entry that does.

    ``options`` defaults to a fresh ``RHFOptions()`` when ``None``. When
    ``initial_density`` is ``None`` the SCF is seeded from
    :func:`_sad_cold_start_closed` (SAD, with the empty-matrix Hcore guess
    as its fallback). The extra cost relative to a plain ``run_rhf`` is the
    Python-side construction of S / Hcore / E_nuc / JK done here.
    """
    from ._vibeqc_core import RHFOptions, run_rhf_scf_with_jk

    if options is None:
        options = RHFOptions()
    overlap, h_core, e_nuc, jk_builder = _build_scf_common_pieces(mol, basis)

    if initial_density is None:
        start_density = _sad_cold_start_closed(mol, basis)
    else:
        start_density = initial_density

    n_electrons = mol.n_electrons()
    return run_rhf_scf_with_jk(
        basis,
        n_electrons,
        overlap,
        h_core,
        e_nuc,
        jk_builder,
        options,
        initial_density=start_density,
    )


def _run_uhf_warm_start(
    mol: Molecule,
    basis: Any,
    options: Any,
    initial_density: Optional[tuple[np.ndarray, np.ndarray]],
) -> Any:
    """Run a UHF SCF through ``run_uhf_scf_with_jk``.

    The warm-start cache for open-shell methods is an ``(alpha, beta)``
    tuple of per-spin density matrices. When ``initial_density`` is
    ``None`` the pair comes from :func:`_sad_cold_start_open`. ``options``
    defaults to a fresh ``UHFOptions()`` when ``None``.
    """
    from ._vibeqc_core import UHFOptions, run_uhf_scf_with_jk

    if options is None:
        options = UHFOptions()
    overlap, h_core, e_nuc, jk_builder = _build_scf_common_pieces(mol, basis)

    # Standard alpha/beta partition: n_alpha = (N + 2S) / 2 with
    # 2S = multiplicity - 1, and n_beta takes the remainder. Matches the
    # molecular runner's convention.
    n_electrons = mol.n_electrons()
    unpaired = mol.multiplicity - 1
    n_alpha = (n_electrons + unpaired) // 2
    n_beta = n_electrons - n_alpha

    if initial_density is None:
        start_alpha, start_beta = _sad_cold_start_open(
            mol, basis, n_alpha, n_beta
        )
    else:
        start_alpha, start_beta = initial_density

    return run_uhf_scf_with_jk(
        basis,
        n_alpha,
        n_beta,
        overlap,
        h_core,
        e_nuc,
        jk_builder,
        options,
        init_alpha=start_alpha,
        init_beta=start_beta,
    )


def _resolve_xc_grid(
    mol: Molecule,
    grid_options: Any,
) -> Any:
    """Build the XC integration grid for a KS-DFT calculation on ``mol``.

    ``grid_options`` is forwarded to ``build_grid``; ``None`` means a
    default-constructed ``GridOptions()``.
    """
    from ._vibeqc_core import GridOptions, build_grid

    if grid_options is None:
        grid_options = GridOptions()
    return build_grid(mol, grid_options)


def _run_rks_warm_start(
    mol: Molecule,
    basis: Any,
    options: Any,
    functional: Optional[str],
    grid_options: Any,
    initial_density: Optional[np.ndarray],
) -> Any:
    """Run an RKS SCF through ``run_rks_scf_with_jk``.

    Uses the same closed-shell density convention as the RHF path and
    additionally builds an XC integration grid. ``options`` defaults to a
    fresh ``RKSOptions()``; ``functional`` is written onto the options only
    when the options don't already name one. When ``initial_density`` is
    ``None`` the SCF is seeded from :func:`_sad_cold_start_closed`.
    """
    from ._vibeqc_core import RKSOptions, run_rks_scf_with_jk

    if options is None:
        options = RKSOptions()
    # NOTE: when the caller supplied ``options``, this assignment lands on
    # the caller's object (no copy is made).
    wants_functional = functional is not None and not options.functional
    if wants_functional:
        options.functional = functional

    overlap, h_core, e_nuc, jk_builder = _build_scf_common_pieces(mol, basis)
    xc_grid = _resolve_xc_grid(mol, grid_options)

    if initial_density is None:
        start_density = _sad_cold_start_closed(mol, basis)
    else:
        start_density = initial_density

    n_electrons = mol.n_electrons()
    return run_rks_scf_with_jk(
        basis,
        n_electrons,
        overlap,
        h_core,
        e_nuc,
        jk_builder,
        xc_grid,
        options,
        initial_density=start_density,
    )


def _run_uks_warm_start(
    mol: Molecule,
    basis: Any,
    options: Any,
    functional: Optional[str],
    grid_options: Any,
    initial_density: Optional[tuple[np.ndarray, np.ndarray]],
) -> Any:
    """Run a UKS SCF through ``run_uks_scf_with_jk``.

    Open-shell ``(alpha, beta)`` densities plus an XC grid. ``options``
    defaults to a fresh ``UKSOptions()``; ``functional`` is written onto the
    options only when the options don't already name one. When
    ``initial_density`` is ``None`` the per-spin seed comes from
    :func:`_sad_cold_start_open`.
    """
    from ._vibeqc_core import UKSOptions, run_uks_scf_with_jk

    if options is None:
        options = UKSOptions()
    # NOTE: when the caller supplied ``options``, this assignment lands on
    # the caller's object (no copy is made).
    wants_functional = functional is not None and not options.functional
    if wants_functional:
        options.functional = functional

    overlap, h_core, e_nuc, jk_builder = _build_scf_common_pieces(mol, basis)
    xc_grid = _resolve_xc_grid(mol, grid_options)

    # Alpha/beta partition: n_alpha = (N + multiplicity - 1) // 2, and
    # n_beta takes the remainder.
    n_electrons = mol.n_electrons()
    unpaired = mol.multiplicity - 1
    n_alpha = (n_electrons + unpaired) // 2
    n_beta = n_electrons - n_alpha

    if initial_density is None:
        start_alpha, start_beta = _sad_cold_start_open(
            mol, basis, n_alpha, n_beta
        )
    else:
        start_alpha, start_beta = initial_density

    return run_uks_scf_with_jk(
        basis,
        n_alpha,
        n_beta,
        overlap,
        h_core,
        e_nuc,
        jk_builder,
        xc_grid,
        options,
        init_alpha=start_alpha,
        init_beta=start_beta,
    )


def _evaluate_image(
    positions: np.ndarray,
    template: Molecule,
    basis_name: str,
    method: str,
    *,
    functional: Optional[str],
    rhf_options: Any,
    uhf_options: Any,
    rks_options: Any,
    uks_options: Any,
    gradient_options: Any,
    grid_options: Any,
    dispersion_params: Any,
    initial_density: Optional[np.ndarray] = None,
    image_index: Optional[int] = None,
) -> tuple[float, np.ndarray, Optional[np.ndarray]]:
    """Run SCF + gradient for one molecular NEB image.

    The molecule is rebuilt from ``positions`` (bohr) with ``template``
    supplying atomic numbers, charge and multiplicity, and the basis set is
    rebuilt for that geometry -- vibe-qc's BasisSet is bound to the nuclei
    it was constructed with.

    Returns ``(energy, gradient, converged_density)``: energy in Ha,
    gradient in Ha/bohr with shape ``(n_atoms, 3)``. ``converged_density``
    is what the outer loop caches per image for the next iteration's
    warm-start:

    * a single density matrix for RHF / RKS,
    * an ``(alpha, beta)`` tuple of per-spin matrices for UHF / UKS,
    * ``None`` for any other method (routed through
      ``_run_molecular_scf``, which takes no initial density here).

    When ``initial_density`` is given, the RHF/UHF/RKS/UKS SCF starts from
    it (same shape convention as above) instead of the cold-start guess.

    Raises the image-named error from ``_nonconverged_image_error`` when
    the SCF result reports ``converged == False``. When
    ``dispersion_params`` is given, the D3-BJ energy is added to the
    returned energy.
    """
    from ._vibeqc_core import BasisSet
    from .molecular_optimize import _compute_molecular_gradient, _run_molecular_scf

    mol = _rebuild_with_positions(template, positions)
    basis = BasisSet(mol, basis_name)
    method_lower = method.lower()

    def _snapshot(matrix: Any) -> np.ndarray:
        # Detached float64 copy, so the cache doesn't alias the SCF result.
        return np.asarray(matrix, dtype=np.float64).copy()

    converged_density: Optional[Any] = None
    if method_lower in ("rhf", "rks"):
        # Closed shell: one density matrix.
        if method_lower == "rhf":
            scf_result = _run_rhf_warm_start(
                mol, basis, rhf_options, initial_density
            )
        else:
            scf_result = _run_rks_warm_start(
                mol, basis, rks_options, functional, grid_options, initial_density
            )
        energy = float(scf_result.energy)
        converged_density = _snapshot(scf_result.density)
    elif method_lower in ("uhf", "uks"):
        # Open shell: per-spin (alpha, beta) pair.
        if method_lower == "uhf":
            scf_result = _run_uhf_warm_start(
                mol, basis, uhf_options, initial_density
            )
        else:
            scf_result = _run_uks_warm_start(
                mol, basis, uks_options, functional, grid_options, initial_density
            )
        energy = float(scf_result.energy)
        converged_density = (
            _snapshot(scf_result.density_alpha),
            _snapshot(scf_result.density_beta),
        )
    else:
        # Anything else goes through the generic molecular SCF dispatcher;
        # no warm-start density is fed in or carried back.
        energy, scf_result = _run_molecular_scf(
            mol,
            basis,
            method_lower,
            functional=functional,
            rhf_options=rhf_options,
            uhf_options=uhf_options,
            rks_options=rks_options,
            uks_options=uks_options,
        )

    # A non-converged image has no valid gradient -- the C++ gradient
    # builders reject a non-converged density with a cryptic RuntimeError.
    # Raise a clear, image-named NEB error instead (F2). A result without
    # a ``converged`` attribute is treated as converged.
    scf_converged = getattr(scf_result, "converged", True)
    if not scf_converged:
        raise _nonconverged_image_error(method, scf_result, image_index, positions)

    grad = _compute_molecular_gradient(
        mol,
        basis,
        scf_result,
        method_lower,
        gradient_options=gradient_options,
        grid_options=grid_options,
        dispersion_params=dispersion_params,
    )

    if dispersion_params is not None:
        from .dispersion import compute_d3bj

        dispersion = compute_d3bj(mol, dispersion_params)
        energy = energy + float(dispersion.energy)

    return float(energy), np.asarray(grad, dtype=float), converged_density


def _evaluate_image_periodic(
    positions: np.ndarray,
    template: PeriodicSystem,
    basis_name: str,
    method: str,
    *,
    kmesh: Any,
    functional: Optional[str],
    rhf_options: Any,
    uhf_options: Any,
    rks_options: Any,
    uks_options: Any,
    fd_step_bohr: float,
    initial_density: Optional[np.ndarray] = None,
    dft_plus_u: Optional[Sequence[Any]] = None,
    image_index: Optional[int] = None,
) -> tuple[float, np.ndarray, Optional[np.ndarray]]:
    """Run a periodic BIPOLE SCF + finite-difference gradient.

    Returns ``(energy, gradient, reference_density)``.

    * ``energy`` -- the reference SCF energy at ``positions``.
    * ``gradient`` -- shape (n_atoms, 3), Ha/bohr, in Cartesian
      coordinates -- *not* fractional, even though the underlying SCF is
      BIPOLE. The NEB outer loop works in Cartesian space throughout
      (positions, tangents, spring distances).
    * ``reference_density`` -- the reference SCF's converged density,
      copied, for the outer loop's per-image warm-start cache: a list of
      cell blocks for RHF / RKS, or an ``(alpha_blocks, beta_blocks)``
      tuple for UHF / UKS.

    The gradient is computed by central-differencing the SCF energy
    along each Cartesian degree of freedom. This is the path the
    implementation keeps until the J^LR reciprocal-Ewald contribution
    lands in the analytic BIPOLE gradient (see
    ``python/vibeqc/bipole_gradient.py`` and ``docs/user_guide/neb.md``).
    Cost: 6N + 1 BIPOLE SCFs per image per outer iteration. Correctness
    is exact in the limit ``fd_step_bohr -> 0``.

    Raises the image-named error from ``_nonconverged_image_error`` when
    the reference SCF reports ``converged == False``, and ``KeyError``
    when ``method`` is not one of RHF / UHF / RKS / UKS
    (case-insensitive).
    """
    from ._vibeqc_core import BasisSet
    from .bipole_optimize import _run_scf

    sys_current = _rebuild_with_positions(template, positions)
    basis_current = BasisSet(sys_current.unit_cell_molecule(), basis_name)
    method_upper = method.upper()
    opts = {
        "rhf": rhf_options,
        "uhf": uhf_options,
        "rks": rks_options,
        "uks": uks_options,
    }[method.lower()]

    # Periodic SCF density warm-start. The outer-loop cache feeds in
    # the previous outer iter's converged density at this geometry's
    # image, and the 6N FD-displaced SCFs below additionally
    # warm-start from the reference SCF's converged density
    # (within-image FD speedup). Density shape varies by method:
    #
    #   * RHF / RKS -- single closed-shell density (list of cell
    #     blocks); the SCF driver takes ``initial_density=blocks``.
    #   * UHF / UKS -- open-shell (alpha_blocks, beta_blocks) tuple;
    #     the SCF driver takes ``init_alpha=`` + ``init_beta=``.
    is_closed_shell = method_upper in ("RHF", "RKS")
    is_open_shell = method_upper in ("UHF", "UKS")

    def _warm_kwargs(density: Any) -> dict[str, Any]:
        if density is None:
            return {}
        if is_closed_shell:
            return {"initial_density": density}
        if is_open_shell:
            alpha_blocks, beta_blocks = density
            return {"init_alpha": alpha_blocks, "init_beta": beta_blocks}
        return {}

    # DFT+U kwargs for the BIPOLE drivers. All four periodic BIPOLE
    # entries (``run_pbc_bipole_{rhf,uhf,rks,uks}``) accept
    # ``dft_plus_u=[HubbardSite, ...]`` as of the closed-shell
    # BIPOLE +U landing on v0.9.0 main.
    dft_plus_u_kwargs: dict[str, Any] = (
        {"dft_plus_u": list(dft_plus_u)}
        if dft_plus_u
        else {}
    )

    energy, scf_result = _run_scf(
        sys_current,
        basis_current,
        kmesh,
        opts,
        method_upper,
        functional,
        **_warm_kwargs(initial_density),
        **dft_plus_u_kwargs,
    )
    # Abort on a non-converged reference SCF before the 6N FD-displaced
    # SCFs run against a meaningless density (F2; mirrors the molecular
    # path). Safe no-op if the periodic result lacks a `converged` flag.
    if not getattr(scf_result, "converged", True):
        raise _nonconverged_image_error(method, scf_result, image_index, positions)

    # Reference density: the converged density from the just-run
    # reference SCF, in the shape this method requires. Used to
    # warm-start the FD-displaced SCFs *and* returned to the outer
    # loop's per-image cache for the next iteration.
    reference_density: Optional[Any] = None
    if is_closed_shell:
        reference_density = [
            np.asarray(b, dtype=float).copy()
            for b in scf_result.density.blocks
        ]
    elif is_open_shell:
        reference_density = (
            [np.asarray(b, dtype=float).copy() for b in scf_result.density_alpha.blocks],
            [np.asarray(b, dtype=float).copy() for b in scf_result.density_beta.blocks],
        )

    # The warm-start kwargs are identical for every displaced SCF, so
    # build them once rather than once per Cartesian degree of freedom.
    fd_warm_kwargs = _warm_kwargs(reference_density)

    def _displaced_energy(atom: int, axis: int, shift: float) -> float:
        # SCF energy with one Cartesian coordinate moved by ``shift``
        # bohr. System and basis are rebuilt for the displaced geometry.
        # NOTE: the displaced SCF's result object is discarded, so its
        # convergence flag is not inspected here.
        displaced = positions.copy()
        displaced[atom, axis] += shift
        sys_d = _rebuild_with_positions(template, displaced)
        basis_d = BasisSet(sys_d.unit_cell_molecule(), basis_name)
        e_d, _ = _run_scf(
            sys_d,
            basis_d,
            kmesh,
            opts,
            method_upper,
            functional,
            **fd_warm_kwargs,
            **dft_plus_u_kwargs,
        )
        return e_d

    n_atoms = positions.shape[0]
    grad = np.zeros((n_atoms, 3), dtype=float)
    for a in range(n_atoms):
        for c in range(3):
            # Central difference; "+" is evaluated before "-".
            e_p = _displaced_energy(a, c, fd_step_bohr)
            e_m = _displaced_energy(a, c, -fd_step_bohr)
            grad[a, c] = (e_p - e_m) / (2.0 * fd_step_bohr)
    return float(energy), grad, reference_density


# --- MACE (machine-learned interatomic potential) backend ------------------
#
# MACE provides analytic energy + forces from a pre-trained model, with no
# SCF and no Gaussian basis (CLAUDE.md Sec.10 maintainer-approved external
# pre-trained model -- see ``vibeqc.mlip.mace``). For NEB this is a much
# cheaper per-image evaluation than the SCF path, and -- importantly for the
# periodic case -- it sidesteps the 6N+1 finite-difference SCFs entirely:
# MACE returns the gradient directly.
#
# The model (torch weights) is loaded **once** and the ASE calculator reused
# for every image and outer iteration; constructing a fresh ``MACEModel`` per
# evaluation would reload the model each call. Because the loaded calculator
# is a live torch object, MACE-NEB runs the per-image loop serially
# (``n_jobs=1`` in ``run_neb``) rather than pickling the model across joblib
# worker processes -- each evaluation is a single forward pass, so serial is
# cheap.


def _load_mace_model(
    template: System,
    mlip_options: Any,
    is_periodic: bool,
) -> tuple[Any, np.ndarray, Optional[np.ndarray], str]:
    """Load the MACE model once; return ``(calc, numbers, cell, citation)``.

    * ``calc`` -- the model's reusable ASE calculator (eV / Angstrom).
    * ``numbers`` -- atomic numbers of ``template``.
    * ``cell`` -- ``template.lattice`` as a float array for periodic input
      (3x3, bohr), ``None`` for molecular input.
    * ``citation`` -- the model's ``info.citation`` key for the references
      block (e.g. ``batatia_mace_mp_2024``); the empty string when the
      model exposes no ``info`` or no citation.

    The ASL gate (academic, non-commercial models) fires inside
    :class:`vibeqc.mlip.mace.MACEModel`.
    """
    from .mlip.mace import MACEModel

    if is_periodic:
        cell = np.asarray(template.lattice, dtype=float)
        seed = template.unit_cell_molecule()
    else:
        cell = None
        seed = template

    model = MACEModel(seed, mlip_options, cell=cell)

    # Missing ``info`` / ``citation`` (or a falsy value) collapses to "".
    model_info = getattr(model, "info", None)
    citation = getattr(model_info, "citation", "") or ""

    return model.calculator, _atomic_numbers_of(template), cell, citation


def _evaluate_image_mace(
    positions: np.ndarray,
    *,
    calc: Any,
    numbers: np.ndarray,
    cell: Optional[np.ndarray],
    **_ignored: Any,
) -> tuple[float, np.ndarray, None]:
    """Evaluate energy + analytic gradient for one image with a pre-loaded
    MACE ASE calculator.

    ``positions`` (and ``cell``, when given) are in bohr and converted to
    Angstrom for ASE. ``calc`` is attached to a fresh ``Atoms`` object on
    every call, so the loaded model is reused and only the forward pass
    runs here.

    Returns ``(energy_ha, gradient_ha_bohr, None)``. MACE keeps no SCF
    state, so the third slot (warm-start density) is always ``None``; any
    extra keyword the outer loop passes (e.g. ``initial_density=``) is
    absorbed by ``**_ignored``.
    """
    from ase import Atoms
    from ase.units import Bohr, Hartree

    coords_ang = np.asarray(positions, dtype=float) * Bohr
    atoms = Atoms(
        numbers=[int(z) for z in numbers],
        positions=coords_ang,
    )

    periodic = cell is not None
    if periodic:
        cell_ang = np.asarray(cell, dtype=float) * Bohr
        atoms.set_cell(cell_ang)
        atoms.set_pbc(True)

    atoms.calc = calc

    # ASE energies are eV -> Ha.
    energy_ev = atoms.get_potential_energy()
    energy_ha = float(energy_ev) / Hartree

    # ASE forces are eV/Angstrom; gradient = -force, converted to Ha/bohr.
    forces_ev_ang = np.asarray(atoms.get_forces(), dtype=float)
    grad = -forces_ev_ang * Bohr / Hartree

    return energy_ha, grad, None


# --- MSINDO (semiempirical INDO) backend -----------------------------------
#
# MSINDO is vibe-qc's own Bredow/Geudtner/Jug INDO re-implementation
# (CLAUDE.md Sec.10; ``vibeqc.semiempirical.methods.msindo``). It supplies a
# molecular total energy + a nuclear gradient with no Gaussian basis and no
# libint -- the STO/INDO Fock is built from the parameter tables, and the
# gradient is the central-difference derivative of that energy
# (``msindo_gradient_fd``, oracle-validated to <= 1e-4 Ha/bohr against
# MSINDO's analytic ``CARTOPT ANALY`` gradient). For NEB this is the first
# *semiempirical* image path.
#
# Two ways it differs from the SCF and MACE paths:
#   * No basis / functional / k-mesh (like MACE; unlike the SCF methods).
#   * No reusable live object (unlike MACE's torch calculator):
#     ``run_msindo`` / ``msindo_gradient_fd`` are stateless module-level
#     functions, so the band evaluates *in parallel* across images (joblib
#     processes) exactly like the SCF path -- MSINDO is not forced to the
#     serial ``n_jobs=1`` MACE uses.
#
# Cost note: the FD gradient is 6N ``run_msindo`` SCFs per image per outer
# iteration (central differences over the 3N Cartesian DOF). Keep
# MSINDO-NEB systems small; parallelism across images is the main lever.


def _evaluate_image_msindo(
    positions: np.ndarray,
    *,
    numbers: np.ndarray,
    charge: int,
    multiplicity: int,
    fd_step_bohr: float,
    dispersion_params: Any = None,
    image_index: Optional[int] = None,
    **_ignored: Any,
) -> tuple[float, np.ndarray, None]:
    """Evaluate energy + finite-difference gradient for one image with
    MSINDO.

    ``positions`` are bohr (NEB's working units). The MSINDO engine takes
    Angstrom, so coordinates and the FD half-step ``fd_step_bohr`` are
    converted with MSINDO's own ``ANGSTROM_TO_BOHR`` constant for a
    bit-exact round-trip (the same constant ``run_job`` uses -- runner.py).

    Returns ``(energy_ha, gradient_ha_bohr, None)``. MSINDO carries no SCF
    state across geometries, so the warm-start slot is always ``None`` and
    any extra keyword the outer loop passes (e.g. ``initial_density=``) is
    absorbed by ``**_ignored``.

    The gradient is the central-difference nuclear gradient of the MSINDO
    total energy (``msindo_gradient_fd``), already in Ha/bohr. A
    non-converged MSINDO SCF raises the same image-named error the SCF
    path uses. When ``dispersion_params`` is given, the D3-BJ energy and
    (if present) gradient are added explicitly, because the FD gradient
    only sees the bare MSINDO energy.
    """
    from .semiempirical.methods.msindo import (
        ANGSTROM_TO_BOHR as _A2B,
        msindo_gradient_fd,
        run_msindo,
    )

    atomic_numbers = [int(z) for z in numbers]
    net_charge = int(charge)
    spin_multiplicity = int(multiplicity)
    coords_ang = np.asarray(positions, dtype=float) / _A2B
    fd_step_ang = float(fd_step_bohr) / _A2B

    result = run_msindo(
        atomic_numbers,
        coords_ang,
        charge=net_charge,
        multiplicity=spin_multiplicity,
    )
    # A non-converged SCF has no valid gradient (the FD displacements would
    # difference meaningless energies), so fail with the image-named error
    # before spending the 6N displaced SCFs (F2).
    scf_converged = getattr(result, "converged", True)
    if not scf_converged:
        raise _nonconverged_image_error("msindo", result, image_index, positions)
    energy = float(result.total_energy)

    fd_gradient = msindo_gradient_fd(
        atomic_numbers,
        coords_ang,
        charge=net_charge,
        multiplicity=spin_multiplicity,
        step=fd_step_ang,
    )
    grad = np.asarray(fd_gradient, dtype=float)

    if dispersion_params is None:
        return energy, grad, None

    from .dispersion import compute_d3bj

    # The dispersion helper works on a Molecule, built here from the
    # original bohr coordinates.
    atoms = [
        Atom(int(z), [float(xyz[0]), float(xyz[1]), float(xyz[2])])
        for z, xyz in zip(atomic_numbers, positions)
    ]
    mol = Molecule(atoms, net_charge, spin_multiplicity)
    disp = compute_d3bj(mol, dispersion_params, with_gradient=True)
    energy += float(disp.energy)
    disp_gradient = getattr(disp, "gradient", None)
    if disp_gradient is not None:
        grad = grad + np.asarray(disp_gradient, dtype=float)

    return energy, grad, None


# --- NEB force kernel ------------------------------------------------------


def _neb_forces(
    positions: list[np.ndarray],
    energies: list[float],
    gradients: list[np.ndarray],
    spring_constant: float,
    frozen_mask: Optional[np.ndarray],
    climbing_index: Optional[int] = None,
    lattice: Optional[np.ndarray] = None,
    dim: int = 3,
) -> tuple[list[np.ndarray], list[np.ndarray]]:
    """Assemble the NEB force and tangent for every intermediate image.

    Returns ``(forces, tangents)``, each of length ``len(positions) - 2``
    (endpoints excluded; empty when there are no intermediate images).

    Regular images get ``F = F_spring_parallel + F_true_perp``: the spring
    term is ``k * (|R_next - R_i| - |R_i - R_prev|) * t`` and the true
    force ``-gradE`` has its tangent-parallel part removed.

    ``climbing_index`` is a *global* image index (0 is the reactant,
    ``len(positions) - 1`` the product). That image instead gets the
    climbing-image force ``F_climb = -gradE + 2 (gradE . t) t``
    (Henkelman, Uberuaga, Jonsson 2000): the true force with its
    tangent-parallel component inverted and no spring term, so it moves
    uphill along ``t`` while still relaxing perpendicular to it. ``None``
    means standard NEB on every intermediate image.

    When ``lattice`` is given (periodic systems) the spring distances use
    minimum-image inter-image displacements, and ``lattice`` / ``dim`` are
    forwarded to the tangent estimate, so a band that straddles a cell
    boundary is handled along the short path. For consecutive images
    closer than half a cell the minimum image is the displacement itself.

    ``frozen_mask`` (shape ``(n_atoms,)`` bool, True = frozen) zeroes the
    force rows of frozen atoms before return.
    """
    periodic = lattice is not None
    forces: list[np.ndarray] = []
    tangents: list[np.ndarray] = []

    last_index = len(positions) - 1
    for i in range(1, last_index):
        below, here, above = i - 1, i, i + 1
        tau = _improved_tangent(
            positions[below],
            positions[here],
            positions[above],
            energies[below],
            energies[here],
            energies[above],
            lattice,
            dim,
        )
        true_force = -gradients[here]
        # Signed length of the true force along the tangent.
        parallel = float(np.sum(true_force * tau))

        is_climber = climbing_index is not None and i == climbing_index
        if is_climber:
            # F_climb = -gradE + 2 (gradE . t) t
            #         = true_force - 2 (true_force . t) t
            # i.e. the perpendicular true force plus the *inverted*
            # parallel component; no spring contribution.
            f_total = true_force - 2.0 * parallel * tau
        else:
            to_next = positions[above] - positions[here]
            from_prev = positions[here] - positions[below]
            if periodic:
                to_next = _minimum_image_diff(to_next, lattice, dim)
                from_prev = _minimum_image_diff(from_prev, lattice, dim)
            d_next = float(np.linalg.norm(to_next))
            d_prev = float(np.linalg.norm(from_prev))
            # Spring force acts along the tangent only (signed).
            f_spring = spring_constant * (d_next - d_prev) * tau
            # True force acts perpendicular to the tangent only.
            f_perp = true_force - parallel * tau
            f_total = f_spring + f_perp

        if frozen_mask is not None:
            # Work on a copy before zeroing the frozen rows.
            f_total = f_total.copy()
            f_total[frozen_mask] = 0.0

        forces.append(f_total)
        tangents.append(tau)

    return forces, tangents


# --- public entry point ----------------------------------------------------


def _neb_scf_options(
    method_lower: str,
    *,
    rhf_options: Any = None,
    uhf_options: Any = None,
    rks_options: Any = None,
    uks_options: Any = None,
) -> Any:
    """Pick the options object that belongs to ``method_lower``.

    ``method_lower`` is matched exactly against ``"rhf"``, ``"uhf"``,
    ``"rks"`` and ``"uks"`` (no case folding is done here). Any other
    value returns ``None``.
    """
    options_by_method = {
        "rhf": rhf_options,
        "uhf": uhf_options,
        "rks": rks_options,
        "uks": uks_options,
    }
    return options_by_method.get(method_lower)


def _neb_dry_run_estimate_bytes(
    reactant: System,
    *,
    basis: str | None,
    method_lower: str,
    n_images: int,
    n_jobs: int,
    n_atoms: int,
    is_periodic: bool,
    warm_start: bool,
    rhf_options: Any = None,
    uhf_options: Any = None,
    rks_options: Any = None,
    uks_options: Any = None,
) -> int | None:
    """Best-effort memory estimate (in bytes) for an NEB dry run.

    Returns ``None`` when no estimate applies -- ``basis`` is ``None`` or
    ``method_lower`` is not one of ``"rhf"`` / ``"uhf"`` / ``"rks"`` /
    ``"uks"`` -- and also when *anything* raises while the estimate is
    being computed; the estimate is optional and never propagates an
    error.

    Otherwise a per-image estimate is built on the reactant geometry (the
    unit-cell molecule for a :class:`PeriodicSystem`) and scaled by
    ``estimate_neb_memory``. Periodic bands count ``6 * n_atoms + 1``
    energy evaluations per image (finite-difference gradient), molecular
    bands count one.
    """
    if basis is None:
        return None
    if method_lower not in ("rhf", "uhf", "rks", "uks"):
        return None

    try:
        from .memory import estimate_memory, estimate_neb_memory

        if isinstance(reactant, PeriodicSystem):
            molecule = reactant.unit_cell_molecule()
        else:
            molecule = reactant
        basis_obj = BasisSet(molecule, basis)

        scf_options = _neb_scf_options(
            method_lower,
            rhf_options=rhf_options,
            uhf_options=uhf_options,
            rks_options=rks_options,
            uks_options=uks_options,
        )
        per_image = estimate_memory(
            molecule,
            basis_obj,
            method=method_lower,
            options=scf_options,
        )

        if is_periodic:
            fd_evaluations = 6 * n_atoms + 1
        else:
            fd_evaluations = 1

        band_estimate = estimate_neb_memory(
            per_image,
            n_images=n_images,
            n_jobs=n_jobs,
            n_atoms=n_atoms,
            n_basis=basis_obj.nbasis,
            open_shell=method_lower in ("uhf", "uks"),
            finite_difference_evaluations=fd_evaluations,
            warm_start=warm_start,
        )
        return band_estimate.total_bytes
    except Exception:
        # Deliberate best-effort: a failed estimate must not fail the run.
        return None


def _write_neb_dry_run_manifest(
    *,
    output: str | os.PathLike,
    method: str,
    basis: str | None,
    functional: str | None,
    record_hostname: bool,
    estimate_bytes: int | None,
) -> None:
    """Hand an NEB output plan to ``dry_run_manifest``.

    ``output`` is the output stem; its parent directory is created first
    (including missing ancestors) if it doesn't exist. The plan is tagged
    ``job_kind="neb"`` with an empty file list, and a missing ``basis`` is
    recorded as ``"(none)"``.
    """
    stem = Path(os.fspath(output))
    stem.parent.mkdir(parents=True, exist_ok=True)

    basis_label = basis or "(none)"
    plan = OutputPlan(
        stem=stem,
        job_kind="neb",
        method=method,
        basis=basis_label,
        functional=functional,
        files=(),
    )
    dry_run_manifest(
        plan,
        record_hostname=record_hostname,
        estimate_bytes=estimate_bytes,
    )


def run_neb(
    reactant: System,
    product: System,
    basis: Optional[str] = None,
    n_images: int = 7,
    *,
    method: str = "RKS",
    functional: Optional[str] = "pbe",
    spring_constant: float = 0.1,
    interpolation: str = "idpp",
    max_iter: int = 100,
    conv_tol_force: float = 1e-3,
    freeze_indices: Optional[Sequence[int]] = None,
    dispersion_params: Any = None,
    rhf_options: Any = None,
    uhf_options: Any = None,
    rks_options: Any = None,
    uks_options: Any = None,
    gradient_options: Any = None,
    grid_options: Any = None,
    n_jobs: int = 0,
    initial_step: float = 0.05,
    max_step: float = 0.2,
    progress: bool = False,
    output: Union[str, os.PathLike] = "output",
    dry_run: bool = False,
    record_hostname: bool = True,
    climbing_image: bool = False,
    climbing_image_start_fraction: float = 0.3,
    kpoints: Optional[Any] = None,
    fd_step_bohr: float = 1e-3,
    warm_start: bool = True,
    dft_plus_u: Optional[Sequence[Any]] = None,
    mlip_options: Any = None,
) -> NEBResult | None:
    """Find a minimum-energy path with improved-tangent NEB.

    Parameters
    ----------
    reactant, product
        :class:`Molecule` or :class:`PeriodicSystem` endpoints (both
        the same type). Same atom-number sequence and length; the
        order is preserved through the path.
    basis
        Basis-set name passed to :class:`BasisSet`, rebuilt per image
        per outer iteration. Required for the SCF methods; optional and
        ignored for ``method="mace"`` (a pre-trained model) and
        ``method="msindo"`` (INDO over Slater orbitals) -- neither uses a
        Gaussian basis.
    n_images
        Number of *intermediate* images. The returned path has
        ``n_images + 2`` images (endpoints included). Endpoints are
        fixed by default.
    method
        ``"RHF"`` / ``"UHF"`` / ``"RKS"`` / ``"UKS"`` for self-consistent-
        field per-image energies + gradients; ``"MACE"`` to drive a
        pre-trained MACE machine-learned interatomic potential
        (``mlip_options=``; analytic energy + forces, no SCF / basis /
        k-mesh); or ``"MSINDO"`` for the semiempirical INDO engine
        (``vibeqc.semiempirical.methods.msindo`` -- molecular only, no basis /
        functional / k-mesh; energy + a finite-difference nuclear gradient,
        so each image costs 6N ``run_msindo`` SCFs per outer iteration --
        keep the system small). Case-insensitive. Same dispatch as
        :func:`vibeqc.run_job` and
        :func:`vibeqc.molecular_optimize.optimize_molecule`.
    functional
        XC functional for KS-DFT (e.g. ``"pbe"``, ``"b3lyp"``).
        Ignored for HF methods.
    spring_constant
        ``k`` in Ha/bohr^2 for the spring term
        ``F_spring_i = k (|R_{i+1} - R_i| - |R_i - R_{i-1}|) t_i``.
        0.1 is the canonical default; turn down if the path is
        chemically smooth, up if it kinks.
    interpolation
        ``"idpp"`` (default, recommended for bonded paths) or
        ``"linear"``. Validated up front, including for dry runs.
    max_iter
        Hard cap on outer iterations. Quick-min iterations are cheap
        in this loop's terms (one parallel SCF batch each). If the cap
        is reached without convergence, the step computed in the last
        iteration is not applied, so the returned geometries are the
        ones the reported energies, gradients and ``max_force`` were
        evaluated at.
    conv_tol_force
        Convergence threshold on the max-norm of the NEB force over
        all intermediate atoms (Ha/bohr). 1e-3 ≈ 0.05 eV/Å -- the
        Kolsbjerg 2016 recommendation.
    freeze_indices
        Atom indices to freeze (NEB-local; the SCF + gradient still
        sees them, but their force components are zeroed before the
        step). Useful for slab substrate atoms in surface NEB.
    dispersion_params
        Optional D3-BJ parameters; the dispersion energy and gradient
        are folded into each image's energy + gradient.
    n_jobs
        joblib.Parallel ``n_jobs``. ``0`` (default) selects a bounded
        automatic worker count; ``-1`` explicitly requests all cores;
        ``1`` is serial.
    initial_step, max_step
        Quick-min integrator controls. ``initial_step`` is the starting
        step; it grows by a factor of 1.1 after every outer iteration in
        which all images stayed aligned with their force (capped at
        ``10 * initial_step``) and is reset to ``initial_step`` as soon
        as any image's velocity opposes its force. ``max_step`` (bohr)
        caps the largest Cartesian displacement component an image may
        take in one outer iteration; an image whose step had to be
        capped has its velocity zeroed.
    progress
        If True, prints one line per outer iteration summarising
        max-force, total path length, and TS-image energy.
    output
        Output stem used by dry-run preflight. ``run_neb`` does not yet write
        a live NEB output bundle, but ``dry_run=True`` (or
        ``VIBEQC_DRY_RUN=1``) writes ``{output}.system`` so ``vq submit auto``
        can read the planned job kind and optional memory estimate.
    dry_run
        If True, write the dry-run manifest and return ``None`` without
        interpolation, endpoint evaluation, or any per-image SCF. Setting
        ``VIBEQC_DRY_RUN=1`` has the same effect; setting
        ``VIBEQC_DRY_RUN_ESTIMATE=1`` additionally records
        ``[memory].estimate_bytes`` when the Gaussian SCF route is estimable.
        Argument validation still runs, so a dry run rejects the same
        invalid arguments the real run would.
    climbing_image
        Enable climbing-image NEB (CI-NEB, Henkelman+Uberuaga+Jónsson
        2000). After a warm-up phase the highest-energy intermediate
        image is promoted to "climbing": its spring contribution is
        dropped and the tangent-parallel component of its true force
        is inverted, so it climbs uphill along t to the saddle while
        still relaxing perpendicular. Other images keep standard NEB
        dynamics. Default ``False`` (plain improved-tangent NEB).
    climbing_image_start_fraction
        Fraction of ``max_iter`` to spend in the plain-NEB warm-up
        phase before promoting an image to climbing. Default 0.3 --
        the band has typically found its rough shape by this point,
        so the identity of the highest-energy image is reliable.
        Must lie in ``[0, 1]`` (checked even when
        ``climbing_image=False``, where the value is otherwise unused).
    kpoints
        Periodic-only. Either a ``BlochKMesh`` or ``KPoints`` object
        (materialized and passed through to ``run_pbc_bipole_*``), or
        a 3-tuple of ints (the Monkhorst-Pack mesh sizes; converted with
        :func:`vibeqc.monkhorst_pack`). Ignored when
        ``reactant``/``product`` are ``Molecule``. ``None`` =>
        Γ-only mesh (``(1, 1, 1)``) for a sanity-check periodic
        run; pick a real k-mesh for production.
    fd_step_bohr
        Half-step for the central-difference per-image gradient on the
        finite-difference paths -- periodic SCF and ``method="msindo"``.
        Periodic: the J^LR reciprocal contribution is still missing from
        the analytic BIPOLE gradient, so the periodic NEB driver uses the
        FD fallback to keep things bit-exact (cost per image 6N + 1 BIPOLE
        SCFs; the NEB user guide tracks the analytic-gradient
        switch). MSINDO: the engine exposes only the FD gradient today
        (``msindo_gradient_fd``; cost 6N ``run_msindo`` SCFs per image).
        Converted to the MSINDO engine's Angstrom internally. Default
        1e-3 bohr -- same value the BIPOLE-FD gradient unit tests use.
        Ignored for the molecular SCF methods (analytic gradient) and
        ``method="mace"`` (analytic forces).
    warm_start
        Within-image density warm-start across outer iterations
        (the NEB warm-start milestone -- the 2-4x SCF-cost reduction
        this path was built to capture). When True (default) the
        converged density from outer iter N is fed in as the SCF
        initial guess at iter N+1 for the same image -- the geometry
        change between outer iters is small relative to a SAD/Hcore
        guess, so the SCF converges in fewer iterations. Active for
        all four SCF methods (RHF / UHF / RKS / UKS), both molecular
        and periodic (BIPOLE); periodic additionally warm-starts the
        6N FD-displaced SCFs from the reference SCF's converged
        density. Bit-exact vs. cold-start in every case. Pass
        ``False`` to force every SCF to cold-start (useful for
        benchmarking the speedup).
    dft_plus_u
        Optional iterable of :class:`vibeqc.HubbardSite` objects.
        Each entry adds the Dudarev rotationally-invariant per-spin
        potential ``V_U^A = U_eff (1/2 d - n^A_l)`` on that
        ``(atom_index, l)`` channel for every per-image SCF in the
        NEB run. The Hubbard energy ``E_U`` contributes to each
        image's ``e_dft_plus_u`` (and to ``result.energy`` through
        the standard +U bookkeeping); the NEB driver picks this up
        uniformly through the SCF dispatch. Supported for all four
        methods, molecular and periodic -- the molecular leg applies
        +U Options-side via ``_apply_dft_plus_u_to_options``; the
        periodic leg forwards ``dft_plus_u=`` straight to
        ``run_pbc_bipole_{rhf,uhf,rks,uks}``. See the small-cell
        +U projector caveat in ``docs/user_guide/neb.md`` Sec. DFT+U.
        Rejected with ``method="mace"`` and ``method="msindo"`` (the
        correction is SCF-only).
    mlip_options
        Used only when ``method="mace"``: a
        :class:`vibeqc.mlip.MLIPOptions` selecting the MACE foundation
        model, device, dtype, and (for academic-only ASL models) the
        license acknowledgment. ``None`` => the MIT MACE-MPA-0 default.
        The model is loaded once and its calculator reused for every
        image, so MACE-NEB runs serially (``n_jobs`` is forced to 1).
        MACE returns analytic forces, so the periodic FD-gradient path
        is bypassed entirely. Requires the optional ``[mace]`` extra
        (PyTorch + e3nn; Python <= 3.13) -- see :mod:`vibeqc.mlip.mace`.

    Returns
    -------
    :class:`NEBResult`, or ``None`` for dry-run preflight.

    Raises
    ------
    ValueError
        Periodic reactant with a non-periodic product; ``n_images < 1``;
        unsupported ``method``; missing ``basis`` for an SCF method;
        ``dft_plus_u`` combined with ``method="mace"`` / ``"msindo"``;
        ``freeze_indices`` out of range; ``interpolation`` not
        ``"idpp"`` / ``"linear"``; ``climbing_image_start_fraction``
        outside ``[0, 1]``.
    NotImplementedError
        Endpoints that are neither :class:`Molecule` nor
        :class:`PeriodicSystem`; ``method="msindo"`` with periodic
        endpoints.

    Notes
    -----
    The outer loop is the quick-min (damped MD) integrator of
    Henkelman+Jónsson 2000. Each intermediate image carries a
    velocity; at each step the velocity is projected onto the NEB
    force, zeroed if the projection is negative (preventing climbing
    against the force). The step grows by a factor of 1.1 when every
    image is aligned (capped at ``10 * initial_step``) and is reset to
    ``initial_step`` on direction flips; the per-image displacement is
    additionally capped at ``max_step``. This is the textbook
    "FIRE-lite" cousin used by ASE's MDMin.
    """
    is_periodic = isinstance(reactant, PeriodicSystem)
    if is_periodic and not isinstance(product, PeriodicSystem):
        raise ValueError(
            "run_neb: reactant and product must be the same system "
            "type -- got PeriodicSystem and Molecule."
        )
    if not is_periodic and not isinstance(reactant, Molecule):
        raise NotImplementedError(
            "run_neb: only Molecule and PeriodicSystem endpoints are "
            "supported."
        )
    if n_images < 1:
        raise ValueError(f"n_images must be >= 1; got {n_images}")
    _check_compatible(reactant, product)

    # Lattice + periodic dimensionality for minimum-image inter-image
    # displacements in the NEB force (tangent + spring). None for the
    # molecular case (plain Cartesian); the shared cell is enforced by
    # _check_compatible.
    neb_lattice = (
        np.asarray(reactant.lattice, dtype=float) if is_periodic else None
    )
    neb_dim = int(reactant.dim) if is_periodic else 3

    method_lower = method.lower()
    if method_lower not in ("rhf", "uhf", "rks", "uks", "mace", "msindo"):
        raise ValueError(
            f"run_neb: unsupported method {method!r}. "
            "Use one of 'RHF', 'UHF', 'RKS', 'UKS', 'MACE', 'MSINDO'."
        )
    is_mace = method_lower == "mace"
    is_msindo = method_lower == "msindo"
    n_jobs = _resolve_neb_n_jobs(
        n_jobs,
        n_images=n_images,
        is_periodic=is_periodic,
    )
    # Methods that need a Gaussian basis. MACE (pre-trained model) and MSINDO
    # (INDO over Slater orbitals -- no Gaussian integrals) do not.
    needs_basis = method_lower in ("rhf", "uhf", "rks", "uks")
    if needs_basis and basis is None:
        raise ValueError(
            f"run_neb: a basis set is required for method={method!r}. "
            "(basis is only optional for method='mace' / 'msindo', which use "
            "a pre-trained model / Slater-orbital INDO rather than a Gaussian "
            "basis.)"
        )
    if is_msindo and is_periodic:
        raise NotImplementedError(
            "run_neb: method='msindo' is molecular-only. Periodic MSINDO uses "
            "the Cyclic Cluster Model "
            "(vibeqc.semiempirical.methods.msindo_ccm), which is a standalone "
            "PeriodicSystem API and is not wired into run_neb."
        )
    if (is_mace or is_msindo) and dft_plus_u:
        raise ValueError(
            f"run_neb: dft_plus_u is not supported with method={method!r} "
            "(the Hubbard +U correction applies to the Gaussian SCF/KS methods "
            "RHF / UHF / RKS / UKS only)."
        )

    n_atoms = len(_positions_of(reactant))
    frozen_mask: Optional[np.ndarray] = None
    if freeze_indices is not None:
        fi = {int(i) for i in freeze_indices}
        bad = [i for i in fi if i < 0 or i >= n_atoms]
        if bad:
            raise ValueError(
                f"run_neb: freeze_indices {bad} out of range "
                f"[0, {n_atoms})"
            )
        frozen_mask = np.zeros(n_atoms, dtype=bool)
        for i in fi:
            frozen_mask[i] = True

    # Validate the remaining cheap-to-check arguments *before* any work is
    # done: a bad value must neither pass the dry-run preflight nor be
    # discovered only after the endpoint evaluations have been paid for.
    if interpolation not in ("idpp", "linear"):
        raise ValueError(
            f"interpolation must be 'idpp' or 'linear'; got {interpolation!r}"
        )
    if not 0.0 <= climbing_image_start_fraction <= 1.0:
        raise ValueError(
            "climbing_image_start_fraction must be in [0, 1]; got "
            f"{climbing_image_start_fraction!r}"
        )

    if dry_run or is_dry_run_requested():
        estimate_bytes = (
            _neb_dry_run_estimate_bytes(
                reactant,
                basis=basis,
                method_lower=method_lower,
                n_images=n_images,
                n_jobs=n_jobs,
                n_atoms=n_atoms,
                is_periodic=is_periodic,
                warm_start=warm_start,
                rhf_options=rhf_options,
                uhf_options=uhf_options,
                rks_options=rks_options,
                uks_options=uks_options,
            )
            if is_dry_run_estimate_requested()
            else None
        )
        _write_neb_dry_run_manifest(
            output=output,
            method=method_lower,
            basis=basis,
            functional=functional if method_lower in ("rks", "uks") else None,
            record_hostname=record_hostname,
            estimate_bytes=estimate_bytes,
        )
        return None

    # DFT+U coverage matrix (full surface; no periodic guard
    # remains as of v0.9.0):
    #   * Molecular RHF / UHF / RKS / UKS -- Options-side
    #     (`dft_plus_u_sites` populated by
    #     `_apply_dft_plus_u_to_options`).
    #   * Periodic RHF / UHF / RKS / UKS via BIPOLE -- kwarg-side
    #     (`run_pbc_bipole_{rhf,uhf,rks,uks}(..., dft_plus_u=[...])`,
    #     all four landed on `main`).
    # The periodic worker `_evaluate_image_periodic` builds the
    # forwarded kwargs dict below -- it now fires for any method,
    # not just open-shell.

    # --- 1. Initial path ---------------------------------------------------
    # ``interpolation`` was validated above, so it is one of the two names.
    if interpolation == "idpp":
        initial_systems = interpolate_idpp(reactant, product, n_images)
    else:
        initial_systems = interpolate_linear(reactant, product, n_images)

    positions: list[np.ndarray] = [
        _positions_of(s) for s in initial_systems
    ]

    # DFT+U setup (molecular path only). The molecular
    # ``run_*_scf_with_jk`` entry points read the Dudarev fields
    # off Options, so we apply once here using a reactant-built
    # basis and reuse for every per-image SCF.
    # AO grouping depends only on basis structure (geometry-
    # invariant for a fixed basis name + atom ordering).
    # The periodic BIPOLE drivers consume ``dft_plus_u`` as a
    # direct kwarg (not via Options), so they bypass this block
    # -- the periodic worker forwards ``dft_plus_u`` per-call.
    # NOTE(review): when the caller supplied the options object, it is
    # handed to ``_apply_dft_plus_u_to_options`` as-is and the return
    # value is discarded, so the caller's object is presumably modified
    # in place -- confirm this is intended.
    if dft_plus_u and not is_periodic:
        from ._vibeqc_core import (
            BasisSet,
            RHFOptions,
            RKSOptions,
            UHFOptions,
            UKSOptions,
        )
        from .dft_plus_u import _apply_dft_plus_u_to_options

        _reactant_basis = BasisSet(reactant, basis)
        if method_lower == "rhf":
            if rhf_options is None:
                rhf_options = RHFOptions()
            _apply_dft_plus_u_to_options(
                rhf_options, _reactant_basis, dft_plus_u
            )
        elif method_lower == "uhf":
            if uhf_options is None:
                uhf_options = UHFOptions()
            _apply_dft_plus_u_to_options(
                uhf_options, _reactant_basis, dft_plus_u
            )
        elif method_lower == "rks":
            if rks_options is None:
                rks_options = RKSOptions()
            _apply_dft_plus_u_to_options(
                rks_options, _reactant_basis, dft_plus_u
            )
        elif method_lower == "uks":
            if uks_options is None:
                uks_options = UKSOptions()
            _apply_dft_plus_u_to_options(
                uks_options, _reactant_basis, dft_plus_u
            )

    # --- 2. Periodic-only setup: resolve the k-point mesh ------------------
    # (MACE is a Γ-only pre-trained model -- no k-mesh.)
    kmesh = None
    if is_periodic and not is_mace:
        from ._vibeqc_core import monkhorst_pack as _mp

        if kpoints is None:
            kmesh = _mp(reactant, (1, 1, 1))
        elif hasattr(kpoints, "to_bloch_kmesh") or (
            hasattr(kpoints, "kpoints") and hasattr(kpoints, "weights")
        ):
            from .kpoints import as_bloch_kmesh

            kmesh = as_bloch_kmesh(kpoints)
        else:
            kmesh = _mp(reactant, tuple(int(k) for k in kpoints))

    # --- 3. Evaluate endpoints once + cache --------------------------------
    mace_citation = ""
    if is_mace:
        # Load the MACE model once and reuse the calculator for every image.
        # Per-image work is a single forward pass (analytic energy + forces),
        # so the band is evaluated serially rather than pickling the torch
        # model across joblib processes. No SCF, no k-mesh, no basis.
        _mace_calc, _mace_numbers, _mace_cell, mace_citation = _load_mace_model(
            reactant, mlip_options, is_periodic
        )
        eval_kwargs: dict[str, Any] = {
            "calc": _mace_calc,
            "numbers": _mace_numbers,
            "cell": _mace_cell,
        }
        evaluator = _evaluate_image_mace
        n_jobs = 1
    elif is_msindo:
        # MSINDO (molecular, INDO over Slater orbitals). Composition is fixed
        # across the band, so the atomic numbers + charge + multiplicity are
        # read once from the reactant and reused for every image. No basis /
        # functional / k-mesh. Stateless engine => keep the user's ``n_jobs``
        # (parallel across images), unlike MACE's live-torch serial path.
        eval_kwargs = {
            "numbers": _atomic_numbers_of(reactant),
            "charge": int(reactant.charge),
            "multiplicity": int(reactant.multiplicity),
            "fd_step_bohr": fd_step_bohr,
            "dispersion_params": dispersion_params,
        }
        evaluator = _evaluate_image_msindo
    elif is_periodic:
        eval_kwargs = {
            "template": reactant,
            "basis_name": basis,
            "method": method_lower,
            "kmesh": kmesh,
            "functional": functional,
            "rhf_options": rhf_options,
            "uhf_options": uhf_options,
            "rks_options": rks_options,
            "uks_options": uks_options,
            "fd_step_bohr": fd_step_bohr,
            "dft_plus_u": dft_plus_u,
        }
        evaluator = _evaluate_image_periodic
    else:
        eval_kwargs = {
            "template": reactant,
            "basis_name": basis,
            "method": method_lower,
            "functional": functional,
            "rhf_options": rhf_options,
            "uhf_options": uhf_options,
            "rks_options": rks_options,
            "uks_options": uks_options,
            "gradient_options": gradient_options,
            "grid_options": grid_options,
            "dispersion_params": dispersion_params,
        }
        evaluator = _evaluate_image
    # The endpoints never move, so one evaluation each is enough; the
    # densities they return are not needed afterwards.
    e_R, g_R, _d_R = evaluator(positions[0], image_index=0, **eval_kwargs)
    e_P, g_P, _d_P = evaluator(
        positions[-1], image_index=len(positions) - 1, **eval_kwargs
    )

    # Intermediate slots are placeholders until the first outer iteration
    # fills them in. Each slot gets its own array so no two images ever
    # alias the same gradient buffer.
    energies: list[float] = [e_R] + [0.0] * n_images + [e_P]
    gradients: list[np.ndarray] = (
        [g_R] + [np.zeros((n_atoms, 3)) for _ in range(n_images)] + [g_P]
    )

    # Per-image density cache for within-image SCF warm-start
    # (NEB warm-start milestone). Each slot holds whatever density the
    # evaluator returned for that image in the previous outer iteration;
    # an evaluator that returns None leaves the slot empty, which makes
    # warm-start a no-op for that image. ``warm_start`` (kwarg on
    # run_neb) defaults to True; pass False to force every SCF to
    # cold-start (useful for benchmarking + for paranoid bisection).
    image_densities: list[Optional[np.ndarray]] = [None] * (n_images + 2)

    # --- 4. Outer loop (quick-min) -----------------------------------------
    from joblib import Parallel, delayed

    velocities: list[np.ndarray] = [
        np.zeros((n_atoms, 3)) for _ in range(n_images)
    ]
    step = initial_step
    max_force = float("inf")
    converged = False
    n_iter = 0
    # CI-NEB warm-up: the band needs to be roughly settled before
    # we promote its highest-energy image to climbing; otherwise the
    # climbing-image selection can flip between outer iterations and
    # destabilise the loop.
    climbing_warmup_iters = (
        int(round(max_iter * climbing_image_start_fraction))
        if climbing_image
        else 0
    )
    climbing_index: Optional[int] = None

    for outer in range(max_iter):
        n_iter = outer + 1
        # Per-image warm-start density: pass last iter's converged
        # density when ``warm_start`` is on; otherwise pass None
        # which falls back to the cold initial guess (SAD/Hcore).
        results = Parallel(n_jobs=n_jobs, prefer="processes")(
            delayed(evaluator)(
                positions[i],
                **eval_kwargs,
                initial_density=image_densities[i] if warm_start else None,
                image_index=i,
            )
            for i in range(1, n_images + 1)
        )
        for k, (e_k, g_k, d_k) in enumerate(results):
            energies[k + 1] = e_k
            gradients[k + 1] = g_k
            # May be None when the evaluator returned no density; harmless.
            image_densities[k + 1] = d_k

        # Promote a climbing image once we are out of the warm-up
        # phase. Henkelman+Uberuaga+Jónsson 2000: pick the highest-
        # energy intermediate image at the moment of promotion;
        # keep that selection fixed for the remainder of the run so
        # the climber doesn't lose its momentum to a re-selection.
        if (
            climbing_image
            and climbing_index is None
            and outer >= climbing_warmup_iters
        ):
            climbing_index = int(np.argmax(energies[1:-1])) + 1

        # Only the forces drive the integrator; the tangents reported in
        # the result are recomputed once after the loop.
        forces, _tangents = _neb_forces(
            positions,
            energies,
            gradients,
            spring_constant,
            frozen_mask,
            climbing_index=climbing_index,
            lattice=neb_lattice,
            dim=neb_dim,
        )
        max_force = max(float(np.max(np.abs(f))) for f in forces)

        if progress:
            ts_idx = int(np.argmax(energies[1:-1])) + 1
            # Plain Cartesian image-to-image distance (no minimum-image
            # wrapping is applied here); reporting only.
            path_len = sum(
                float(np.linalg.norm(positions[i + 1] - positions[i]))
                for i in range(len(positions) - 1)
            )
            climb_tag = (
                f" [CI={climbing_index}]" if climbing_index is not None else ""
            )
            print(
                f"neb iter {n_iter:3d}{climb_tag}: max|F| = {max_force:.4e} "
                f"Ha/bohr, E_TS = {energies[ts_idx]:.6f} Ha, "
                f"path length = {path_len:.3f} bohr"
            )

        if max_force < conv_tol_force:
            converged = True
            break

        # Iteration budget exhausted: a further displacement would never
        # be evaluated, and applying it would leave the returned
        # geometries out of sync with the energies / gradients /
        # max_force computed above. Stop on the evaluated band instead.
        if outer == max_iter - 1:
            break

        # Quick-min velocity update + step. Each intermediate image:
        #   v <- v + Δt F
        #   if v.F < 0: v <- 0
        #   else:       v <- (v.F̂) F̂           (project onto F direction)
        #   R <- R + Δt v
        # Adaptive Δt: grow by 1.1 when aligned, reset on flips. This
        # is the standard MDMin / quick-min recipe (Henkelman+Jónsson
        # 2000 Sec. III.C and ASE's MDMin optimiser).
        aligned = True
        for k in range(n_images):
            f_k = forces[k]
            f_flat = f_k.ravel()
            f_norm = float(np.linalg.norm(f_flat))
            if f_norm < 1e-15:
                # Force-free image: park it (no step, no velocity).
                velocities[k] = np.zeros_like(f_k)
                continue
            v_k = velocities[k] + step * f_k
            dot = float(np.sum(v_k * f_k))
            if dot < 0.0:
                v_k = np.zeros_like(v_k)
                aligned = False
            else:
                # Project velocity onto force direction.
                f_hat = f_k / f_norm
                v_k = float(np.sum(v_k * f_hat)) * f_hat
            # Cap displacement: rescale the whole image step so its
            # largest Cartesian component is at most max_step, and drop
            # the velocity that produced the over-long step.
            disp = step * v_k
            disp_norm = float(np.max(np.abs(disp)))
            if disp_norm > max_step:
                disp = disp * (max_step / disp_norm)
                v_k = np.zeros_like(v_k)
            velocities[k] = v_k
            positions[k + 1] = positions[k + 1] + disp
        # Grow step when every image stayed aligned; reset on any
        # flip. Cap at 10x initial_step to keep the integrator stable
        # near convergence.
        if aligned:
            step = min(step * 1.1, 10.0 * initial_step)
        else:
            step = initial_step

    # --- 5. Build NEBResult ------------------------------------------------
    images: list[NEBImage] = []
    final_systems = [
        _rebuild_with_positions(reactant, p) for p in positions
    ]
    # Hand back the caller's own endpoint objects rather than rebuilt copies.
    final_systems[0] = reactant
    final_systems[-1] = product
    # Tangents for endpoints aren't defined; populate intermediates only.
    # Recomputed here so they are available even when the loop body never
    # ran; the accompanying forces are not needed.
    _, tangents = _neb_forces(
        positions,
        energies,
        gradients,
        spring_constant,
        frozen_mask,
        climbing_index=climbing_index,
        lattice=neb_lattice,
        dim=neb_dim,
    )
    for i, s in enumerate(final_systems):
        img = NEBImage(system=s)
        img.energy = energies[i]
        img.gradient = gradients[i].copy()
        if 1 <= i <= n_images:
            img.tangent = tangents[i - 1]
        images.append(img)
    path = NEBPath(
        images=images,
        spring_constant=spring_constant,
        climbing_image_index=climbing_index,
    )

    energies_arr = np.array(energies, dtype=float)
    # Highest-energy intermediate image (excluding endpoints). n_images >= 1
    # is enforced at the top, so the interior slice is never empty.
    ts_index: Optional[int] = int(np.argmax(energies_arr[1:-1])) + 1

    return NEBResult(
        path=path,
        energies=energies_arr,
        converged=converged,
        transition_state_index=ts_index,
        n_iter=n_iter,
        max_force=max_force,
        method=method,
        basis=basis,
        functional=functional,
        is_periodic=is_periodic,
        used_dft_plus_u=bool(dft_plus_u),
        mace_model_citation=(mace_citation or None),
    )


# Public API of this module: the names bound by a star-import of it.
__all__ = [
    "NEBImage",
    "NEBPath",
    "NEBResult",
    "NEBImageSCFError",
    "interpolate_linear",
    "interpolate_idpp",
    "run_neb",
]