# Source code for vibeqc.pbc_bipole

"""BIPOLE-style periodic RHF driver in CRYSTAL's electrostatic gauge.

CRYSTAL's 3D periodic HF energy uses one shared Ewald state for the
point-charge tail terms and a separate screened real-space machinery for
the AO two-electron build. This driver mirrors that composition:

* ``V_ne`` and ``E_nn`` use ``EWALD_3D`` with one explicit
  ``EwaldOptions`` object -- a single shared Ewald state across the terms.
  The default 3D ``V_ne`` path evaluates the smooth reciprocal piece
  analytically with shifted AO-pair Fourier transforms.
* The optional ``use_ewald_j_split`` path builds
  ``J = J_SR(w) + J_LR(w)``, where the range-separation parameter ``w``
  is the same Ewald parameter ``a`` used by ``V_ne`` / ``E_nn``.
  ``J_LR`` is represented as real-space blocks for real-space
  (lattice-sum) energy contractions, and includes the electron-electron
  neutralising-background Fock potential ``-pi N_e / (a^2 V) * S(g)``.
* Exchange remains the full direct-space ``K`` from
  ``build_fock_2e_real_space``; no Madelung K shift is applied.
* Energies are always evaluated as real-space lattice contractions,
  ``sum_g tr[D(g) H(g)] + 1/2 sum_g tr[D(g) F^2e(g)]``, not from a
  Γ-folded operator.

V_ne gauge placement
--------------------
CRYSTAL and vibe-qc use the same four-component Ewald decomposition
(real-space erfc, reciprocal-space K!=0 sum, self-energy, jellium
background), but place the G=0 correction differently:

* **CRYSTAL**: the jellium background ``-pi Q_n^2/(2 b^2 V)`` is added
  to the nuclear-repulsion term ``E_nn``.  ``V_ne`` includes only
  the K!=0 reciprocal sum; the G=0 term is handled implicitly through
  the total-energy cancellation.
* **vibe-qc**: the V_ne operator receives an explicit background
  ``+pi Q_n/(a^2 V) . S(g)``, and E_nn receives the standard
  ``-pi Q_n^2/(2 a^2 V)`` jellium term.  For a neutral cell these
  cancel exactly in E_total.  Per-component diagnostics (E_ne, E_nuc)
  therefore differ from CRYSTAL's ENECYCLE output by the background
  magnitude (~16 Ha for MgO/STO-3G), but the total energy is
  invariant.

This is still an algorithmic re-implementation, not a CRYSTAL wrapper,
and no external QC program is imported at runtime. The remaining parity
gap to CRYSTAL's native BIPOLE code is the full Saunders-Dovesi-Roetti
multipole-far-pair branch: CRYSTAL replaces far direct ERIs with
truncated multipole expansions and prints the corresponding EXT
EL-POLE / EXT EL-SPHEROPOLE decomposition. The exact Ewald-J path here
is the production energy route; the opt-in multipole far-field branch
is still experimental/off by default, and the Γ-only CYC0 two-electron
external-parity comparison remains gated until that decomposition is
certified. Dense-k final parity should still be checked against the
external diagnostics before widening the certified surface.
"""

from __future__ import annotations

import time
from dataclasses import dataclass, field
from typing import List, Optional, Sequence, Tuple, Union

import numpy as np

from ._vibeqc_core import (
    BasisSet,
    BlochKMesh,
    EwaldOptions,
    GridOptions,
    InitialGuess,
    LatticeMatrixSet,
    LatticeSumOptions,
    PeriodicSystem,
    SCFIteration,
    bloch_sum,
    build_fock_2e_real_space,
    build_jk_2e_real_space,
    compute_kinetic_lattice,
    compute_nuclear_erfc_lattice,
    compute_overlap_lattice,
    direct_lattice_cells,
    ewald_nuclear_repulsion,
    make_lattice_matrix_set,
    nuclear_repulsion_per_cell,
    real_space_density_from_kpoints,
)
from ._vibeqc_core import (
    monkhorst_pack as _native_monkhorst_pack,
)
from .bipole_ext_el_pole import compute_ext_el_spheropole
from .guess import initial_density_closed_shell
from .level_shift_schedule import LevelShiftSchedule
from .mom import select_occupied_by_max_overlap as _mom_select
from .oda import compute_oda_lambda as _compute_oda_lambda
from .oda import oda_mix_densities as _oda_mix
from .periodic_rhf_multi_k_ewald import (
    _canonical_orthogonalizer_complex,
    _damp_lattice_matrix,
    _diag_in_orth_basis,
)
from .periodic_scf_accelerators import (
    DynamicDamping,
    MultiKPeriodicSCFAccelerator,
)
from .periodic_v_ne import compute_nuclear_lattice_dispatch
from .progress import ProgressLogger, resolve_progress
from .scf_divergence import check_scf_divergence
from .smearing._support import reject_unsupported_smearing_temperature
from .symmetry_integrals_reduced import (
    compute_kinetic_lattice_reduced,
    compute_overlap_lattice_reduced,
)

# Public API of this module. ``PBCBipoleEnergyComponents`` is not defined
# here: it is imported from ``.pbc_bipole_common`` further down and
# re-exported through this list.
__all__ = [
    "PBCBipoleEnergyComponents",
    "PBCBipoleRHFResult",
    "run_pbc_bipole_rhf",
]



[docs]
@dataclass
class PBCBipoleRHFResult:
    """Result of :func:`run_pbc_bipole_rhf`.

    Per-cell ``energy`` / ``e_electronic`` / ``e_nuclear`` and per-k
    matrices (``mo_energies``, ``mo_coeffs``, ``fock``, ``overlap``,
    ``hcore``) alongside the converged real-space ``density``. For 3D
    BIPOLE runs, ``ewald_alpha_bohr_inv`` records the single alpha used
    by V_ne / E_nn / optional J_LR.

    The first eleven fields (``energy`` through ``density``) have no
    defaults and must always be supplied; every later field carries a
    default and holds optional diagnostic / provenance data.
    """

    # Scalar per-cell energies and SCF status.
    energy: float
    e_electronic: float
    e_nuclear: float
    n_iter: int
    converged: bool

    # Per-k matrices: one list entry per k-point (see ``kpoints_cart``
    # below for the ordering contract on ``mo_coeffs`` / ``mo_energies``).
    mo_energies: List[np.ndarray]
    mo_coeffs: List[np.ndarray]
    fock: List[np.ndarray]
    overlap: List[np.ndarray]
    hcore: List[np.ndarray]

    # Converged real-space density as a lattice matrix set.
    density: LatticeMatrixSet

    # Fields with defaults must come after all non-default fields:
    # ``@dataclass`` raises ``TypeError`` at class-definition time if a
    # field without a default follows one with a default.
    # ``e_ext_el_spheropole`` is ``None`` when the EXT EL-SPHEROPOLE term
    # is omitted (the ``run_pbc_bipole_rhf`` docstring states this for
    # dim < 3 runs).
    e_ext_el_spheropole: Optional[float] = None
    # NOTE(review): presumably one ``SCFIteration`` record per SCF cycle --
    # confirm against the driver loop.
    scf_trace: List[SCFIteration] = field(default_factory=list)
    ewald_alpha_bohr_inv: Optional[float] = None
    # Dudarev DFT+U contribution per unit cell (Hartree). 0 unless the
    # caller passed ``dft_plus_u=[HubbardSite(...)]``.
    e_dft_plus_u: float = 0.0
    # ``PBCBipoleEnergyComponents`` is imported from ``.pbc_bipole_common``
    # below this class; the forward reference is legal because the module
    # uses ``from __future__ import annotations`` (lazy annotations).
    energy_components: List[PBCBipoleEnergyComponents] = field(
        default_factory=list,
    )
    # Exchange convention provenance (option (b), 2026-06-10): True when
    # the run used the Ewald exchange split (K_SR(erfc) + K_LR(recip) +
    # G=0/Madelung correction, full-Bloch density, no spheropole term).
    # Consumers implementing the legacy gauge (the analytic-gradient
    # preview) check this flag and refuse mismatched inputs.
    exchange_ewald_split: bool = False
    # NOTE(review): presumably the ``exchange_exxdiv`` argument of the run
    # ('ewald' or 'none') and the Fock mixing factor it used -- confirm
    # where the driver populates these two fields.
    exchange_exxdiv: Optional[str] = None
    fock_mixing: float = 0.0
    # Cartesian k-points (bohr^-1) and weights this result spans, in the
    # same order as the per-k ``mo_coeffs`` / ``mo_energies`` lists. Carried
    # so optional Gamma-only / single-k output writers (molden, QVF
    # wavefunction) can locate the Gamma block instead of guessing that the
    # first k-point is Gamma. Mirrors the GDF multi-k result contract
    # (periodic_k_gdf.py). None for legacy results built without it.
    kpoints_cart: Optional[np.ndarray] = None
    kpoint_weights: Optional[np.ndarray] = None



@dataclass
class _PBCBipoleFockBuild:
    """Internal Fock-build bundle for one density in the BIPOLE driver.

    Only ``f2e_real`` and ``f_k_list`` are required; the remaining fields
    are optional energy pieces that default to ``None`` (or ``0.0`` for
    the k-space correction) when not supplied.
    """

    # NOTE(review): presumably the real-space two-electron Fock blocks
    # F^2e(g) and the matching per-k Fock matrices F(k) -- confirm against
    # the code that constructs this bundle.
    f2e_real: LatticeMatrixSet
    f_k_list: List[np.ndarray]
    # Optional energy components; ``None`` when not provided by the
    # builder. NOTE(review): names suggest short-range J, long-range J,
    # exchange and multipole-J energies -- confirm units and sign
    # conventions with the producer.
    e_j_short_range: Optional[float] = None
    e_j_long_range: Optional[float] = None
    e_exchange: Optional[float] = None
    e_j_multipole: Optional[float] = None
    # k-space exchange correction (Ewald exchange split): the K_LR +
    # G=0/Madelung pieces enter F(k) directly (not the real-space f2e
    # blocks), so their energy contribution
    # 1/2 * sum_k w_k Tr[D(k) ΔF(k)] must be added to the
    # lattice-contracted E_2e by the caller.
    e_2e_k_correction: float = 0.0


from .pbc_bipole_common import (
    PBCBipoleEnergyComponents,
    _bloch_sum_blocks,
    _cell_key,
    _compute_nuclear_lattice_ewald_reciprocal_ft,
    _crystal_ewald_options,
    _default_bipole_v_ne_grid_options,
    _density_set_gamma_or_lattice,
    _expand_ibz_kmesh_for_ewald_j,
    _lattice_contract,
    _lattice_contract_blocks,
    _zero_cross_cell_density,
    bvk_torus_density_matrices,
    prepare_bipole_lattice_options,
    resolve_bipole_fock_symmetry,
    warn_bipole_charged_cell,
    warn_bipole_legacy_multik_gauge,
    home_cell_block,
)
from .pbc_bipole_fock import (
    BipoleFockContext,
    build_bipole_restricted_fock,
)



[docs]
def run_pbc_bipole_rhf(
    system: PeriodicSystem,
    basis: BasisSet,
    kmesh: BlochKMesh,
    options=None,
    *,
    linear_dep_threshold: float = 1e-7,
    canonical_orth_normalize_diag_first: bool = True,
    level_shift_schedule: Optional["LevelShiftSchedule"] = None,
    use_mom: bool = False,
    use_oda: bool = False,
    oda_trust_lambda_max: float = 1.0,
    use_incremental_fock: bool = True,
    use_ewald_j_split: Optional[bool] = None,
    ewald_omega: Optional[float] = None,
    ewald_precision: float = 1e-8,
    v_ne_grid_options: Optional[GridOptions] = None,
    use_multipole_diag: bool = False,
    use_multipole_far_field: Optional[bool] = None,
    multipole_l_max: int = 2,
    use_exchange_ewald_split: Optional[bool] = None,
    exchange_exxdiv: str = "ewald",
    use_fock_symmetry: Optional[bool] = None,
    use_fock_symmetry_reduce: bool = False,
    sr_image_extent_bohr: Optional[float] = None,
    progress: Union[bool, ProgressLogger, None] = None,
    verbose: Optional[int] = None,
    initial_density: Optional[Sequence[np.ndarray]] = None,
    bz_integration: Optional[str] = None,
    dft_plus_u: Optional[List["HubbardSite"]] = None,
) -> PBCBipoleRHFResult:
    """Multi-k closed-shell RHF via the CRYSTAL-gauge BIPOLE scaffold.

    ``dft_plus_u``: optional list of :class:`HubbardSite`. When set,
    the Dudarev rotationally-invariant per-spin V_U is added to every
    per-k Fock matrix using the same per-spin Bloch-summed
    convention as :func:`run_pbc_bipole_uhf` (closed-shell:
    ``P_s = P_total / 2``, ``E_U_total = 2 x E_s``).  The +U
    energy lands on ``result.e_dft_plus_u``.

    Algorithm (real-space two-electron / bielectronic build):
      1. Real-space one-electron integrals S(g), T(g), V_ne(g) at
         ``opts.lattice_opts.cutoff_bohr``. For 3D systems V_ne uses
         the same Ewald a as E_nn.
      2. Bloch-sum to S(k), Hcore(k) per k-point; canonical-orth X(k).
      3. Initial guess via ``opts.initial_guess`` (default SAD).
      4. SCF iter:
         a. Build F^{2e}(g). With ``use_ewald_j_split=True`` this is
            ``J_SR(g;w) + J_LR(g;w) + V_bg.S(g) - 1/2K(g)`` where the
            exchange convention depends on ``use_exchange_ewald_split``
            (below). With the flag off, use the legacy direct-only
            ``build_fock_2e_real_space`` scaffold.
         b. Bloch-sum F^{2e}(g) -> F(k); add Hcore(k).
         c. Energy: E_elec = S_g tr[D(g)Hcore(g)]
            + 1/2S_g tr[D(g)F^2e(g)] in real-space block form
            (real-space lattice-sum convention).
         d. Optional DIIS extrapolation of F(k) via [F,DS] errors.
         e. Optional LEVSHIFT shift on F(k).
         f. Diagonalise F(k) -> C(k), e(k).
         g. Optional MOM reorder of occupied subspace.
         h. Rebuild D_real via real_space_density_from_kpoints.
         i. Optional ODA mixing on density.
      5. E_total = E_elec + E_nuc.

    ``use_ewald_j_split`` defaults to ``None``. In that mode the
    driver automatically uses the CRYSTAL-gauge Ewald-J split for 3D
    systems and keeps the old direct-only path for dim < 3 diagnostic
    runs. Pass ``False`` explicitly only when you want the legacy
    direct-only F^2e scaffold for debugging. (Passing ``True`` on a
    dim < 3 system raises -- the Ewald split needs a 3D reciprocal
    lattice.)

    ``use_exchange_ewald_split`` (2026-06-10 energy-assembly redesign;
    multi-k q!=0 channels 2026-06-11; multi-k default flip 2026-06-13)
    defaults to ``None`` = auto: ON for any 3D run under the Ewald J
    split (Γ AND multi-k), OFF otherwise. When ON, the exchange uses
    the Ewald split convention (module docstring of
    :mod:`vibeqc.bipole_fock_ewald`)::

        K(k) = K_SR(erfc w, direct) + K_LR(erf w, reciprocal, q+G!=0)
               + (ξ_M - pi/(V_sc.w^2)).S(k).D(k).S(k)

    with ``ξ_M`` the probe-charge Ewald (Madelung) constant of the
    BvK supercell (= the unit cell at Γ; ``V_sc = n_k.V``) when
    ``exchange_exxdiv='ewald'`` (the default; PySCF-equivalent) or 0
    when ``'none'``. At multi-k the LR term couples every k-point
    pair through the momentum-transfer channels ``q = k - k′`` (see
    :func:`vibeqc.bipole_fock_ewald.compute_K_long_range_at_k`). In
    this mode the SCF density is the full Bloch fold (the Γ-locality
    projection ``P(g!=0)=0`` is **not** applied), and the EXT
    EL-SPHEROPOLE term is omitted from the total -- at the corrected
    gauge it is a double-count (MgO Γ fixed-density audit,
    2026-06-10: the reassembled total matches PySCF GDF RHF to
    truncation with no spheropole term). When OFF (explicit
    ``False``), the legacy convention is kept: full-Coulomb
    direct-space K, Γ-locality projection at n_k = 1, spheropole term
    added -- known to mis-state absolute energies on tight ionic
    cells (kept only for the legacy-gauge analytic gradient + parity
    diagnostics). The corrected multi-k gauge needs a Monkhorst-Pack
    ``BlochKMesh`` carrying its ``mesh`` metadata; under the auto
    default an ad-hoc k-list (band path / explicit list) at multi-k
    falls back to the legacy gauge with a log note, and an explicit
    ``True`` with such a mesh raises.

    For dim < 3 the whole one- and two-electron Coulomb gauge falls
    back to ``DIRECT_TRUNCATED`` (no Ewald, no reciprocal sum), and the
    ``EXT EL-SPHEROPOLE`` correction -- a 3D-Ewald reciprocal-space
    (K=0 limit) term -- is identically zero, so it is omitted and
    ``e_ext_el_spheropole`` is ``None``. The resulting energy is the
    direct-truncated value: vacuum-independent and equal to the
    molecular RHF energy in the isolated-cell limit (see
    ``tests/test_pbc_bipole_dim_lt3.py``).

    For 3D systems the default ``V_ne`` implementation is analytic:
    erfc-screened nuclear attraction from libint plus a reciprocal-space
    AO-pair Fourier-transform sum. Passing ``v_ne_grid_options`` opts
    into the older grid-quadrature long-range ``V_ne`` path for
    diagnostics.
    """
    from ._vibeqc_core import PeriodicRHFOptions

    opts = options if options is not None else PeriodicRHFOptions()
    if bz_integration is not None:
        bz_kind = str(bz_integration).strip().lower()
        if bz_kind != "smearing":
            raise NotImplementedError(
                "run_pbc_bipole_rhf: bz_integration only accepts None or "
                "'smearing'; parameter-free Gilat integration is available "
                "on the BIPOLE RKS route only."
            )
    reject_unsupported_smearing_temperature(
        opts,
        "run_pbc_bipole_rhf",
        detail=(
            "BIPOLE RHF requires integer occupations; use BIPOLE "
            "RKS/UHF/UKS for finite-temperature smearing."
        ),
    )
    lat_opts: LatticeSumOptions = opts.lattice_opts
    plog = resolve_progress(progress, verbose=verbose)
    # CRYSTAL-style gauge separation (per the EWALD_3D / BIPOLE audit):
    # V_ne and E_nn use Ewald with one shared alpha. F^{2e} uses the
    # direct lattice cell list for J_SR/K; the optional J_LR reciprocal
    # sum consumes the same alpha as the one-electron Ewald state.
    (
        use_ewald_j_split,
        use_ewald_j_split_auto,
        lat_opts_2e,
        lat_opts_1e,
    ) = prepare_bipole_lattice_options(system, lat_opts, use_ewald_j_split, plog)

    plog.info(f"PBC BIPOLE (CRYSTAL-gauge) / cutoff {lat_opts.cutoff_bohr:.2f} bohr")
    plog.info(
        f"  V_ne + E_nn  : {lat_opts_1e.coulomb_method.name}"
        f"   (Ewald gauge for point-charge tails)"
    )
    plog.info(
        f"  F^2e (J + K) : "
        f"{'EWALD_J_SPLIT' if use_ewald_j_split else lat_opts_2e.coulomb_method.name}"
        f"{' (auto)' if use_ewald_j_split_auto else ''}"
        f"   (direct J_SR/K cell list"
        f"{' + reciprocal J_LR' if use_ewald_j_split else ''})"
    )
    plog.info(f"basis: {basis.name}  ({basis.nbasis} BFs / {basis.nshells} shells)")

    # Closed-shell sanity.
    n_elec = system.n_electrons()
    if n_elec % 2 != 0:
        raise ValueError(
            f"run_pbc_bipole_rhf: closed-shell RHF requires even electron "
            f"count; got {n_elec}"
        )
    if system.multiplicity != 1:
        raise ValueError(
            f"run_pbc_bipole_rhf: requires multiplicity=1; got {system.multiplicity}"
        )
    n_occ = n_elec // 2

    _kmesh_ibz = kmesh
    _ir_mapping = np.asarray(getattr(kmesh, "ir_mapping", []), dtype=int).reshape(-1)

    k_points = list(_kmesh_ibz.kpoints)
    weights = np.asarray(_kmesh_ibz.weights, dtype=float)

    if use_ewald_j_split and _ir_mapping.size > 0:
        # IBZ-reduced input meshes are EXPANDED TO THE FULL MESH up
        # front and the whole SCF runs on the full mesh. The previous
        # "IBZ-native" shortcut diagonalised at the IBZ points and
        # replicated D(k) into each star without the AO rotation
        # D(R.k) = P(R).D(k).P(R)ᵀ -- exact only for trivial stars (the
        # He validation cells); on MgO/STO-3G [2,2,2] it left the SCF
        # unconverged 8.25 Ha from the full-mesh result (2026-06-10
        # probe; regression in tests/test_pbc_bipole_multik_ewald_split
        # pins full==IBZ equality). True IBZ-native reduction needs the
        # symmetry-adapted k-star transport -- groundwork + probe
        # findings live in vibeqc.periodic_k_symmetry.
        kmesh_full = _expand_ibz_kmesh_for_ewald_j(system, kmesh, plog)
        if len(list(kmesh_full.kpoints)) > len(k_points):
            plog.info(
                "  IBZ input mesh expanded to the full MP mesh for the "
                "whole SCF (correctness; IBZ-native reduction pending "
                "symmetry-adapted k-star transport)"
            )
            kmesh = kmesh_full
            k_points = list(kmesh.kpoints)
            weights = np.asarray(kmesh.weights, dtype=float)
            n_k = len(k_points)
            _ir_mapping = np.asarray([], dtype=int)
        k_points_full = k_points
        weights_full = weights
    else:
        k_points_full = k_points
        weights_full = weights
    n_k = len(k_points)
    if n_k == 0:
        raise ValueError("kmesh has no k-points")
    if not np.isclose(weights.sum(), 1.0):
        raise ValueError(f"kmesh.weights must sum to 1; got {weights.sum():.6f}")
    plog.info(
        f"k-mesh: {n_k} k-point{'s' if n_k != 1 else ''}, "
        f"weights sum = {weights.sum():.4f}"
    )

    # ---- Exchange Ewald-split resolution (option (b), 2026-06-10) ----
    # Corrected exchange convention K_SR(erfc) + K_LR(reciprocal) +
    # G=0/Madelung correction (bipole_fock_ewald module docstring).
    # The corrected gauge is the DEFAULT under the Ewald J split at BOTH
    # Γ and multi-k (Phase-5 flip, 2026-06-13): the q = k-k' != 0
    # LR-exchange channels (Phase 3, 2026-06-11/12) are parity-validated
    # (H₂ box [2,1,1] vs PySCF KRHF, supercell-unfolding identity to
    # +0.0001 mHa/cell; MgO [2,2,2] c8 -14.8 mHa vs legacy +3.9 Ha).
    # Pass use_exchange_ewald_split=False for the legacy gauge.
    if exchange_exxdiv not in ("ewald", "none"):
        raise ValueError(
            f"run_pbc_bipole_rhf: exchange_exxdiv must be 'ewald' or "
            f"'none'; got {exchange_exxdiv!r}"
        )
    _x_split_auto = use_exchange_ewald_split is None
    exchange_split_active = (
        bool(use_ewald_j_split)
        if _x_split_auto
        else bool(use_exchange_ewald_split)
    )
    if exchange_split_active and not use_ewald_j_split:
        raise ValueError(
            "run_pbc_bipole_rhf: use_exchange_ewald_split=True requires "
            "the Ewald J split (use_ewald_j_split=True)."
        )
    # Multi-k split: the q-channel tables, the BvK-torus density fold,
    # and the supercell Madelung correction all need the true
    # Monkhorst-Pack dimensions. Ad-hoc k-lists carry mesh = (1,1,1)
    # placeholders (see the to_bloch_kmesh binding). Under the auto
    # default an ad-hoc multi-k mesh falls back to the legacy gauge
    # (no surprise breakage for explicit-k-list / band-path runs);
    # an explicit use_exchange_ewald_split=True still raises.
    _bvk_mesh: Optional[Tuple[int, int, int]] = None
    if exchange_split_active and n_k > 1:
        _mesh_attr = tuple(
            int(x) for x in getattr(kmesh, "mesh", (1, 1, 1))
        )
        if int(np.prod(_mesh_attr)) != n_k:
            if _x_split_auto:
                plog.info(
                    "  multi-k corrected exchange gauge needs a "
                    "Monkhorst-Pack mesh (BvK-torus fold + supercell ξ_M); "
                    "this ad-hoc k-list has no mesh metadata -> falling back "
                    "to the legacy gauge. Pass a monkhorst_pack(...) mesh "
                    "for the corrected gauge."
                )
                exchange_split_active = False
            else:
                raise ValueError(
                    "run_pbc_bipole_rhf: the Ewald exchange split at multi-k "
                    "requires a Monkhorst-Pack BlochKMesh carrying its mesh "
                    f"dimensions (got mesh={_mesh_attr} for {n_k} k-points). "
                    "Build the mesh via monkhorst_pack(...); ad-hoc k-point "
                    "lists are not supported on the corrected gauge."
                )
        else:
            _bvk_mesh = _mesh_attr

    warn_bipole_legacy_multik_gauge(system, exchange_split_active, n_k, plog)
    warn_bipole_charged_cell(system, plog)

    # CRYSTAL-style shared Ewald state for all point-charge-tail terms.
    # V_ne, E_nn, and the optional reciprocal J^LR build must consume the
    # same alpha AND the same K_max -- because finite-cutoff G=0
    # cancellation requires matched reciprocal envelopes.
    ewald_options_1e: Optional[EwaldOptions] = None
    omega_used: Optional[float] = None
    ewald_cell_volume: Optional[float] = None
    ewald_k_max: Optional[float] = None
    if system.dim == 3:
        from .bipole_ext_el_pole import (
            crystal_default_ewald_alpha,
            crystal_ewald_reciprocal_cutoff,
        )

        V_cell = float(
            abs(
                np.linalg.det(np.asarray(system.lattice, dtype=float)),
            )
        )
        ewald_cell_volume = V_cell
        omega_used = (
            float(ewald_omega)
            if ewald_omega is not None
            else crystal_default_ewald_alpha(V_cell)
        )
        ewald_k_max = crystal_ewald_reciprocal_cutoff(V_cell)
        ewald_options_1e = _crystal_ewald_options(
            lat_opts_1e,
            alpha_bohr_inv=omega_used,
            tolerance=float(ewald_precision),
            recip_cutoff_bohr_inv=ewald_k_max,
        )
        plog.info(
            f"  Ewald state: a = {omega_used:.6f} bohr⁻¹, "
            f"cutoff_real = {lat_opts_1e.nuclear_cutoff_bohr:.2f} bohr, "
            f"K_max = {ewald_k_max:.2f} bohr⁻¹, "
            f"tol = {float(ewald_precision):.0e}"
        )

    # Probe-charge Ewald (Madelung) constant for the exchange G=0
    # correction (exxdiv='ewald'; PySCF-equivalent). a-independent.
    # At multi-k the constant is the BvK-SUPERCELL Madelung -- the
    # multi-k SCF is the supercell Γ SCF exactly unfolded, and PySCF's
    # _ewald_exxdiv_for_G0 applies this same single madelung(cell,
    # kpts) value to every k-point.
    _xi_madelung = 0.0
    if exchange_split_active and exchange_exxdiv == "ewald":
        if n_k > 1:
            from .bipole_fock_ewald import probe_charge_madelung_supercell

            assert _bvk_mesh is not None
            _xi_madelung = probe_charge_madelung_supercell(system, _bvk_mesh)
        else:
            from .bipole_fock_ewald import probe_charge_madelung

            _xi_madelung = probe_charge_madelung(system)
    if exchange_split_active:
        plog.info(
            f"  Exchange: Ewald split -- K_SR(erfc w) + K_LR(reciprocal"
            + (f", {n_k}^2 (k,k′) q-channels" if n_k > 1 else "")
            + f") + G=0 correction (exxdiv={exchange_exxdiv}"
            + (
                f", ξ_M{'(supercell)' if n_k > 1 else ''} = "
                f"{_xi_madelung:.6f} Ha"
                if exchange_exxdiv == "ewald"
                else ""
            )
            + ")"
        )
        plog.info(
            "  Density: full Bloch fold (Γ-locality projection OFF); "
            "EXT EL-SPHEROPOLE omitted (gauge-consistent)"
        )

    # ---- Real-space one-electron integrals -------------------------------
    # S, T use cell-list-only cutoff (lat_opts_2e -- they're independent
    # of coulomb_method). V_ne uses lat_opts_1e so the EWALD_3D path is
    # taken on 3D systems (CRYSTAL-equivalent gauge).
    with plog.stage(
        "integrals_lattice",
        detail=f"S/T/V at cutoff {lat_opts.cutoff_bohr:.2f} bohr",
    ):
        _use_sym = bool(getattr(getattr(system, "symmetry", None), "operations", None))
        if _use_sym:
            ops = system.symmetry.operations
            cells = direct_lattice_cells(system, lat_opts_2e.cutoff_bohr)
            plog.info(
                f"S/T integrals: symmetry-reduced path "
                f"(SG {system.symmetry.international_symbol}, "
                f"{system.symmetry.order} ops, "
                f"{len(cells)} lattice cells)"
            )
            _, S_blocks = compute_overlap_lattice_reduced(
                basis,
                system,
                lat_opts_2e,
                ops,
            )
            S_lat = make_lattice_matrix_set(
                basis.nbasis, cells, [np.asarray(b, dtype=float) for b in S_blocks]
            )

            _, T_blocks = compute_kinetic_lattice_reduced(
                basis,
                system,
                lat_opts_2e,
                ops,
            )
            T_lat = make_lattice_matrix_set(
                basis.nbasis, cells, [np.asarray(b, dtype=float) for b in T_blocks]
            )
        else:
            S_lat = compute_overlap_lattice(basis, system, lat_opts_2e)
            T_lat = compute_kinetic_lattice(basis, system, lat_opts_2e)
        v_ne_lr_cache = None
        if (
            system.dim == 3
            and ewald_options_1e is not None
            and v_ne_grid_options is None
        ):
            plog.info(
                "  V_ne Ewald long range: analytic AO-pair FT (shared with J^LR cache)"
            )
            V_lat, v_ne_lr_cache = _compute_nuclear_lattice_ewald_reciprocal_ft(
                basis,
                system,
                lat_opts_1e,
                ewald_options_1e,
                S_lat,
                precision=ewald_precision,
                K_max=ewald_k_max,
            )
        else:
            v_ne_grid = (
                v_ne_grid_options
                if v_ne_grid_options is not None
                else (_default_bipole_v_ne_grid_options() if system.dim == 3 else None)
            )
            V_lat = compute_nuclear_lattice_dispatch(
                basis,
                system,
                lat_opts_1e,
                grid_options=v_ne_grid,
                ewald_options=ewald_options_1e,
            )
    cells = list(S_lat.cells)
    plog.info(f"n_cells in lattice sum = {len(cells)}")

    # ---- Lattice-fold convergence guard (Ewald exchange split) ----------
    # The corrected gauge contracts the FULL Bloch folds of S/T/V/F2e --
    # unlike the legacy Γ-locality gauge, whose projected energy only
    # ever touched home-cell operator blocks. Diffuse AO tails (e.g.
    # STO-3G Mg 3sp, outermost exponent ≈ 0.046) keep cross-cell
    # overlaps alive far beyond typical kernel-driven cutoffs: on
    # MgO/STO-3G the S(Γ) fold truncation is 3.9e-1 at cutoff 8,
    # 1.5e-2 at 10, 5.0e-3 at 12 and 2.3e-6 at 16 bohr -- and an SCF
    # run on a grossly under-converged fold can descend into spurious
    # states (c8: converged 0.70 Ha below the PySCF reference with the
    # electron count off by 0.43 in the converged metric; 2026-06-10
    # diagnosis). Measure the fold drift against a 1.5x cutoff overlap
    # (cheap one-electron integrals) and warn loudly.
    if exchange_split_active:
        from .pbc_bipole_common import s_fold_truncation_drift

        _s_drift = s_fold_truncation_drift(
            basis,
            system,
            lat_opts_2e,
            k_points=(k_points if n_k > 1 else None),
        )
        _s_label = "S(k) fold truncation (max over mesh)" if n_k > 1 else "S(Γ) fold truncation"
        if _s_drift > 1e-2:
            plog.info(
                f"  WARNING: {_s_label} {_s_drift:.1e} at cutoff "
                f"{lat_opts_2e.cutoff_bohr:.1f} bohr -- the lattice sums are "
                f"badly under-converged for this basis's AO tails; "
                f"absolute energies are UNRELIABLE (spurious SCF states "
                f"possible). Increase lattice_opts.cutoff_bohr until the "
                f"drift falls below 1e-4."
            )
        elif _s_drift > 1e-4:
            plog.info(
                f"  note: {_s_label} {_s_drift:.1e} at cutoff "
                f"{lat_opts_2e.cutoff_bohr:.1f} bohr -- expect "
                f"~{_s_drift:.0e}-scale absolute-energy truncation; "
                f"increase cutoff_bohr for tighter work"
            )
        else:
            plog.info(f"  {_s_label}: {_s_drift:.1e} (converged)")

    # Density cell list. Under the Ewald exchange split the SCF density
    # is stored on a 2x-cutoff list: the C++ JK builder's traversal
    # forms cell-pair differences |b-a| up to 2x the cutoff, and P(h)
    # lookups beyond the density's own list are silently skipped
    # (cpp/src/periodic_fock.cpp ``p_block``). With the non-decaying Γ
    # Bloch fold those dropped alive-overlap J/K terms are
    # SCF-exploitable: MgO/STO-3G c8 converged 0.70 Ha BELOW PySCF with
    # the electron count off by 0.43 in the reference metric
    # (2026-06-10 diagnosis). Traversal cost is unchanged -- the builder
    # derives its quartet cells from lat_opts; the density list only
    # feeds lookups. Operators (S/T/V/F2e blocks) stay on ``cells``;
    # the energy contraction iterates operator cells (see
    # ``_lattice_contract_blocks``).
    if exchange_split_active:
        cells_density = list(
            direct_lattice_cells(system, 2.0 * float(lat_opts_2e.cutoff_bohr))
        )
        plog.info(
            f"  density cell list: {len(cells_density)} cells "
            f"(2x cutoff -- resolves every P(b-a) difference)"
        )
    else:
        cells_density = cells

    # Per-k S(k), Hcore(k), orthogonaliser X(k).
    from .linear_dependence import (
        check_overlap_matrix,
        format_linear_dependence_report,
        raise_if_severe,
        scf_preflight_overlap_check,
    )

    S_k_list: List[np.ndarray] = []
    T_k_list: List[np.ndarray] = []
    V_ne_k_list: List[np.ndarray] = []
    Hcore_k_list: List[np.ndarray] = []
    X_k_list: List[np.ndarray] = []
    overlap_reports = []
    for k_idx, k in enumerate(k_points):
        k_arr = np.asarray(k, dtype=float).reshape(3)
        S_k = np.asarray(bloch_sum(S_lat, k_arr))
        T_k = np.asarray(bloch_sum(T_lat, k_arr))
        V_k = np.asarray(bloch_sum(V_lat, k_arr))
        T_k = 0.5 * (T_k + T_k.conj().T)
        V_k = 0.5 * (V_k + V_k.conj().T)
        H_k = T_k + V_k
        S_k = 0.5 * (S_k + S_k.conj().T)
        H_k = 0.5 * (H_k + H_k.conj().T)
        overlap_label = f"S(k={k_idx}, k_cart={k_arr.round(4).tolist()})"
        if n_k <= 16:
            report = scf_preflight_overlap_check(
                S_k,
                plog=plog,
                label=overlap_label,
                basis=basis,
            )
        else:
            report = check_overlap_matrix(
                S_k,
                basis=basis,
                label=overlap_label,
            )
            if report.severity != "ok":
                prefix = {
                    "warn": "WARN",
                    "error": "ERROR",
                    "critical": "CRITICAL",
                }[report.severity]
                cond_str = (
                    f"{report.condition_number:.2e}"
                    if np.isfinite(report.condition_number)
                    else "+inf"
                )
                plog.info(
                    f"[{prefix}] overlap [{overlap_label}]: "
                    f"nbf={report.n_basis}, "
                    f"min eig={report.min_eigenvalue:+.2e}, "
                    f"cond={cond_str}, severity={report.severity}"
                )
                plog.write_raw(format_linear_dependence_report(report))
            raise_if_severe(report)
        X_k, n_kept = _canonical_orthogonalizer_complex(
            S_k,
            linear_dep_threshold,
            normalize_diag_first=canonical_orth_normalize_diag_first,
        )
        overlap_reports.append(report)
        if n_occ > n_kept:
            raise RuntimeError(
                f"run_pbc_bipole_rhf: canonical orth at k={k_idx} "
                f"dropped too many directions (n_occ={n_occ}, n_kept={n_kept})"
            )
        S_k_list.append(S_k)
        T_k_list.append(T_k)
        V_ne_k_list.append(V_k)
        Hcore_k_list.append(H_k)
        X_k_list.append(X_k)
    if n_k > 16:
        severity_rank = {"ok": 0, "warn": 1, "error": 2, "critical": 3}
        worst = max(
            overlap_reports,
            key=lambda r: severity_rank.get(r.severity, -1),
        )
        min_s = min(float(r.min_eigenvalue) for r in overlap_reports)
        max_cond = max(float(r.condition_number) for r in overlap_reports)
        cond_str = f"{max_cond:.2e}" if np.isfinite(max_cond) else "+inf"
        plog.info(
            f"overlap [k-mesh summary]: n_k={n_k}, nbf={basis.nbasis}, "
            f"min eig={min_s:+.2e}, max cond={cond_str}, "
            f"severity={worst.severity}"
        )

    # ---- Nuclear repulsion per cell --------------------------------------
    if ewald_options_1e is not None:
        e_nuc = float(ewald_nuclear_repulsion(system, ewald_options_1e))
    else:
        e_nuc = float(nuclear_repulsion_per_cell(system, lat_opts_1e))
    plog.info(f"E_nuc per cell ({lat_opts_1e.coulomb_method.name}) = {e_nuc:+.10f} Ha")

    # ---- Initial guess ---------------------------------------------------
    C_per_k: List[np.ndarray] = []
    eps_per_k: List[np.ndarray] = []
    for H_k, X_k in zip(Hcore_k_list, X_k_list):
        C_k, eps_k = _diag_in_orth_basis(H_k, X_k)
        C_per_k.append(C_k.astype(complex))
        eps_per_k.append(eps_k)
    def _rebuild_real_space_density(C_per_k_local):
        """Turn per-k MO coefficients into a real-space density.

        Forwards ``C_per_k_local`` to ``real_space_density_from_kpoints``
        with the same occupied count ``n_occ`` repeated for each of the
        ``n_k`` k-points, together with the enclosing run's ``kmesh`` and
        the ``cells_density`` cell list.
        """
        occupied_per_k = [n_occ] * n_k
        return real_space_density_from_kpoints(
            C_per_k_local, occupied_per_k, kmesh, cells_density
        )

    D_real = _rebuild_real_space_density(C_per_k)
    if not exchange_split_active:
        _zero_cross_cell_density(D_real, basis.nbasis, n_k)

    # Caller-supplied warm-start density takes precedence over both the
    # SAD/Hcore guess engine and the Hcore-diag fallback. The caller is
    # responsible for matching ``initial_density`` blocks against the
    # canonical ``direct_lattice_cells(kmesh)`` ordering (which is what
    # the SCF's ``D_real`` uses). Used by the NEB driver for within-
    # image density warm-start across outer iterations + within FD-
    # gradient displaced SCFs (periodic NEB warm-start milestone
    # follow-up).
    if initial_density is not None:
        blocks_in = list(initial_density)
        if len(blocks_in) != len(D_real.cells):
            raise ValueError(
                f"run_pbc_bipole_rhf: initial_density has {len(blocks_in)} "
                f"blocks; expected {len(D_real.cells)} (one per cell in "
                f"direct_lattice_cells(kmesh))"
            )
        for g_idx, block in enumerate(blocks_in):
            D_real.set_block(g_idx, np.asarray(block, dtype=float))
        plog.info("initial guess: caller-supplied density (warm-start)")
        initial_density_is_local = True
        density_from_c_per_k = False
    else:
        # SAD override (place SAD density at g=0; zeros elsewhere).
        guess = getattr(opts, "initial_guess", InitialGuess.HCORE)
        D_engine = initial_density_closed_shell(
            system.unit_cell_molecule(),
            basis,
            n_occ,
            guess,
            is_periodic=True,
            periodic_system=system,
            lattice_opts=lat_opts_2e,
            # READ restart (Γ-only): prior g=0 cell density (pre-resolved from
            # read_from, or read + projected from read_path). Ignored unless READ.
            read_density=getattr(opts, "read_density", None),
            read_path=getattr(opts, "read_path", ""),
        )
        if D_engine is not None:
            plog.info(f"initial guess: {guess.name} (g=0 density from GuessEngine)")
            for g_idx in range(len(D_real.cells)):
                if (D_real.cells[g_idx].index == np.array([0, 0, 0])).all():
                    D_real.set_block(g_idx, D_engine)
                else:
                    D_real.set_block(g_idx, np.zeros_like(D_engine, dtype=float))
        else:
            plog.info(f"initial guess: {guess.name} (Hcore-diag per k)")
        initial_density_is_local = D_engine is not None
        density_from_c_per_k = not initial_density_is_local

    D_real_prev: Optional[LatticeMatrixSet] = None

    # ---- SCF aids: damping, accelerator family, LEVSHIFT, MOM, ODA ------
    damping = float(opts.damping)
    if not (0.0 <= damping < 1.0):
        raise ValueError(f"run_pbc_bipole_rhf: damping must be in [0,1); got {damping}")

    damper: Optional[DynamicDamping] = None
    if bool(getattr(opts, "dynamic_damping", False)):
        damper = DynamicDamping(
            initial_alpha=damping,
            alpha_min=float(getattr(opts, "dynamic_damping_min", 0.0)),
            alpha_max=float(getattr(opts, "dynamic_damping_max", 0.95)),
        )

    use_diis = bool(opts.use_diis)
    diis_start_iter = int(opts.diis_start_iter)
    accel: Optional[MultiKPeriodicSCFAccelerator] = (
        MultiKPeriodicSCFAccelerator(opts) if use_diis else None
    )

    level_shift_static = float(getattr(opts, "level_shift", 0.0))
    if level_shift_schedule is not None and not isinstance(
        level_shift_schedule,
        LevelShiftSchedule,
    ):
        raise TypeError(
            f"level_shift_schedule must be a LevelShiftSchedule or None; "
            f"got {type(level_shift_schedule).__name__}"
        )
    if level_shift_schedule is not None:
        plog.info(f"level_shift_schedule: {level_shift_schedule.as_list()}")

    # CRYSTAL-style FMIXING: blend previous Fock into current before
    # diagonalisation.  Applied after DIIS, before level-shift.  Same
    # convention as the gamma EWALD_3D and GDF drivers.
    fock_mixing_value = float(getattr(opts, "fock_mixing", 0.0))
    if not (0.0 <= fock_mixing_value < 1.0):
        raise ValueError(
            f"run_pbc_bipole_rhf: fock_mixing must be in [0, 1); "
            f"got {fock_mixing_value}"
        )
    if fock_mixing_value != 0.0:
        plog.info(
            f"fock mixing: CRYSTAL FMIXING "
            f"{100.0 * fock_mixing_value:.1f}% "
            "(previous Fock matrix weight)"
        )

    if use_mom:
        plog.info("MOM (Maximum Overlap Method): ON")
    C_prev_occ_per_k: Optional[List[np.ndarray]] = None

    if use_oda and use_diis:
        raise ValueError(
            "run_pbc_bipole_rhf: use_oda and use_diis are mutually exclusive"
        )
    if use_oda:
        if not (0.0 < oda_trust_lambda_max <= 1.0):
            raise ValueError(
                f"oda_trust_lambda_max must be in (0, 1]; got {oda_trust_lambda_max}"
            )
        plog.info(
            f"ODA (Optimal Damping): ON (+1 Fock build/iter, "
            f"trust l_max = {oda_trust_lambda_max})"
        )

    # ---- Optional: Ewald J-split F^2e build (Phase 5 of BIPOLE branch) ---
    j_lr_cache = v_ne_lr_cache
    if use_ewald_j_split:
        # CRYSTAL-equivalent gauge: V_ne + E_nn use Ewald, F^2e uses
        # J^SR(direct erfc-screened) + J^LR(analytic reciprocal-sum) - 1/2K.
        # Single shared a between V_ne, E_nn, and J_LR (one shared
        # Ewald state).
        #
        # Multi-k J^LR uses Bloch-summed shifted-ν AO-pair FTs and a
        # k-space r̂(K). The operator is materialised as real-space
        # blocks below so both diagonalisation and real-space energy
        # accounting see the same long-range J.
        if system.dim != 3:
            raise ValueError(
                f"use_ewald_j_split requires dim=3 (3D periodic). Got dim={system.dim}."
            )
        if n_k > 1 and _ir_mapping.size == 0:
            # Non-uniform weights without ir_mapping: can't expand.
            uniform_w = 1.0 / float(n_k)
            if not np.allclose(weights, uniform_w, atol=1e-9):
                raise ValueError(
                    "use_ewald_j_split at multi-k requires uniform full-mesh "
                    "weights or an IBZ-reduced Monkhorst-Pack mesh carrying "
                    "ir_mapping metadata so the driver can expand it. "
                    f"Got non-uniform weights = {weights.tolist()}."
                )
        from .bipole_fock_ewald import (
            _build_j_long_range_cache,
            compute_J_long_range_real_space_blocks,
            compute_rho_hat_from_k_density,
        )

        assert omega_used is not None
        plog.info(
            f"Ewald J-split F^2e: ON (CRYSTAL-equivalent gauge); "
            f"w = {omega_used:.4f} bohr⁻¹, precision = {ewald_precision:.0e}"
        )
        # Pre-build the shifted-ν FT cache once -- invariant across SCF
        # iters + k-points within an iter.  Γ-only needs the same cache
        # for real-space energy blocks even though the Fock can be built
        # from the k=0 folded matrix.
        cells_r_cart_arr = np.array(
            [np.asarray(c.r_cart, dtype=float) for c in cells],
            dtype=float,
        )
        if j_lr_cache is None:
            j_lr_cache = _build_j_long_range_cache(
                basis,
                system,
                cells_r_cart_arr,
                omega_used,
                ewald_precision,
                K_max=ewald_k_max,
            )
        elif j_lr_cache.ft_per_cell.shape[0] != len(cells):
            raise RuntimeError(
                "prebuilt V_ne/J^LR cache has a different cell count "
                f"({j_lr_cache.ft_per_cell.shape[0]}) from S_lat "
                f"({len(cells)})"
            )
        plog.info(
            f"  J^LR cache: {j_lr_cache.K_vectors.shape[0]} K-vectors, "
            f"{j_lr_cache.ft_per_cell.shape[0]} lattice cells"
        )

    # Multi-k Ewald-exchange-split: per-(k,k′) q-channel tables for the
    # LR exchange (option (b) Phase 3). Shares the J^LR cache's Ewald w
    # and K_max envelope; the q == 0 diagonal channel reuses the J^LR
    # fold tensors outright.
    x_lr_cache = None
    if exchange_split_active and n_k > 1:
        from .bipole_fock_ewald import build_k_exchange_long_range_cache

        assert j_lr_cache is not None and ewald_k_max is not None
        x_lr_cache = build_k_exchange_long_range_cache(
            basis,
            system,
            j_lr_cache,
            K_max=ewald_k_max,
        )
        plog.info(
            f"  K^LR q-channels: {n_k} distinct q = k-k′ shifts on the "
            f"shared K_max = {ewald_k_max:.2f} bohr⁻¹ envelope"
        )

    def _split_k_density_list(density: LatticeMatrixSet) -> List[np.ndarray]:
        """Return one density matrix per k for the Ewald-exchange-split paths.

        Two routes, selected by the number of k-points of the run:

        * multi-k (``n_k != 1``): delegate to
          ``bvk_torus_density_matrices`` with the run's ``k_points`` and
          ``_bvk_mesh`` (which must be set) -- the BvK-torus fold that
          inverts the Bloch transform exactly.
        * Γ-only (``n_k == 1``): a single-entry list holding the
          home-cell block of ``density`` cast to complex -- the BvK
          representative at Γ.

        This is meant to be exact for every density representation the
        SCF loop produces (orbital rebuilds, SAD/PATOM local guesses,
        caller warm-starts, damped and ODA-mixed densities).
        """
        if n_k != 1:
            assert _bvk_mesh is not None
            return bvk_torus_density_matrices(density, k_points, _bvk_mesh)
        gamma_block = home_cell_block(density)
        return [gamma_block.astype(complex)]

    # Incremental/differential J_SR+K_SR accumulator (opt-in). Active
    # only in the corrected gauge (where J_SR+K_SR is one fused erfc
    # traversal -- the linear-in-density piece the accumulator telescopes)
    # and with DIIS (ODA's interleaved naive build would break the per-
    # iter ΔD chain). The single direct traversal is ~99% of the
    # corrected-gauge Fock-build wall (2026-06-14 profile).
    incremental_jk = None
    if use_incremental_fock:
        if exchange_split_active and not use_oda:
            from .bipole_fock_ewald import IncrementalJK

            incremental_jk = IncrementalJK()
            plog.info(
                "  incremental Fock (differential J_SR/K_SR via ΔD "
                "density-envelope screening): ON"
            )
        else:
            plog.info(
                "  incremental Fock requested but inactive "
                "(needs the corrected gauge + DIIS, not ODA)"
            )

    def _build_fock_for_density(
        density: LatticeMatrixSet,
        *,
        coeffs_for_rho: Optional[Sequence[np.ndarray]],
        use_incremental: bool = True,
    ) -> _PBCBipoleFockBuild:
        """Assemble F^2e(g) and the per-k Fock matrices for ``density``.

        The real-space two-electron build is delegated to the shared
        restricted BIPOLE builder
        (``pbc_bipole_fock.build_bipole_restricted_fock``) with
        ``alpha_hf=1.0``; gauge invariants and the K_corr derivation are
        documented there. What remains local is the per-k assembly:
        Bloch-sum the F^2e blocks, add Hcore(k), subtract half of the
        k-space exchange correction when the builder returns one, and
        Hermitise.

        Parameters
        ----------
        density
            Real-space density handed to the shared builder.
        coeffs_for_rho
            Per-k orbital coefficients; pass them only when ``density``
            is exactly represented by the current per-k orbitals,
            otherwise ``None``.
        use_incremental
            ``False`` forces a full J_SR/K_SR build even if the
            incremental accumulator is active -- for ODA's extra naive
            build and the post-convergence rebuild, which sit outside
            the per-iteration ΔD chain.

        Returns
        -------
        _PBCBipoleFockBuild
            The real-space F^2e, the list of Hermitised F(k), and the
            energy pieces reported by the shared builder.
        """
        shared = build_bipole_restricted_fock(
            _fock_ctx,
            density,
            coeffs_for_rho=coeffs_for_rho,
            alpha_hf=1.0,
            use_incremental=use_incremental,
        )
        two_electron_real = shared.f2e_real
        exchange_corr_per_k = shared.k_corr_per_k

        fock_per_k: List[np.ndarray] = []
        for k_idx, k_vec in enumerate(k_points):
            k_cart = np.asarray(k_vec, dtype=float)
            two_electron_k = _bloch_sum_blocks(
                two_electron_real.blocks,
                two_electron_real.cells,
                k_cart,
            )
            core_k = np.asarray(Hcore_k_list[k_idx], dtype=complex)
            fock_k = two_electron_k + core_k
            if exchange_corr_per_k is not None:
                # Under the Ewald exchange split the K_LR and
                # G=0/Madelung terms exist only in k-space: one matrix
                # per k-point (just the Γ entry when n_k = 1).
                fock_k = fock_k - 0.5 * exchange_corr_per_k[k_idx]
            # Store the Hermitian part of the assembled matrix.
            fock_per_k.append(0.5 * (fock_k + fock_k.conj().T))

        return _PBCBipoleFockBuild(
            f2e_real=two_electron_real,
            f_k_list=fock_per_k,
            e_j_short_range=shared.e_j_short_range,
            e_j_long_range=shared.e_j_long_range,
            e_exchange=shared.e_exchange,
            e_j_multipole=shared.e_j_multipole,
            e_2e_k_correction=shared.e_2e_k_correction,
        )


    # ---- Multipole far-field config (resolve once before SCF loop) -----
    from .bipole_fock_multipole import (  # noqa: E402
        BipoleMultipoleConfig,
        resolve_multipole_config,
    )

    if exchange_split_active and use_multipole_far_field:
        raise NotImplementedError(
            "run_pbc_bipole_rhf: the multipole far-field J replacement has "
            "not been re-validated under the Ewald exchange split (wide "
            "density list + split-K gauge). Pass "
            "use_exchange_ewald_split=False to combine it with the legacy "
            "gauge, or omit use_multipole_far_field."
        )
    _mp_config = resolve_multipole_config(
        system,
        basis,
        lat_opts_2e,
        user_enable=(False if exchange_split_active else use_multipole_far_field),
        multipole_l_max=multipole_l_max,
    )
    if _mp_config.enabled:
        plog.info(
            f"  BIPOLE multipole far-field: ENABLED  "
            f"(L_max={_mp_config.L_max}, R_bipole={_mp_config.R_bipole:.1f} bohr, "
            f"n_cells={len(_mp_config.cache.cells) if _mp_config.cache else 0})"
        )
    else:
        plog.info(
            f"  BIPOLE multipole far-field: off  "
            f"(R_bipole={_mp_config.R_bipole:.1f} bohr, "
            f"cutoff={lat_opts_2e.cutoff_bohr:.1f} bohr)"
        )

    # ---- SCF loop --------------------------------------------------------

    # SYM3b Fock symmetry enforcement is OPT-IN ONLY -- rationale
    # (boundary truncation asymmetry) lives on the shared resolver.
    _fock_sym_map, _rep_cell_indices = resolve_bipole_fock_symmetry(
        system,
        basis,
        lat_opts_2e,
        use_fock_symmetry,
        use_fock_symmetry_reduce,
        plog,
    )
    # ---- Shared restricted Fock-build context (M2 unification) ----------
    # Bundle the per-run invariants the inline Fock build used to capture,
    # so the heavy J^SR/J^LR/K assembly lives once in pbc_bipole_fock.
    # _build_fock_for_density (above) references this via late binding --
    # it is only ever called from the SCF loop below, after this point.
    _fock_ctx = BipoleFockContext(
        basis=basis,
        system=system,
        lat_opts_2e=lat_opts_2e,
        use_ewald_j_split=use_ewald_j_split,
        exchange_split_active=exchange_split_active,
        n_k=n_k,
        omega_used=omega_used,
        ewald_precision=ewald_precision,
        ewald_cell_volume=ewald_cell_volume,
        n_elec=n_elec,
        xi_madelung=_xi_madelung,
        j_lr_cache=j_lr_cache,
        x_lr_cache=x_lr_cache,
        incremental_jk=incremental_jk,
        rep_cell_indices=_rep_cell_indices,
        fock_sym_map=_fock_sym_map,
        mp_config=_mp_config,
        s_lat=S_lat,
        s_k_list=S_k_list,
        k_points=k_points,
        weights=weights,
        k_points_full=k_points_full,
        weights_full=weights_full,
        ir_mapping=_ir_mapping,
        bvk_mesh=_bvk_mesh,
        n_occ=n_occ,
        plog=plog,
        sr_image_extent=sr_image_extent_bohr,
    )

    plog.banner("SCF (PBC BIPOLE, direct-space)")
    plog.info("  iter         energy (Ha)            dE          ||[F,DS]||   DIIS")

    # ---- DFT+U setup (closed-shell BIPOLE +U) ----------------------------
    # Same per-spin per-k convention as run_pbc_bipole_uhf -- for closed-
    # shell we use P_s = P_total/2 and the spin sum doubles E_s.
    dft_plus_u_sites_cxx: List = []
    dft_plus_u_ao_groups: List[List[int]] = []
    if dft_plus_u:
        from ._vibeqc_core import _HubbardSiteCxx
        from .dft_plus_u import ao_group_indices

        ao_groups_map = ao_group_indices(basis)
        for site in dft_plus_u:
            key = (site.atom_index, site.l)
            if key not in ao_groups_map:
                raise ValueError(
                    f"run_pbc_bipole_rhf: HubbardSite{key} has no AOs "
                    f"in the basis. Available channels: "
                    f"{sorted(ao_groups_map.keys())}"
                )
            dft_plus_u_sites_cxx.append(
                _HubbardSiteCxx(site.atom_index, site.l, site.U_eff_hartree)
            )
            dft_plus_u_ao_groups.append(ao_groups_map[key])

    scf_trace: List[SCFIteration] = []
    energy_components: List[PBCBipoleEnergyComponents] = []
    E_prev = 0.0
    e_dft_plus_u = 0.0
    F_k_list: List[np.ndarray] = [np.zeros_like(H) for H in Hcore_k_list]
    F_k_prev_mixed: Optional[List[np.ndarray]] = None  # for fock_mixing
    E_elec = 0.0
    converged = False
    iter_idx = 0

    for iter_idx in range(1, int(opts.max_iter) + 1):
        if damper is not None:
            damping = damper.alpha
        diis_active = use_diis and iter_idx >= diis_start_iter
        E_j_short_range: Optional[float] = None
        E_j_long_range: Optional[float] = None
        E_exchange: Optional[float] = None
        E_j_multipole: Optional[float] = None

        # Damping (skip when DIIS active).
        D_used = D_real
        if iter_idx > 1 and damping > 0.0 and not diis_active:
            D_used = _damp_lattice_matrix(D_real, D_real_prev, damping)

        # --- F^{2e}(g) build.
        # Use the k-space r̂(K) route only when the real-space density
        # is exactly represented by C_per_k. Local SAD, fixed damping,
        # and ODA-mixed densities are real-space densities; for those,
        # J^LR must be built from the actual density blocks to avoid
        # using stale orbitals in the reciprocal-space piece.
        d_used_is_damped = iter_idx > 1 and damping > 0.0 and not diis_active
        d_used_from_coeffs = (
            density_from_c_per_k
            and not (initial_density_is_local and iter_idx == 1)
            and not d_used_is_damped
        )
        fock_build = _build_fock_for_density(
            D_used,
            coeffs_for_rho=(C_per_k if d_used_from_coeffs else None),
        )
        F2e_real = fock_build.f2e_real
        # (SYM3b Fock symmetrization happens inside _build_fock_for_density,
        # before the Bloch sum, so f2e_real and f_k_list are consistent.)
        F_k_list = fock_build.f_k_list
        E_j_short_range = fock_build.e_j_short_range
        E_j_long_range = fock_build.e_j_long_range
        E_exchange = fock_build.e_exchange
        E_j_multipole = fock_build.e_j_multipole

        # ---- DFT+U: per-spin per-k Fock contribution (closed-shell).
        # n_s = sum_k w_k Re[(S(k) P_s(k) S(k))_(A,l)] with P_s = P_total/2;
        # V_AO_s = U_eff (1/2 - n_s); per-k Fock += S(k) V_AO_s S(k).
        # E_total_U = 2 x E_s (spin sum).
        e_dft_plus_u = 0.0
        if dft_plus_u_sites_cxx:
            from ._vibeqc_core import (
                _compute_dft_plus_u_multi_k_per_spin_cxx,
            )

            P_split_k_for_u: Optional[List[np.ndarray]] = None
            if exchange_split_active:
                # Unprojected Bloch fold: summing over g across the full
                # cell list overcounts (the stored density is the
                # BvK-periodic extension) -- read the BvK representative
                # instead (home block at Γ; exact torus fold at multi-k).
                P_split_k_for_u = _split_k_density_list(D_used)
            P_sigma_k_for_U: List[np.ndarray] = []
            for k_idx in range(n_k):
                if P_split_k_for_u is not None:
                    P_k = P_split_k_for_u[k_idx]
                else:
                    k_arr = np.asarray(k_points[k_idx], dtype=float)
                    P_k = _bloch_sum_blocks(
                        D_used.blocks,
                        D_used.cells,
                        k_arr,
                    )
                # Closed-shell: per-spin density = P_total / 2.
                P_sigma = 0.25 * (P_k + P_k.conj().T)
                P_sigma_k_for_U.append(P_sigma)
            E_sigma, V_AO = _compute_dft_plus_u_multi_k_per_spin_cxx(
                dft_plus_u_sites_cxx,
                dft_plus_u_ao_groups,
                S_k_list,
                P_sigma_k_for_U,
                list(weights),
            )
            e_dft_plus_u = 2.0 * float(E_sigma)
            V_AO_cmplx = np.asarray(V_AO, dtype=complex)
            for k_idx in range(n_k):
                S_k = S_k_list[k_idx]
                F_k_list[k_idx] = F_k_list[k_idx] + (S_k @ V_AO_cmplx @ S_k)
                F_k_list[k_idx] = 0.5 * (
                    F_k_list[k_idx] + F_k_list[k_idx].conj().T
                )

        # --- Per-cell electronic energy + [F,DS] error vectors.
        #
        # CRYSTAL's energy path contracts the real-space density against
        # real-space operator blocks:
        #   E = sum_g tr[D(g)H(g)] + 1/2 sum_g tr[D(g)F^2e(g)].
        # This is essential at CYC0, where SAD is localised at g=0 and
        # Γ-folding T/V would incorrectly add cross-cell one-electron
        # blocks. k-space D(k) is still needed for error vectors,
        # level-shift projection, and the J^LR split path.
        E_kin = _lattice_contract(D_used, T_lat, operator_name="T")
        E_ne = _lattice_contract(D_used, V_lat, operator_name="V_ne")
        E_2e = (
            0.5
            * _lattice_contract(
                D_used,
                F2e_real,
                operator_name="F2e",
            )
            # k-space exchange correction (Ewald exchange split): the
            # K_LR + G=0/Madelung pieces live in F(k), not f2e_real.
            + fock_build.e_2e_k_correction
        )
        E_elec = E_kin + E_ne + E_2e
        grad_norm_sum = 0.0
        error_k_list: List[np.ndarray] = []
        D_k_list: List[np.ndarray] = []
        D_k_split_guess: Optional[List[np.ndarray]] = None
        if exchange_split_active and initial_density_is_local and iter_idx == 1:
            # Caller warm-starts may carry the full Bloch fold (D at
            # every cell) -- S_g over the cutoff list would overcount;
            # read the BvK representative instead (home block at Γ,
            # exact torus fold at multi-k).
            D_k_split_guess = _split_k_density_list(D_used)
        for idx in range(n_k):
            if initial_density_is_local and iter_idx == 1:
                # SAD/PATOM-style local guesses are stored explicitly as
                # D(g=0)=D_atom_sum and D(g!=0)=0. Their Bloch sum is the
                # same D at every k; using the Hcore-diag C(k) seed here
                # would make the energy/error vector inconsistent with
                # the Fock matrix that was just built from SAD.
                if D_k_split_guess is not None:
                    D_k = D_k_split_guess[idx]
                else:
                    k_arr = np.asarray(k_points[idx], dtype=float)
                    D_k = _bloch_sum_blocks(D_used.blocks, D_used.cells, k_arr)
                D_k = 0.5 * (D_k + D_k.conj().T)
            else:
                # Multi-k (or legacy): D_k from previous iter's C.
                C_k = C_per_k[idx]
                C_occ = C_k[:, :n_occ]
                D_k = 2.0 * (C_occ @ C_occ.conj().T)
            D_k_list.append(D_k)
            H_k = Hcore_k_list[idx]
            F_k = F_k_list[idx]
            w = float(weights[idx])
            S_k = S_k_list[idx]
            FDS = F_k @ D_k @ S_k
            grad = FDS - FDS.conj().T
            error_k_list.append(grad)
            grad_norm_sum += w * float(np.linalg.norm(grad))

        E_total = float(E_elec) + e_nuc + e_dft_plus_u

        # EXT EL-SPHEROPOLE -- CRYSTAL's K=0 Ewald reciprocal-space
        # limit term, added to energy only (not the Fock matrix). It is a
        # 3D-Ewald-gauge correction and is identically zero in the direct
        # (non-Ewald) gauge used for dim<3, so it is absent there.
        # Under the Ewald exchange split it is omitted: at the corrected
        # gauge (full Bloch density + split exchange + v_bg.S) the total
        # already matches the reference assembly and the spheropole would
        # be a double-count (MgO Γ fixed-density audit, 2026-06-10).
        if system.dim == 3 and not exchange_split_active:
            E_sphero = compute_ext_el_spheropole(D_used, basis, system, lat_opts)
            E_total += E_sphero
        else:
            E_sphero = None

        dE = E_total - E_prev if iter_idx > 1 else 0.0

        check_scf_divergence(
            "run_pbc_bipole_rhf",
            iter_idx,
            E_total,
            grad_norm_sum,
            dE,
        )
        scf_trace.append(
            SCFIteration(
                iter=iter_idx,
                energy=float(E_total),
                delta_e=float(dE if iter_idx > 1 else 0.0),
                grad_norm=float(grad_norm_sum),
                diis_subspace=(accel.subspace_size if accel is not None else 0),
            )
        )
        plog.iteration(
            iter_idx,
            energy=float(E_total),
            dE=float(dE if iter_idx > 1 else 0.0),
            grad=float(grad_norm_sum),
            diis=(accel.subspace_size if accel is not None else 0),
        )
        energy_components.append(
            PBCBipoleEnergyComponents(
                iter=int(iter_idx),
                e_total=float(E_total),
                e_electronic=float(E_elec),
                e_kinetic=float(E_kin),
                e_nuclear_attraction=float(E_ne),
                e_two_electron=float(E_2e),
                e_nuclear_repulsion=float(e_nuc),
                e_bielet_zone_ee=(None if use_ewald_j_split else float(E_2e)),
                e_ext_el_spheropole=E_sphero,
                e_j_short_range=E_j_short_range,
                e_j_long_range=E_j_long_range,
                e_exchange=E_exchange,
                e_j_multipole=E_j_multipole,
            )
        )
        plog.energy_decomposition(
            iter_idx,
            E_kin=float(E_kin),
            E_ne=float(E_ne),
            E_2e=float(E_2e),
            E_elec=float(E_elec),
            E_nuc=float(e_nuc),
        )

        # ---- Multipole far-field diagnostics (if enabled) -----------
        if use_multipole_diag and system.dim == 3 and not exchange_split_active:
            from .bipole_fock_multipole import (
                build_j_far_field_multipole,
                estimate_bipole_radius,
            )

            try:
                R_bipole = estimate_bipole_radius(
                    system,
                    basis,
                    L_max=multipole_l_max,
                )
                far_j = build_j_far_field_multipole(
                    D_used,
                    basis,
                    system,
                    lat_opts_2e,
                    L_max=multipole_l_max,
                    R_bipole=R_bipole,
                    cache=_mp_config.cache if _mp_config.enabled else None,
                )
                plog.info(
                    f"  BIPOLE far-field (L_max={multipole_l_max}, "
                    f"R_bipole={R_bipole:.1f} bohr): "
                    f"E_J_far = {far_j.e_j_far:+.6f} Ha, "
                    f"n_pairs = {far_j.n_cell_pairs}"
                )
            except Exception as exc:
                plog.info(
                    f"  BIPOLE far-field diagnostic failed: {type(exc).__name__}: {exc}"
                )

        converged = (
            iter_idx > 1
            and abs(dE) < float(opts.conv_tol_energy)
            and grad_norm_sum < float(opts.conv_tol_grad)
        )

        # --- SCF-accelerator extrapolation. The full
        # {DIIS, KDIIS, EDIIS, EDIIS_DIIS, ADIIS} family + dynamic_damping
        # is wired on the multi-k BIPOLE path: DIIS / KDIIS run natively
        # per-k (Pulay / orbital-rotation-gradient designs from M2c);
        # EDIIS / ADIIS / EDIIS_DIIS bridge through the stacked-real-block
        # representation landed in M2e (see
        # ``per_k_to_stacked_real_blocks`` in
        # ``periodic_scf_accelerators.py``).
        if accel is not None:
            if exchange_split_active:
                # Unprojected Bloch fold: S_g overcounts (see the +U
                # fold above) -- BvK representative per k instead.
                density_k_list = _split_k_density_list(D_used)
            else:
                density_k_list = [
                    _bloch_sum_blocks(D_used.blocks, D_used.cells, np.asarray(k))
                    for k in k_points
                ]
            F_ex_list = accel.extrapolate_rhf(
                F_k_list,
                error_k_list=error_k_list,
                density_k_list=density_k_list,
                energy=E_total,
                mo_coeffs_k_list=C_per_k,
                n_occ=n_occ,
                weights=list(weights),
                cells=cells,
                kpoints=list(k_points),
            )
            # On the converged iteration, diagonalise the *physical* Fock
            # F(D_used) -- not the extrapolated one. At a true fixed point
            # F(D) commutes with D, so diagonalising the bare Fock
            # reproduces the converged density exactly and yields canonical
            # orbitals. Diagonalising an extrapolated Fock here can move the
            # solution off the fixed point: when the SCF lands essentially
            # on the solution in one step, the DIIS error history collapses
            # to machine-zero, the Pulay B-matrix goes singular, and the
            # degenerate solve returns large ±coefficients whose Fock
            # combination is numerical garbage. Its Aufbau diagonalisation
            # can occupy the wrong orbital -- the H₂/STO-3G RHF + [2,1,1]
            # spurious basins at bz≈1.449 (-0.347 Ha) and bz≈1.399
            # (-0.528 Ha); smooth -1.728 elsewhere; PySCF KRHF confirms a
            # single smooth solution. See
            # tests/test_pbc_bipole_diis_converged_basin.py.
            if diis_active and not converged:
                F_k_list = F_ex_list

        # --- FMIXING (CRYSTAL-style, after DIIS, before level-shift)
        # Skipped on the converged iteration (see DIIS note above): the
        # final diagonalisation must see the physical converged Fock.
        if fock_mixing_value != 0.0 and not converged:
            if F_k_prev_mixed is not None:
                F_mixed_list: List[np.ndarray] = []
                for idx in range(n_k):
                    F_mixed = (1.0 - fock_mixing_value) * F_k_list[
                        idx
                    ] + fock_mixing_value * F_k_prev_mixed[idx]
                    F_mixed = 0.5 * (F_mixed + F_mixed.conj().T)
                    F_mixed_list.append(F_mixed)
                F_k_list = F_mixed_list
            F_k_prev_mixed = [np.asarray(F, dtype=complex).copy() for F in F_k_list]

        # --- LEVSHIFT (per-iter schedule or static)
        # Skipped on the converged iteration (see DIIS note above).
        if level_shift_schedule is not None:
            level_shift_b = level_shift_schedule.at(iter_idx)
        else:
            level_shift_b = level_shift_static
        if level_shift_b != 0.0 and not converged:
            F_for_diag: List[np.ndarray] = []
            for idx in range(n_k):
                D_k = D_k_list[idx]
                S_k = S_k_list[idx]
                F_shift = (
                    F_k_list[idx]
                    + level_shift_b * S_k
                    - (level_shift_b / 2.0) * (S_k @ D_k @ S_k)
                )
                F_shift = 0.5 * (F_shift + F_shift.conj().T)
                F_for_diag.append(F_shift)
        else:
            F_for_diag = F_k_list

        # --- Diagonalise F(k) -> new C(k), e(k)
        new_C_per_k = []
        new_eps_per_k = []
        for idx in range(n_k):
            C_k, eps_k = _diag_in_orth_basis(F_for_diag[idx], X_k_list[idx])
            new_C_per_k.append(C_k)
            new_eps_per_k.append(eps_k)

        # --- MOM reorder (iter >= 2 only; falls through to Aufbau at iter 1)
        if use_mom and C_prev_occ_per_k is not None:
            for idx in range(n_k):
                C_k = new_C_per_k[idx]
                eps_k = new_eps_per_k[idx]
                S_k = S_k_list[idx]
                sel = _mom_select(
                    C_k,
                    S_k,
                    C_prev_occ_per_k[idx],
                    n_occ,
                    eps_new=eps_k,
                )
                n_kept_idx = C_k.shape[1]
                virt_mask = np.ones(n_kept_idx, dtype=bool)
                virt_mask[sel] = False
                virt_sel = np.where(virt_mask)[0]
                virt_sel = virt_sel[np.argsort(np.real(eps_k[virt_sel]))]
                order = np.concatenate([sel, virt_sel])
                new_C_per_k[idx] = C_k[:, order]
                new_eps_per_k[idx] = eps_k[order]

        C_per_k = new_C_per_k
        eps_per_k = new_eps_per_k

        # --- Rebuild D_real.  At Γ-only, real_space_density_from_kpoints
        # produces P(g)=P(Γ) at every cell -- the correct Bloch fold. With
        # the Ewald exchange split that is exactly the density the
        # builders need (the erfc-screened K_SR sum is absolutely
        # convergent with non-decaying P). On the legacy path the
        # Γ-locality projection P(g!=0)=0 is kept: its full-Coulomb K
        # series would diverge with the unprojected fold.
        D_real_new = _rebuild_real_space_density(C_per_k)
        if not exchange_split_active:
            _zero_cross_cell_density(D_real_new, basis.nbasis, n_k)

        # --- ODA mixing (extra Fock build)
        if use_oda:
            fock_naive = _build_fock_for_density(
                D_real_new,
                coeffs_for_rho=C_per_k,
                use_incremental=False,  # off the per-iter ΔD chain
            )
            oda_step = _compute_oda_lambda(
                D_used,
                D_real_new,
                F_k_list,
                fock_naive.f_k_list,
                [np.asarray(k) for k in k_points],
                weights,
                trust_lambda_max=oda_trust_lambda_max,
            )
            _oda_mix(D_used, D_real_new, oda_step.lam)
            D_real_prev = D_real
            D_real = D_used
            density_from_c_per_k = oda_step.lam == 1.0
            plog.info(
                f"  ODA: l = {oda_step.lam:.4f} "
                f"(g0 = {oda_step.g0:+.3e}, g1 = {oda_step.g1:+.3e})"
            )
        else:
            D_real_prev = D_used
            D_real = D_real_new
            density_from_c_per_k = True

        # Snapshot for next iter's MOM
        if use_mom:
            C_prev_occ_per_k = [
                np.asarray(C_per_k[idx][:, :n_occ]).copy() for idx in range(n_k)
            ]

        if damper is not None:
            damper.update(E_total)
        E_prev = E_total
        if converged:
            break

    plog.converged(n_iter=iter_idx, energy=E_total, converged=converged)

    # ---- Post-loop: recompute energy on final density for consistency
    if converged:
        _fb = _build_fock_for_density(
            D_real, coeffs_for_rho=C_per_k, use_incremental=False
        )
        E_kin_final = _lattice_contract(D_real, T_lat, operator_name="T")
        E_ne_final = _lattice_contract(D_real, V_lat, operator_name="V_ne")
        E_2e_final = (
            0.5
            * _lattice_contract(
                D_real,
                _fb.f2e_real,
                operator_name="F2e",
            )
            + _fb.e_2e_k_correction
        )
        E_elec = E_kin_final + E_ne_final + E_2e_final
        E_total = float(E_elec) + e_nuc + e_dft_plus_u
        # Fresh E_total doesn't include spheropole -- add it (3D only; the
        # term is zero in the direct gauge used for dim<3, and omitted
        # under the Ewald exchange split -- see the SCF-loop note).
        if system.dim == 3 and not exchange_split_active:
            E_sphero_final = compute_ext_el_spheropole(D_real, basis, system, lat_opts)
            E_total += E_sphero_final
        else:
            E_sphero_final = None
    else:
        # Non-converged: E_total already includes spheropole from the
        # last SCF iteration.  Store it for the result.
        E_sphero_final = energy_components[-1].e_ext_el_spheropole

    return PBCBipoleRHFResult(
        energy=float(E_total),
        e_electronic=float(E_elec),
        e_nuclear=e_nuc,
        n_iter=iter_idx,
        converged=converged,
        mo_energies=eps_per_k,
        mo_coeffs=C_per_k,
        fock=F_k_list,
        overlap=S_k_list,
        hcore=Hcore_k_list,
        density=D_real,
        e_ext_el_spheropole=E_sphero_final,
        scf_trace=scf_trace,
        ewald_alpha_bohr_inv=omega_used,
        e_dft_plus_u=float(e_dft_plus_u),
        energy_components=energy_components,
        exchange_ewald_split=bool(exchange_split_active),
        exchange_exxdiv=(exchange_exxdiv if exchange_split_active else None),
        fock_mixing=fock_mixing_value,
        kpoints_cart=np.asarray(k_points, dtype=float).reshape(-1, 3),
        kpoint_weights=np.asarray(weights, dtype=float).reshape(-1),
    )