Source code for vibeqc.io.orca_hess

"""ORCA ``.hess`` ASCII writer for vibe-qc :class:`HessianResult` objects.

Phase M1. The ``.hess`` format is ORCA's documented ASCII Hessian
container, consumed natively by:

  - **moltui** (``moltui out.hess`` for animated normal-mode display);
  - **chemcraft**, **avogadro**, **gabedit** (vibrational-mode
    visualization);
  - **VMD** (via the molefacture / nmwiz plugin).

This is the cleanest interchange path between vibe-qc and the
visualization ecosystem because ``.hess`` is fully ASCII (no
proprietary binary structure like ``.gbw``) and version-stable.

Format reference: extracted from a real ORCA 6.1.1 NumFreq run on
H₂O / HF / STO-3G. The 14 sections ORCA writes, in order:

  1. ``$orca_hessian_file`` — header
  2. ``$act_atom`` — atom index for partial Hessians (``0`` for full)
  3. ``$act_coord`` — coordinate index (``0`` for full)
  4. ``$act_energy`` — total electronic energy in Ha
  5. ``$multiplicity``
  6. ``$hessian`` — (3N, 3N) Cartesian Hessian in Ha/bohr², printed in
     5-columns-per-block layout with row labels
  7. ``$vibrational_frequencies`` — 3N freqs in cm⁻¹ (zero modes 0.0,
     imaginary modes negative)
  8. ``$normal_modes`` — (3N, 3N) mass-weighted normal-mode matrix
     (each *column* is a mode), 5-columns-per-block layout
  9. ``$atoms`` — N atoms with ``label mass(amu) x y z`` in bohr
 10. ``$actual_temperature`` — finite-T data temperature; 0.0 by default
 11. ``$frequency_scale_factor`` — ORCA's harmonic-to-empirical scale
 12. ``$dipole_derivatives`` — (3N, 3) ∂μ/∂R tensor in atomic units;
     **only written when present on the HessianResult**
 13. ``$ir_spectrum`` — N rows of ``wavenumber eps Int TX TY TZ``;
     **only written when dipole derivatives are present**
 14. ``$end``

When ``HessianResult.dipole_derivatives is None`` the writer emits a
zeroed ``$dipole_derivatives`` block + a zeroed ``$ir_spectrum`` so
the file remains readable by tools that scan for those sections; the
mode-amplitude information (which moltui actually uses) is unaffected.
"""

from __future__ import annotations

import os
from typing import Optional, Union

import numpy as np

from .._vibeqc_core import Atom, Molecule
from ..hessian import HessianResult, ir_intensities


__all__ = ["write_orca_hess"]


# ----------------------------------------------------------------------
# Element symbols — covers Z = 1..118
# ----------------------------------------------------------------------

_ELEMENT_SYMBOLS = (
    "H",  "He", "Li", "Be", "B",  "C",  "N",  "O",  "F",  "Ne",
    "Na", "Mg", "Al", "Si", "P",  "S",  "Cl", "Ar", "K",  "Ca",
    "Sc", "Ti", "V",  "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
    "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y",  "Zr",
    "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn",
    "Sb", "Te", "I",  "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd",
    "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb",
    "Lu", "Hf", "Ta", "W",  "Re", "Os", "Ir", "Pt", "Au", "Hg",
    "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th",
    "Pa", "U",  "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm",
    "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds",
    "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og",
)


def _element_symbol(z: int) -> str:
    if 1 <= z <= len(_ELEMENT_SYMBOLS):
        return _ELEMENT_SYMBOLS[z - 1]
    return f"Z{z}"


# ----------------------------------------------------------------------
# Block-printing helpers — match ORCA's exact layout so character-by-
# character diff against an ORCA-produced file passes.
# ----------------------------------------------------------------------

def _write_block_matrix(fh, M: np.ndarray, *, cols_per_block: int = 5) -> None:
    """Write a 2-D matrix in ORCA's ``$hessian`` / ``$normal_modes``
    layout — split columns into chunks of ``cols_per_block``, each
    chunk preceded by a header row of column indices, followed by
    rows labeled by row index.

    Layout (matches ORCA 6.x exactly so the file is a drop-in for any
    consumer that does positional parsing rather than tokenising on
    whitespace):

      - Row label: ``%5d``
      - First column number: ``%22.10E`` (6-space-leading for positive,
        5-space + sign for negative) — extra wide so the row label
        and the first datum read clearly.
      - Subsequent column numbers: ``%19.10E`` (3-space-leading for
        positive, 2-space + sign for negative).

    Header row mirrors the column-number layout: 5-space row-label gap,
    then per-column ``%22d`` for the first column and ``%19d`` for the
    rest, with 8 trailing spaces (ORCA's habit).
    """
    n_rows, n_cols = M.shape
    for c0 in range(0, n_cols, cols_per_block):
        c1 = min(c0 + cols_per_block, n_cols)
        # Header line follows ORCA's Fortran format:
        #   (18X, I3, 4(16X, I3), 8X)
        # i.e. 18 leading spaces, col index in 3 chars, then 16-space
        # gap + 3-char col index per subsequent column, with 8 trailing
        # spaces. Total = 18 + 3 + 4*(16+3) + 8 = 105 chars for a
        # 5-wide block.
        header = " " * 18
        for j, c in enumerate(range(c0, c1)):
            if j > 0:
                header += " " * 16
            header += f"{c:>3d}"
        header += " " * 8
        fh.write(header + "\n")
        # Data lines:
        #   (I5, 6X, ES16.10, 4(3X, ES16.10))
        # i.e. 5-wide row label, 6 spaces, first 16-char number, then
        # 3-space gap + 16-char number per subsequent column. The
        # %22.10E and %19.10E formats produce identical output for
        # positive numbers; for negatives the sign consumes 1 of the
        # leading spaces (5 + 16 instead of 6 + 16).
        for r in range(n_rows):
            line = f"{r:>5d}"
            for j, c in enumerate(range(c0, c1)):
                width = 22 if j == 0 else 19
                line += f"{M[r, c]:>{width}.10E}"
            fh.write(line + "\n")


def _write_dipole_derivatives(fh, D: np.ndarray) -> None:
    """``$dipole_derivatives`` block: 3N rows × 3 cols, no labels."""
    n = D.shape[0]
    fh.write("$dipole_derivatives\n")
    fh.write(f"{n}\n")
    for r in range(n):
        fh.write(
            f"   {D[r, 0]:.10E}   {D[r, 1]:.10E}   {D[r, 2]:.10E}\n"
        )


# ----------------------------------------------------------------------
# Public writer
# ----------------------------------------------------------------------


[docs]
def write_orca_hess(
    path: Union[str, os.PathLike],
    molecule: Molecule,
    result: HessianResult,
    *,
    energy_ha: float = 0.0,
    multiplicity: int = 1,
    actual_temperature_K: float = 0.0,
    frequency_scale_factor: float = 1.0,
) -> None:
    """Write a vibe-qc :class:`HessianResult` to an ORCA-format
    ``.hess`` file.

    Parameters
    ----------
    path:
        Output file path (overwritten if it exists).
    molecule:
        The :class:`Molecule` the Hessian was computed for. Atom
        species, masses, and Cartesian positions go into the
        ``$atoms`` block.
    result:
        :class:`HessianResult` carrying the Cartesian Hessian, mass-
        weighted normal modes, and frequencies.
    energy_ha:
        Total electronic energy at the reference geometry, in Hartree.
        Goes into ``$act_energy``. Pass the SCF total energy here.
    multiplicity:
        Spin multiplicity 2S+1. Default 1 (singlet).
    actual_temperature_K:
        Finite-temperature data temperature in K. ORCA writes 0.0 by
        default; we follow.
    frequency_scale_factor:
        Empirical harmonic-to-experimental frequency scaling factor
        (e.g. 0.95 for HF, 0.97 for DFT). 1.0 = no scaling. Default 1.0.

    Notes
    -----
    moltui consumes ``$atoms``, ``$vibrational_frequencies``, and
    ``$normal_modes`` to animate vibrational modes. The Hessian +
    dipole-derivative + IR-spectrum blocks are written unconditionally
    (zeroed when not available on the HessianResult) so the file is a
    drop-in for any consumer that scans for them.

    Examples
    --------
    >>> from vibeqc import (
    ...     Molecule, BasisSet, run_rhf, RHFOptions,
    ...     compute_hessian_rhf_analytic,
    ... )
    >>> from vibeqc.io import write_orca_hess
    >>> mol = Molecule.from_xyz("h2o.xyz")
    >>> basis = BasisSet(mol, "sto-3g")
    >>> rhf = run_rhf(mol, basis, RHFOptions())
    >>> hess = compute_hessian_rhf_analytic(mol, basis, rhf,
    ...                                       basis_name="sto-3g")
    >>> write_orca_hess("h2o.hess", mol, hess, energy_ha=rhf.energy)
    """
    H = np.asarray(result.hessian, dtype=np.float64)
    L = np.asarray(result.normal_modes, dtype=np.float64)
    freqs = np.asarray(result.frequencies_cm1, dtype=np.float64)

    n_dof = H.shape[0]
    if H.shape != (n_dof, n_dof):
        raise ValueError(
            f"write_orca_hess: hessian has shape {H.shape}; expected square."
        )
    if L.shape != (n_dof, n_dof) or freqs.shape != (n_dof,):
        raise ValueError(
            f"write_orca_hess: normal_modes shape {L.shape} / frequencies "
            f"shape {freqs.shape} inconsistent with Hessian shape {H.shape}."
        )

    n_atoms = n_dof // 3
    if 3 * n_atoms != n_dof:
        raise ValueError(
            f"write_orca_hess: 3N = {n_dof} is not divisible by 3."
        )
    if len(molecule.atoms) != n_atoms:
        raise ValueError(
            f"write_orca_hess: molecule has {len(molecule.atoms)} atoms but "
            f"Hessian implies {n_atoms}."
        )

    masses_amu = (
        np.asarray(result.masses_amu, dtype=np.float64)
        if result.masses_amu is not None
        else _default_masses_amu(molecule)
    )
    if masses_amu.shape != (n_atoms,):
        raise ValueError(
            f"write_orca_hess: masses_amu shape {masses_amu.shape}; "
            f"expected ({n_atoms},)."
        )

    # Dipole derivatives (3N, 3) — zero pad if missing.
    if result.dipole_derivatives is not None:
        D = np.asarray(result.dipole_derivatives, dtype=np.float64)
        ir_int = ir_intensities(result)
        have_dipole = True
    else:
        D = np.zeros((n_dof, 3), dtype=np.float64)
        ir_int = np.zeros(n_dof, dtype=np.float64)
        have_dipole = False

    with open(path, "w", encoding="utf-8") as fh:
        # Section 1 — header
        fh.write("\n")
        fh.write("$orca_hessian_file\n")
        fh.write("\n")

        # Sections 2 + 3 — partial-Hessian markers (zero = full Hessian)
        fh.write("$act_atom\n  0\n\n")
        fh.write("$act_coord\n  0\n\n")

        # Section 4 — total energy
        fh.write("$act_energy\n")
        fh.write(f"      {energy_ha:.6f}\n\n")

        # Section 5 — multiplicity
        fh.write("$multiplicity\n")
        fh.write(f"  {int(multiplicity)}\n\n")

        # Section 6 — Cartesian Hessian
        fh.write("$hessian\n")
        fh.write(f"{n_dof}\n")
        _write_block_matrix(fh, H)
        fh.write("\n")

        # Section 7 — frequencies
        fh.write("$vibrational_frequencies\n")
        fh.write(f"{n_dof}\n")
        for i, f in enumerate(freqs):
            fh.write(f"    {i:<5d}{f:>22.16f}\n")
        fh.write("\n")

        # Section 8 — normal modes (mass-weighted, columns)
        fh.write("$normal_modes\n")
        fh.write(f"{n_dof} {n_dof}\n")
        _write_block_matrix(fh, L)
        fh.write("\n")

        # Section 9 — atoms (label, mass[amu], x, y, z[bohr])
        fh.write("#\n# The atoms: label  mass x y z (in bohrs)\n#\n")
        fh.write("$atoms\n")
        fh.write(f"{n_atoms}\n")
        for i, a in enumerate(molecule.atoms):
            sym = _element_symbol(a.Z)
            x, y, z = a.xyz
            fh.write(
                f" {sym:<4s} {masses_amu[i]:>10.5f}     "
                f"{x: .12f}     {y: .12f}     {z: .12f}\n"
            )
        fh.write("\n")

        # Section 10 — finite-T temperature
        fh.write("$actual_temperature\n")
        fh.write(f"  {actual_temperature_K:.6f}\n\n")

        # Section 11 — frequency scale factor
        fh.write("$frequency_scale_factor\n")
        fh.write(f"  {frequency_scale_factor:.6f}\n\n")

        # Section 12 — dipole derivatives (3N rows × 3 cols)
        _write_dipole_derivatives(fh, D)
        fh.write("\n")

        # Section 13 — IR spectrum (N rows: wavenumber, eps, Int, TX, TY, TZ)
        fh.write("#\n# The IR spectrum\n#  wavenumber eps Int TX TY TZ\n#\n")
        fh.write("$ir_spectrum\n")
        fh.write(f"{n_dof}\n")
        for i in range(n_dof):
            wavenum = freqs[i] if freqs[i] >= 0 else 0.0
            intensity = ir_int[i] if have_dipole else 0.0
            # ``eps`` (extinction coefficient) is intensity / (some
            # frequency-dependent prefactor); we leave it 0 unless we
            # have IR intensities — moltui doesn't read it anyway.
            eps = 0.0
            tx, ty, tz = (D[i, 0], D[i, 1], D[i, 2]) if have_dipole \
                else (0.0, 0.0, 0.0)
            fh.write(
                f"   {wavenum:>7.2f}   {eps:.8f}   {intensity:.8f}     "
                f"{tx: .6f}     {ty: .6f}     {tz: .6f}\n"
            )
        fh.write("\n")

        # Section 14 — sentinel
        fh.write("\n$end\n")
        fh.write("\n")



# ----------------------------------------------------------------------
# Internal — fallback masses when HessianResult.masses_amu is missing
# (older code paths).
# ----------------------------------------------------------------------

def _default_masses_amu(molecule: Molecule) -> np.ndarray:
    """Per-atom isotope-averaged masses keyed by atomic number. We
    duplicate the small table from ``hessian._resolve_masses`` rather
    than importing it (private helper, but identical numerical
    contents) — keeps this module self-contained for the writer.
    """
    # Standard atomic weights (IUPAC 2021), most-abundant-isotope-
    # averaged. Same reference table used elsewhere in vibe-qc.
    table = {
        1:   1.00800,    2:   4.00260,
        3:   6.94000,    4:   9.01218,
        5:  10.81000,    6:  12.01100,
        7:  14.00700,    8:  15.99900,
        9:  18.99800,   10:  20.18000,
        11: 22.98977,   12: 24.30500,
        13: 26.98154,   14: 28.08500,
        15: 30.97376,   16: 32.06000,
        17: 35.45000,   18: 39.94800,
        19: 39.09830,   20: 40.07800,
    }
    masses = np.zeros(len(molecule.atoms), dtype=np.float64)
    for i, atom in enumerate(molecule.atoms):
        masses[i] = table.get(int(atom.Z), float(atom.Z) * 2.0)
    return masses