Source code for vibeqc.basis_crystal

"""CRYSTAL-format basis-set parser and NWChem/.g94 emitter.

Targets the per-element CRYSTAL basis-set files distributed by the
Bredow group (Peintinger--Vilela Oliveira--Bredow, ``pob-*``): one file
per element, named ``ZZ_Sym`` (e.g. ``06_C``), plain-text CRYSTAL input.

Supported today
---------------
All-electron shells (S, SP, P, D, F, G). These cover the light-element
Bredow sets (pob-TZVP and pob-DZVP-rev2 for H–Br). The emitter produces
NWChem/.g94 files that libint loads without modification.

Not yet supported
-----------------
ECP blocks (``INPUT`` header on Rb–I and Cs–Po pob-* archives). Parsing
them is trivial; *applying* them in vibe-qc needs libecpint integration,
which is a separate follow-on. Such files raise ``NotImplementedError``
for now.

CRYSTAL per-element format summary
----------------------------------
Line 1::
    Z  NSHELL
or for ECP-using atoms::
    (200 + Z)  NSHELL

Each shell starts with::
    ITYPE  LAT  NG  CHE  SCAL

where

* ``ITYPE`` — 0 = user-defined (the only case the Bredow files use).
* ``LAT``   — shell angular-momentum code:
      0=S, 1=SP, 2=P, 3=D, 4=F, 5=G.
* ``NG``    — number of primitive Gaussians.
* ``CHE``   — formal occupancy (electrons) assigned to this shell.
* ``SCAL``  — scale factor applied to the exponents (1.0 = as-written).

Followed by ``NG`` lines of ``exponent coefficient`` (or ``exponent
s_coef p_coef`` for SP shells).
"""

from __future__ import annotations

import io
import re
import tarfile
import urllib.request
import zipfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable, Iterator, Optional, Sequence

# CRYSTAL LAT-code → angular momentum label understood by libint's .g94 loader.
# Libint's BasisSet parser treats "SP" as a general-contraction s+p pair.
_LAT_TO_SHELL = {0: "S", 1: "SP", 2: "P", 3: "D", 4: "F", 5: "G"}

# Element symbols, 1-based index so _ELEMENT_SYMBOLS[Z] returns the symbol.
# Light-element subset — extend as needed; pob-TZVP only requires through Br.
_ELEMENT_SYMBOLS = [
    "",
    "H",
    "He",
    "Li",
    "Be",
    "B",
    "C",
    "N",
    "O",
    "F",
    "Ne",
    "Na",
    "Mg",
    "Al",
    "Si",
    "P",
    "S",
    "Cl",
    "Ar",
    "K",
    "Ca",
    "Sc",
    "Ti",
    "V",
    "Cr",
    "Mn",
    "Fe",
    "Co",
    "Ni",
    "Cu",
    "Zn",
    "Ga",
    "Ge",
    "As",
    "Se",
    "Br",
    "Kr",
    "Rb",
    "Sr",
    "Y",
    "Zr",
    "Nb",
    "Mo",
    "Tc",
    "Ru",
    "Rh",
    "Pd",
    "Ag",
    "Cd",
    "In",
    "Sn",
    "Sb",
    "Te",
    "I",
    "Xe",
    "Cs",
    "Ba",
    "La",
    "Ce",
    "Pr",
    "Nd",
    "Pm",
    "Sm",
    "Eu",
    "Gd",
    "Tb",
    "Dy",
    "Ho",
    "Er",
    "Tm",
    "Yb",
    "Lu",
    "Hf",
    "Ta",
    "W",
    "Re",
    "Os",
    "Ir",
    "Pt",
    "Au",
    "Hg",
    "Tl",
    "Pb",
    "Bi",
    "Po",
]


[docs] @dataclass class CrystalShell: """One contracted Gaussian shell in CRYSTAL format.""" shell_type: str # "S", "SP", "P", "D", "F", "G" occupancy: float # CHE scale_factor: float # SCAL (exponents multiplied by this) exponents: list[float] = field(default_factory=list) coefficients: list[float] = field(default_factory=list) # For SP shells only: the p-side coefficients. Length matches exponents; # empty for non-SP shells. ``coefficients`` holds the s-side. coefficients_p: list[float] = field(default_factory=list) @property def n_primitives(self) -> int: return len(self.exponents)
[docs] @dataclass class CrystalECPTerm: """One Gaussian term inside a CRYSTAL ECP angular-momentum block. Encodes the contribution V_ℓ_term(r) = C · r^n · exp(−α · r²) in the standard Stuttgart-Köln / Hay-Wadt convention. ``ell`` is the angular momentum (0=s, 1=p, …, 4=g) — or the special label ``"local"`` for the M-counted local-potential terms in CRYSTAL's eq. 3.18. """ ell: object # int (0..4) or the literal "local" alpha: float # ALFKL — exponent coefficient: float # CGKL — Gaussian prefactor n_pow: int # NKL — power of r prefactor
[docs] @dataclass class CrystalECP: """Parsed CRYSTAL effective-core-potential block. Built from the ``INPUT``-keyword block that follows a ``200+Z`` atom header in CRYSTAL-format basis files. Carries everything libecpint needs to build the V_ECP operator: the effective nuclear charge (``Z_eff = Z − ncore``), and the per-ℓ Gaussian expansion of the local + ℓ-projected potentials. The CRYSTAL ECP convention has up to five ℓ-projected components (ℓ = 0..4) plus a "local" group counted by ``M`` in the header. ``terms`` collects them all in source order; group them by ``term.ell`` to match a specific potential expansion. """ znuc: float # effective core charge (= Z − ncore) m_local: int # M in the CRYSTAL header (eq. 3.18) m_per_ell: tuple[int, ...] = () # (M0, M1, M2, M3, M4) terms: list[CrystalECPTerm] = field(default_factory=list) @property def total_terms(self) -> int: return self.m_local + sum(self.m_per_ell)
[docs] @dataclass class CrystalAtomBasis: """Parsed per-atom basis set (one Bredow file).""" Z: int # atomic number (without ECP offset) has_ecp: bool # True if file had 200+Z header shells: list[CrystalShell] = field(default_factory=list) ecp: Optional["CrystalECP"] = None # populated when has_ecp + INPUT block parsed @property def element_symbol(self) -> str: return ( _ELEMENT_SYMBOLS[self.Z] if 0 < self.Z < len(_ELEMENT_SYMBOLS) else f"Z{self.Z}" )
# -------------------------------------------------------------------------- # Parser # -------------------------------------------------------------------------- def _tokens(stream: Iterator[str]) -> Iterator[tuple[int, str]]: """Yield (line_no, token) pairs, skipping blank lines and CRYSTAL comments (``!`` introduces a comment-to-end-of-line, matching CRYSTAL).""" for lineno, raw in enumerate(stream, start=1): # Strip trailing comments and whitespace. cleaned = raw.split("!", 1)[0] for tok in cleaned.split(): yield lineno, tok
[docs] def parse_crystal_atom_basis(text: str, source: str = "<string>") -> CrystalAtomBasis: """Parse a single CRYSTAL per-element basis file. ``source`` is used only in error messages. """ tokens = _tokens(iter(text.splitlines())) # Single-token pushback so the parser can peek one token *together with # its source line number* without consuming it. Holds (lineno, token) # pairs; only ever 0 or 1 deep in practice. The line number is what lets # the shell-header reader below tell a stray on-line token (the ``1 0`` # SCAL typo) apart from the next record on the following line. pushback: list[tuple[int, str]] = [] def next_pair() -> tuple[int, str]: if pushback: return pushback.pop() try: return next(tokens) except StopIteration as exc: raise ValueError(f"{source}: unexpected end of file") from exc def nxt() -> str: return next_pair()[1] def peek_pair() -> Optional[tuple[int, str]]: """Look at the next (lineno, token) without consuming it; None at EOF.""" if pushback: return pushback[-1] try: pair = next(tokens) except StopIteration: return None pushback.append(pair) return pair # ---- Header: (Z or 200+Z) NSHELL --------------------------------- z_raw = int(nxt()) nshell = int(nxt()) if z_raw >= 200: atom = CrystalAtomBasis(Z=z_raw - 200, has_ecp=True) elif z_raw >= 1: atom = CrystalAtomBasis(Z=z_raw, has_ecp=False) else: raise ValueError(f"{source}: invalid atomic number {z_raw}") # ---- Optional ECP block (``INPUT`` keyword) ---------------------- # Peek the next token; if it is 'INPUT' we must consume the whole ECP # block before the shells. try: first_pair = next_pair() except ValueError: first_pair = None if first_pair is not None and first_pair[1].upper() == "INPUT": # CRYSTAL Manual §"Effective core pseudo-potentials" (p. 84): # line 1: ZNUC M M0 M1 M2 M3 M4 (effective charge + 6 counts) # then M+M0+M1+M2+M3+M4 records, each: alpha coefficient n_factor # # M is the number of "local" terms (eq. 3.18); M0..M4 are the # per-ℓ projections (eq. 3.19) for ℓ = 0..4. Source order in # the file is local first, then ℓ=0, ℓ=1, … which we preserve # in the resulting ``CrystalECPTerm`` list. try: znuc = float(nxt()) m_local = int(nxt()) m0 = int(nxt()) m1 = int(nxt()) m2 = int(nxt()) m3 = int(nxt()) m4 = int(nxt()) except (ValueError, StopIteration) as exc: raise ValueError( f"{source}: malformed CRYSTAL ECP INPUT header " f"(expected ZNUC M M0 M1 M2 M3 M4): {exc}" ) from exc ecp = CrystalECP( znuc=znuc, m_local=m_local, m_per_ell=(m0, m1, m2, m3, m4), ) # Read the grouped records in source order. groups: list[tuple[object, int]] = [("local", m_local)] for ell, m_ell in enumerate((m0, m1, m2, m3, m4)): groups.append((ell, m_ell)) for ell_label, m_count in groups: for _ in range(m_count): try: alpha = float(nxt()) coef = float(nxt()) n_pow = int(nxt()) except (ValueError, StopIteration) as exc: raise ValueError( f"{source}: malformed CRYSTAL ECP record for " f"ell={ell_label!r}: {exc}" ) from exc ecp.terms.append( CrystalECPTerm( ell=ell_label, alpha=alpha, coefficient=coef, n_pow=n_pow, ) ) atom.ecp = ecp # The INPUT block is fully consumed; nothing is pushed back, so the # shell loop below reads the first shell header directly. elif first_pair is not None: pushback.append(first_pair) # ---- Shell loop --------------------------------------------------- for shell_idx in range(nshell): itype = int(nxt()) lat = int(nxt()) ng = int(nxt()) che = float(nxt()) scal_lineno, scal_tok = next_pair() # The CRYSTAL shell header is exactly five tokens: # ITYPE LAT NG CHE SCAL # Any further token on the *same physical line* is anomalous. The one # known case is the Bredow ``16_S`` typo where SCAL ``1.0`` was # mis-keyed as two tokens ``1 0`` (decimal point typed as a space), # i.e. ``0 3 1 0.0 1 0``. Without line awareness the stray ``0`` is # silently consumed as the d-shell exponent and the real exponent # (0.5207… for pob-TZVP, 0.4107… for pob-TZVP-rev2) as the # coefficient — yielding a physically invalid exp=0 function with no # error raised. Tolerate that exact ``<int> <int>`` → ``<int>.<int>`` # pattern — mirroring # scripts/basisset_dev/pob_basis_verify.py:parse_crystal_file (the # canonical verify/regenerate tool) so the two parsers agree — and # reject anything else extra on the header line. extra = peek_pair() if extra is not None and extra[0] == scal_lineno: if "." not in scal_tok and extra[1].isdigit(): next_pair() # consume the stray fractional digit scal_tok = f"{scal_tok}.{extra[1]}" else: raise ValueError( f"{source}: shell {shell_idx + 1}: unexpected extra token " f"{extra[1]!r} on the shell-header line (expected exactly " f"5 tokens: ITYPE LAT NG CHE SCAL)." ) try: scal = float(scal_tok) except ValueError as exc: raise ValueError( f"{source}: shell {shell_idx + 1}: malformed SCAL value " f"{scal_tok!r} in shell header: {exc}" ) from exc if itype != 0: raise ValueError( f"{source}: shell {shell_idx + 1}: CRYSTAL built-in basis " f"types (ITYPE != 0) are not supported — only user-defined " f"contracted Gaussians." ) shell_type = _LAT_TO_SHELL.get(lat) if shell_type is None: raise ValueError( f"{source}: shell {shell_idx + 1}: unknown LAT code {lat}. " f"Known: {_LAT_TO_SHELL}." ) shell = CrystalShell( shell_type=shell_type, occupancy=che, scale_factor=scal, ) for _ in range(ng): expn = float(nxt()) # Apply the CRYSTAL exponent-scale convention. if scal != 1.0: expn = expn * (scal**2) # A Gaussian primitive exponent must be strictly positive. A zero # or negative exponent is physically meaningless and is the # tell-tale of a mis-parsed shell header silently shifting the # token stream (see the SCAL-typo handling above). Fail loudly # rather than emit a bogus diffuse function. if expn <= 0.0: raise ValueError( f"{source}: shell {shell_idx + 1}: non-positive Gaussian " f"exponent {expn!r}; basis primitive exponents must be " f"> 0 (often a sign the shell header mis-parsed)." ) if shell_type == "SP": s_coef = float(nxt()) p_coef = float(nxt()) shell.exponents.append(expn) shell.coefficients.append(s_coef) shell.coefficients_p.append(p_coef) else: coef = float(nxt()) shell.exponents.append(expn) shell.coefficients.append(coef) atom.shells.append(shell) return atom
[docs] def parse_crystal_atom_basis_file(path: str | Path) -> CrystalAtomBasis: """Convenience wrapper: parse a CRYSTAL per-element file from disk.""" p = Path(path) return parse_crystal_atom_basis(p.read_text(), source=str(p))
# -------------------------------------------------------------------------- # CRYSTAL → libecpint bridge (Phase 14g) # --------------------------------------------------------------------------
[docs] @dataclass(frozen=True) class LibecpintInlineArrays: """Flat primitive arrays in the exact shape ``libecpint::ECPIntegrator:: set_ecp_basis(...)`` expects for one ECP center. Mapping convention (per CRYSTAL23 §"Effective core pseudo-potentials" p. 84, eqs. 3.18 + 3.19, against libecpint's XML convention where the local channel sits at ``lval = max_proj_ell + 1``): * CRYSTAL ``M`` (``ell="local"`` term group) → libecpint ``ams[i] = local_ell = max_proj_ell + 1``. This is the U_L channel that acts on all angular momenta (no projector applied), matching the local-potential semantics in CRYSTAL eq. 3.18. * CRYSTAL ``M_ℓ`` (``ell ∈ {0..4}`` term groups, with non-zero count) → libecpint ``ams[i] = ℓ``. Projected channels; libecpint applies P_ℓ during integration. Both conventions yield the same V_ECP matrix elements when summed against the basis. Verified against ecp10mdf.xml (K, Z=19) shell layout: local at lval=4 with placeholder zero, projectors at lval= 0..3 for s/p/d/f, max projector ell = 3. """ # Flattened per-primitive arrays. All four lists have the same # length == n_primitives. Pass these straight to libecpint's # ``set_ecp_basis(necps=1, ..., shell_lengths=&n_primitives)`` # (one ECP per atom-center; the n_primitives counts every primitive # across all l channels for that ECP). exponents: tuple[float, ...] coefficients: tuple[float, ...] ams: tuple[int, ...] # angular momentum per primitive ns: tuple[int, ...] # r^n power per primitive local_ell: int # libecpint l-index for CRYSTAL's M-local block max_proj_ell: int # highest ℓ with non-zero M_ℓ in CRYSTAL input @property def n_primitives(self) -> int: return len(self.exponents)
[docs] def crystal_ecp_to_libecpint_arrays(ecp: CrystalECP) -> LibecpintInlineArrays: """Flatten a :class:`CrystalECP` into libecpint primitive arrays. The conversion is bookkeeping — no algebraic rewrite of the ECP. Each :class:`CrystalECPTerm` becomes one primitive with its (alpha, coefficient, n_pow) carried over verbatim; only the ``ell`` label is re-keyed onto libecpint's per-primitive ``am`` convention (see :class:`LibecpintInlineArrays` for the mapping). Phase 14g pairs this with a C++ ``compute_ecp_matrix_inline`` that calls libecpint's ``set_ecp_basis(...)`` (the inline- primitive form) instead of ``set_ecp_basis_from_library(...)`` (the XML form used today). That unblocks the 5th-period pob basis sets (Rb–I, Cs–Po) and any future inline-ECP archives (vDZP, mixed-row dhf-*). Parameters ---------- ecp Parsed CRYSTAL ECP block from :func:`parse_crystal_atom_basis`. Returns ------- LibecpintInlineArrays Bundle ready to feed libecpint's inline-primitive API. Raises ------ ValueError If a term carries an unrecognised ``ell`` label (not ``"local"`` and not in 0..4). """ # Determine the libecpint local-channel l-index. The XML convention # places the local at ``max_proj_ell + 1`` so it sits strictly above # every projector. When no projectors are present (rare; only valid # for trivial test cases), fall back to local_ell=0 — libecpint will # still treat the single channel as the local one since it's # max-l. max_proj_ell = -1 for ell_idx, m_count in enumerate(ecp.m_per_ell): if m_count > 0: max_proj_ell = max(max_proj_ell, ell_idx) local_ell = max_proj_ell + 1 if max_proj_ell >= 0 else 0 exponents: list[float] = [] coefficients: list[float] = [] ams: list[int] = [] ns: list[int] = [] for term in ecp.terms: if term.ell == "local": assigned_l = local_ell elif isinstance(term.ell, int) and 0 <= term.ell <= 4: assigned_l = term.ell else: raise ValueError( f"crystal_ecp_to_libecpint_arrays: term carries " f"unrecognised ell={term.ell!r}; expected " f"'local' or int in 0..4." ) exponents.append(term.alpha) coefficients.append(term.coefficient) ams.append(assigned_l) ns.append(term.n_pow) return LibecpintInlineArrays( exponents=tuple(exponents), coefficients=tuple(coefficients), ams=tuple(ams), ns=tuple(ns), local_ell=local_ell, max_proj_ell=max_proj_ell, )
# -------------------------------------------------------------------------- # NWChem / .g94 emitter # --------------------------------------------------------------------------
[docs] def emit_g94(atoms: Sequence[CrystalAtomBasis], header_comment: str = "") -> str: """Convert a list of per-element basis specs into a single .g94 (Gaussian / NWChem format) string that libint can load.""" lines: list[str] = [] if header_comment: for ln in header_comment.splitlines(): lines.append(f"! {ln}") lines.append("") for atom in atoms: if atom.has_ecp: raise NotImplementedError( "emit_g94: atoms with ECPs cannot be written to a pure " "basis-only .g94 file yet — libecpint integration pending." ) lines.append("****") lines.append(f"{atom.element_symbol:<2s} 0") for shell in atom.shells: # NWChem format uses a leading keyword + NG + 1.00 (unused). lines.append(f"{shell.shell_type:<2s} {shell.n_primitives:2d} 1.00") for i, exp in enumerate(shell.exponents): if shell.shell_type == "SP": lines.append( f" {exp:18.10f} {shell.coefficients[i]:18.10f}" f" {shell.coefficients_p[i]:18.10f}" ) else: lines.append(f" {exp:18.10f} {shell.coefficients[i]:18.10f}") lines.append("****") lines.append("") return "\n".join(lines)
# CRYSTAL LAT-code ← string lookup for the emitter. _SHELL_TO_LAT = {"S": 0, "SP": 1, "P": 2, "D": 3, "F": 4, "G": 5} # Default formal occupancies per shell type (used when CrystalShell.occupancy # is 0.0 because the data came from a .g94 file that doesn't carry it). _DEFAULT_OCCUPANCY = {"S": 2, "SP": 2, "P": 6, "D": 10, "F": 14, "G": 18} def _occ(shell: CrystalShell) -> float: """Return formal electron count per shell for CRYSTAL14 neutrality. CRYSTAL14 requires the sum of CHE values to equal the nuclear charge. Use proper electron counts: 2.0 for s/sp, 6.0 for p, 10.0 for d, 14.0 for f. Polarisation shells (single-primitive, occupancy=0.0) stay at 0.0. """ if shell.occupancy > 0.0: return shell.occupancy # Single-primitive shells with occ=0.0 are polarisation functions. if shell.n_primitives == 1: return 0.0 return float(_DEFAULT_OCCUPANCY.get(shell.shell_type, 0)) # -------------------------------------------------------------------------- # CRYSTAL inline-basis emitter (for .d12 files) # --------------------------------------------------------------------------
[docs] def emit_crystal(atoms: Sequence[CrystalAtomBasis]) -> str: """Convert per-element basis specs into CRYSTAL inline-basis format. Produces text suitable for pasting into a CRYSTAL ``.d12`` input deck after the geometry block (``ENDGEOM``). The format is:: Z NSHELL 0 LAT NPG OCC SCALE exp1 coeff1 exp2 coeff2 ... 99 0 END where LAT = 0(S), 1(SP), 2(P), 3(D), 4(F), 5(G). For SP shells each primitive line carries both s and p coefficients. ECP-bearing atoms (``has_ecp=True``) are emitted with ``200+Z`` as the header number; the ECP block itself is **not** emitted (ECP inline emission needs a separate function — libecpint integration pending). All-electron atoms emit the raw Z. Parameters ---------- atoms Sequence of :class:`CrystalAtomBasis` objects, typically parsed from the Bredow-group per-element CRYSTAL files via :func:`parse_crystal_atom_basis` or :func:`parse_crystal_atom_basis_file`. Returns ------- str Multi-line CRYSTAL basis-set block ready for insertion into a ``.d12`` deck. """ blocks: list[str] = [] for atom in atoms: z_header = atom.Z + 200 if atom.has_ecp else atom.Z nshell = len(atom.shells) lines: list[str] = [f"{z_header} {nshell}"] for shell in atom.shells: lat = _SHELL_TO_LAT.get(shell.shell_type) if lat is None: raise ValueError( f"{atom.element_symbol}: unknown shell type {shell.shell_type!r}" ) npg = shell.n_primitives occ = _occ(shell) scale = 0.0 lines.append(f"0 {lat} {npg:3d} {occ:.1f} {scale:.1f}") for i in range(npg): exp = shell.exponents[i] if shell.shell_type == "SP": c_s = shell.coefficients[i] c_p = shell.coefficients_p[i] lines.append(f"{exp:20.10f} {c_s:20.10f} {c_p:20.10f}") else: c = shell.coefficients[i] lines.append(f"{exp:20.10f} {c:20.10f}") blocks.append("\n".join(lines)) # Join per-element blocks with a blank line, then close with END. return "\n".join(blocks) + "\n 99 0"
# -------------------------------------------------------------------------- # Bredow-group archive fetcher # -------------------------------------------------------------------------- # Canonical registry of the all-electron Bredow archives (light elements, # H–Br). ECP-bearing archives are known but not fetched until libecpint # support lands; see _BREDOW_ARCHIVES_ECP below. _BREDOW_BASE = "https://www.chemie.uni-bonn.de/bredow/de/software" # Two modes of distribution: # "archive": URL returns a tar.gz / zip archive. # "listing": URL returns an HTML page with per-element subpages; every # element page embeds its file contents inside a <p> block. # The fetcher auto-dispatches on the magic bytes of the response. _BREDOW_ARCHIVES = { # key = local basis name → (URL, pretty-name) "pob-tzvp": (f"{_BREDOW_BASE}/pob-tzvp-tar.gz", "pob-TZVP"), "pob-tzvp-rev2": (f"{_BREDOW_BASE}/pob-tzvp-rev2-tar.gz", "pob-TZVP-rev2"), # pob-DZVP-rev2 is served only as an HTML listing with embedded file # bodies — the fetcher transparently scrapes it. "pob-dzvp-rev2": (f"{_BREDOW_BASE}/pob-dzvp-rev2", "pob-DZVP-rev2"), } _BREDOW_ARCHIVES_ECP = { # Not fetched today; listed for transparency. libecpint integration is a # follow-on phase. "pob-dzvp-rb-i": f"{_BREDOW_BASE}/pob-dzvp-rb-i-tar.gz", "pob-tzvp-rb-i": f"{_BREDOW_BASE}/pob-tzvp-rb-i-tar.gz", "pob-tzvp-rev2-rb-i": f"{_BREDOW_BASE}/pob-tzvp-ref2-rb-i-tar.gz", "pob-tzvp-rev2-cs-po": f"{_BREDOW_BASE}/pob-tzvp-rev2-cs-po-tar.gz", "pob-tzv-rev2-la-lu": f"{_BREDOW_BASE}/pob-tzv-rev2-la-lu-tar.gz", } # Per-basis citation text surfaced to the user on successful fetch. _BREDOW_CITATIONS = { "pob-tzvp": ( "Peintinger, Vilela Oliveira, Bredow. J. Comput. Chem. 34, 451–459 " "(2013). DOI: 10.1002/jcc.23153" ), "pob-tzvp-rev2": ( "Vilela Oliveira, Laun, Peintinger, Bredow. J. Comput. Chem. 40, " "2364–2376 (2019). DOI: 10.1002/jcc.26013" ), "pob-dzvp-rev2": ( "Vilela Oliveira, Laun, Peintinger, Bredow. J. Comput. Chem. 40, " "2364–2376 (2019). DOI: 10.1002/jcc.26013" ), } def _download(url: str, timeout: float = 60.0, retries: int = 3) -> bytes: """Fetch a URL to memory; tolerates both tar.gz archives and single files. The Bredow Plone server is fast for tar.gz archives and slower for per-element HTML pages. Retries transparently on transient timeouts. """ import time last_err: Exception | None = None for attempt in range(retries): try: with urllib.request.urlopen(url, timeout=timeout) as resp: return resp.read() except Exception as err: last_err = err time.sleep(1.0 * (attempt + 1)) assert last_err is not None raise last_err _ELEMENT_LINK_RE = re.compile( r'href="(?P<url>[^"]*?/(?P<name>\d{1,3}_[a-z]+))"', re.IGNORECASE, ) # File contents inside a Plone page are wrapped in a single <p>...</p> block # with <br /> line separators. We extract that and normalize it to plain # text. The content block is identified by starting with "ZZ_SYM" or by # having the characteristic "Z N" first data line — we match the former. _PLONE_FILE_BLOCK_RE = re.compile( r"<p>\s*(?:<strong>[^<]*</strong>\s*<br\s*/>)?(?P<body>.*?)</p>", re.IGNORECASE | re.DOTALL, ) def _unescape_plone_body(body: str) -> str: """Convert a Plone <p>…<br />…</p> blob to plain text lines.""" # Collapse <br/> in any form to newline, drop remaining tags. t = re.sub(r"<br\s*/?>", "\n", body) t = re.sub(r"</?strong>", "", t) t = re.sub(r"<[^>]+>", "", t) # any stray tag # HTML entities we might encounter. t = t.replace("&nbsp;", " ").replace("&amp;", "&") # Unicode non-breaking spaces frequently appear in scraped text. t = t.replace("\u00a0", " ") return t.strip() def _iter_archive_members(data: bytes) -> Iterator[tuple[str, bytes]]: """Yield (name, bytes) for every member of a tar.gz or zip archive. Bredow's server serves some URLs without extensions (raw tar.gz bytes); we autodetect the magic bytes. """ # Tarball (gzip-compressed). if data[:2] == b"\x1f\x8b": with tarfile.open(fileobj=io.BytesIO(data), mode="r:gz") as tf: for member in tf.getmembers(): if not member.isfile(): continue f = tf.extractfile(member) if f is None: continue yield member.name, f.read() return # Zip. if data[:4] == b"PK\x03\x04": with zipfile.ZipFile(io.BytesIO(data)) as zf: for name in zf.namelist(): if name.endswith("/"): continue yield name, zf.read(name) return raise RuntimeError("unknown archive format (not tar.gz or zip)") def _iter_plone_listing_members( data: bytes, base_url: str ) -> Iterator[tuple[str, bytes]]: """Walk a Plone listing page and fetch every per-element subpage, returning the embedded file body as bytes. ``base_url`` is the listing URL — used to resolve relative links and to de-duplicate cross-references. """ html = data.decode("utf-8", errors="replace") seen: set[str] = set() for match in _ELEMENT_LINK_RE.finditer(html): url = match.group("url") name = match.group("name") # Normalize: ``06_c`` → ``06_C`` so the parser's filename regex picks it up. parts = name.split("_", 1) if len(parts) == 2: name = f"{parts[0]}_{parts[1].capitalize()}" # Stay on the same Bredow basis listing — reject stray nav links. if "/bredow/" not in url: continue if url in seen: continue seen.add(url) sub_html = _download(url) sub_text = sub_html.decode("utf-8", errors="replace") # The Plone layout wraps the per-element page in several <p> # blocks; the first typically contains just the element header # ("<strong>06_C</strong><br />"), the next contains the actual # basis. Iterate and pick the first body that looks like CRYSTAL # input (starts with ZZ NSHELL and has numeric data). body = "" for m in _PLONE_FILE_BLOCK_RE.finditer(sub_text): candidate = _unescape_plone_body(m.group("body")) if not candidate: continue first_line = candidate.splitlines()[0].split() if ( len(first_line) >= 2 and first_line[0].isdigit() and first_line[1].isdigit() ): body = candidate break if not body: continue yield name, body.encode("utf-8") _FILENAME_RE = re.compile(r"^\s*(?:.*/)?(\d{2,3})_([A-Z][a-z]?)\s*$")
[docs] def fetch_bredow_basis_sets( dest_dir: str | Path | None = None, names: Iterable[str] | None = None, *, verbose: bool = True, ) -> dict[str, Path]: """Download the Bredow group all-electron basis sets and convert them to .g94 files loadable by libint. Parameters ---------- dest_dir Target directory for the .g94 output files. Defaults to ``basis_library/basis/`` inside the vibe-qc source tree (the path already on ``LIBINT_DATA_PATH`` in editable installs). names Subset of basis-set keys to fetch (from ``_BREDOW_ARCHIVES``). ``None`` fetches all supported light-element archives. verbose Print progress + citation info. Returns ------- dict mapping basis name → path of the generated .g94 file. """ if dest_dir is None: # Default target: basis_library/custom/ — this directory is the # committed-to-repo staging area. scripts/setup_basis_library.sh # copies custom/*.g94 on top of libint's stock bases to populate # basis_library/basis/, which is what LIBINT_DATA_PATH points at. # Writing here keeps the generated files under version control. here = Path(__file__).resolve().parent.parent.parent dest_dir = here / "basis_library" / "custom" dest = Path(dest_dir) dest.mkdir(parents=True, exist_ok=True) keys = list(names) if names is not None else list(_BREDOW_ARCHIVES) results: dict[str, Path] = {} for key in keys: if key not in _BREDOW_ARCHIVES: raise KeyError( f"Unknown Bredow basis key {key!r}. Known: {sorted(_BREDOW_ARCHIVES)}" ) url, _inner = _BREDOW_ARCHIVES[key] if verbose: print(f"[fetch] {key:<15s}{url}") data = _download(url) # Autodetect: archive (tar.gz / zip) or HTML listing. if data[:2] == b"\x1f\x8b" or data[:4] == b"PK\x03\x04": members: Iterable[tuple[str, bytes]] = list(_iter_archive_members(data)) elif data.lstrip()[:5].lower() in (b"<!doc", b"<html"): members = list(_iter_plone_listing_members(data, url)) else: raise RuntimeError(f"{key}: unrecognised server response format") # Parse every per-element file into a CrystalAtomBasis, sorted by Z. atoms_by_z: dict[int, CrystalAtomBasis] = {} for member_name, content in members: m = _FILENAME_RE.match(member_name) if not m: continue # skip top-level directory entries or READMEs atom = parse_crystal_atom_basis( content.decode("utf-8", errors="replace"), source=member_name, ) atoms_by_z[atom.Z] = atom if not atoms_by_z: raise RuntimeError(f"{key}: archive contained no parseable elements") ordered = [atoms_by_z[z] for z in sorted(atoms_by_z)] citation = _BREDOW_CITATIONS.get(key, "") header = ( f"vibeqc-generated from Bredow-group archive '{key}'.\n" f"Source URL: {url}\n" f"Elements: {', '.join(a.element_symbol for a in ordered)}\n" + (f"Cite: {citation}\n" if citation else "") ) g94 = emit_g94(ordered, header_comment=header) out = dest / f"{key}.g94" out.write_text(g94) results[key] = out if verbose: print(f"[write] {out} ({len(ordered)} elements)") if citation: print(f" cite: {citation}") return results
# -------------------------------------------------------------------------- # CLI: python -m vibeqc.basis_crystal [fetch | convert <file.g94> <in.crystal>] # -------------------------------------------------------------------------- def _main(argv: list[str]) -> int: if not argv or argv[0] in ("-h", "--help"): print( "usage:\n" " python -m vibeqc.basis_crystal fetch " "[--dest DIR] [name ...]\n" " Download Bredow-group basis sets and convert to " ".g94 for libint.\n" " python -m vibeqc.basis_crystal convert IN.crystal OUT.g94\n" " Convert a CRYSTAL per-element file to a single-atom " ".g94 block." ) return 0 cmd, rest = argv[0], argv[1:] if cmd == "fetch": dest = None if rest and rest[0] == "--dest": dest = rest[1] rest = rest[2:] fetch_bredow_basis_sets(dest_dir=dest, names=rest if rest else None) return 0 if cmd == "convert": if len(rest) != 2: print("convert expects 2 arguments: input output") return 2 atom = parse_crystal_atom_basis_file(rest[0]) Path(rest[1]).write_text( emit_g94([atom], header_comment=f"converted from {rest[0]}") ) return 0 print(f"unknown command {cmd!r}") return 2 if __name__ == "__main__": import sys sys.exit(_main(sys.argv[1:]))