"""CRYSTAL-format basis-set parser and NWChem/.g94 emitter.
Targets the per-element CRYSTAL basis-set files distributed by the
Bredow group (Peintinger--Vilela Oliveira--Bredow, ``pob-*``): one file
per element, named ``ZZ_Sym`` (e.g. ``06_C``), plain-text CRYSTAL input.
Supported today
---------------
All-electron shells (S, SP, P, D, F, G). These cover the light-element
Bredow sets (pob-TZVP and pob-DZVP-rev2 for H–Br). The emitter produces
NWChem/.g94 files that libint loads without modification.
Not yet supported
-----------------
ECP blocks (``INPUT`` header on Rb–I and Cs–Po pob-* archives). Parsing
them is trivial; *applying* them in vibe-qc needs libecpint integration,
which is a separate follow-on. Such files raise ``NotImplementedError``
for now.
CRYSTAL per-element format summary
----------------------------------
Line 1::
Z NSHELL
or for ECP-using atoms::
(200 + Z) NSHELL
Each shell starts with::
ITYPE LAT NG CHE SCAL
where
* ``ITYPE`` — 0 = user-defined (the only case the Bredow files use).
* ``LAT`` — shell angular-momentum code:
0=S, 1=SP, 2=P, 3=D, 4=F, 5=G.
* ``NG`` — number of primitive Gaussians.
* ``CHE`` — formal occupancy (electrons) assigned to this shell.
* ``SCAL`` — scale factor applied to the exponents (1.0 = as-written).
Followed by ``NG`` lines of ``exponent coefficient`` (or ``exponent
s_coef p_coef`` for SP shells).
"""
from __future__ import annotations
import io
import re
import tarfile
import urllib.request
import zipfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable, Iterator, Optional, Sequence
# CRYSTAL LAT-code → angular momentum label understood by libint's .g94 loader.
# Libint's BasisSet parser treats "SP" as a general-contraction s+p pair.
_LAT_TO_SHELL = {0: "S", 1: "SP", 2: "P", 3: "D", 4: "F", 5: "G"}
# Element symbols, 1-based index so _ELEMENT_SYMBOLS[Z] returns the symbol.
# Light-element subset — extend as needed; pob-TZVP only requires through Br.
_ELEMENT_SYMBOLS = [
"",
"H",
"He",
"Li",
"Be",
"B",
"C",
"N",
"O",
"F",
"Ne",
"Na",
"Mg",
"Al",
"Si",
"P",
"S",
"Cl",
"Ar",
"K",
"Ca",
"Sc",
"Ti",
"V",
"Cr",
"Mn",
"Fe",
"Co",
"Ni",
"Cu",
"Zn",
"Ga",
"Ge",
"As",
"Se",
"Br",
"Kr",
"Rb",
"Sr",
"Y",
"Zr",
"Nb",
"Mo",
"Tc",
"Ru",
"Rh",
"Pd",
"Ag",
"Cd",
"In",
"Sn",
"Sb",
"Te",
"I",
"Xe",
"Cs",
"Ba",
"La",
"Ce",
"Pr",
"Nd",
"Pm",
"Sm",
"Eu",
"Gd",
"Tb",
"Dy",
"Ho",
"Er",
"Tm",
"Yb",
"Lu",
"Hf",
"Ta",
"W",
"Re",
"Os",
"Ir",
"Pt",
"Au",
"Hg",
"Tl",
"Pb",
"Bi",
"Po",
]
[docs]
@dataclass
class CrystalShell:
"""One contracted Gaussian shell in CRYSTAL format."""
shell_type: str # "S", "SP", "P", "D", "F", "G"
occupancy: float # CHE
scale_factor: float # SCAL (exponents multiplied by this)
exponents: list[float] = field(default_factory=list)
coefficients: list[float] = field(default_factory=list)
# For SP shells only: the p-side coefficients. Length matches exponents;
# empty for non-SP shells. ``coefficients`` holds the s-side.
coefficients_p: list[float] = field(default_factory=list)
@property
def n_primitives(self) -> int:
return len(self.exponents)
[docs]
@dataclass
class CrystalECPTerm:
"""One Gaussian term inside a CRYSTAL ECP angular-momentum block.
Encodes the contribution
V_ℓ_term(r) = C · r^n · exp(−α · r²)
in the standard Stuttgart-Köln / Hay-Wadt convention. ``ell`` is
the angular momentum (0=s, 1=p, …, 4=g) — or the special label
``"local"`` for the M-counted local-potential terms in CRYSTAL's
eq. 3.18.
"""
ell: object # int (0..4) or the literal "local"
alpha: float # ALFKL — exponent
coefficient: float # CGKL — Gaussian prefactor
n_pow: int # NKL — power of r prefactor
[docs]
@dataclass
class CrystalECP:
"""Parsed CRYSTAL effective-core-potential block.
Built from the ``INPUT``-keyword block that follows a ``200+Z``
atom header in CRYSTAL-format basis files. Carries everything
libecpint needs to build the V_ECP operator: the effective
nuclear charge (``Z_eff = Z − ncore``), and the per-ℓ Gaussian
expansion of the local + ℓ-projected potentials.
The CRYSTAL ECP convention has up to five ℓ-projected
components (ℓ = 0..4) plus a "local" group counted by ``M`` in
the header. ``terms`` collects them all in source order; group
them by ``term.ell`` to match a specific potential expansion.
"""
znuc: float # effective core charge (= Z − ncore)
m_local: int # M in the CRYSTAL header (eq. 3.18)
m_per_ell: tuple[int, ...] = () # (M0, M1, M2, M3, M4)
terms: list[CrystalECPTerm] = field(default_factory=list)
@property
def total_terms(self) -> int:
return self.m_local + sum(self.m_per_ell)
[docs]
@dataclass
class CrystalAtomBasis:
"""Parsed per-atom basis set (one Bredow file)."""
Z: int # atomic number (without ECP offset)
has_ecp: bool # True if file had 200+Z header
shells: list[CrystalShell] = field(default_factory=list)
ecp: Optional["CrystalECP"] = None # populated when has_ecp + INPUT block parsed
@property
def element_symbol(self) -> str:
return (
_ELEMENT_SYMBOLS[self.Z]
if 0 < self.Z < len(_ELEMENT_SYMBOLS)
else f"Z{self.Z}"
)
# --------------------------------------------------------------------------
# Parser
# --------------------------------------------------------------------------
def _tokens(stream: Iterator[str]) -> Iterator[tuple[int, str]]:
"""Yield (line_no, token) pairs, skipping blank lines and CRYSTAL
comments (``!`` introduces a comment-to-end-of-line, matching CRYSTAL)."""
for lineno, raw in enumerate(stream, start=1):
# Strip trailing comments and whitespace.
cleaned = raw.split("!", 1)[0]
for tok in cleaned.split():
yield lineno, tok
[docs]
def parse_crystal_atom_basis(text: str, source: str = "<string>") -> CrystalAtomBasis:
"""Parse a single CRYSTAL per-element basis file.
``source`` is used only in error messages.
"""
tokens = _tokens(iter(text.splitlines()))
# Single-token pushback so the parser can peek one token *together with
# its source line number* without consuming it. Holds (lineno, token)
# pairs; only ever 0 or 1 deep in practice. The line number is what lets
# the shell-header reader below tell a stray on-line token (the ``1 0``
# SCAL typo) apart from the next record on the following line.
pushback: list[tuple[int, str]] = []
def next_pair() -> tuple[int, str]:
if pushback:
return pushback.pop()
try:
return next(tokens)
except StopIteration as exc:
raise ValueError(f"{source}: unexpected end of file") from exc
def nxt() -> str:
return next_pair()[1]
def peek_pair() -> Optional[tuple[int, str]]:
"""Look at the next (lineno, token) without consuming it; None at EOF."""
if pushback:
return pushback[-1]
try:
pair = next(tokens)
except StopIteration:
return None
pushback.append(pair)
return pair
# ---- Header: (Z or 200+Z) NSHELL ---------------------------------
z_raw = int(nxt())
nshell = int(nxt())
if z_raw >= 200:
atom = CrystalAtomBasis(Z=z_raw - 200, has_ecp=True)
elif z_raw >= 1:
atom = CrystalAtomBasis(Z=z_raw, has_ecp=False)
else:
raise ValueError(f"{source}: invalid atomic number {z_raw}")
# ---- Optional ECP block (``INPUT`` keyword) ----------------------
# Peek the next token; if it is 'INPUT' we must consume the whole ECP
# block before the shells.
try:
first_pair = next_pair()
except ValueError:
first_pair = None
if first_pair is not None and first_pair[1].upper() == "INPUT":
# CRYSTAL Manual §"Effective core pseudo-potentials" (p. 84):
# line 1: ZNUC M M0 M1 M2 M3 M4 (effective charge + 6 counts)
# then M+M0+M1+M2+M3+M4 records, each: alpha coefficient n_factor
#
# M is the number of "local" terms (eq. 3.18); M0..M4 are the
# per-ℓ projections (eq. 3.19) for ℓ = 0..4. Source order in
# the file is local first, then ℓ=0, ℓ=1, … which we preserve
# in the resulting ``CrystalECPTerm`` list.
try:
znuc = float(nxt())
m_local = int(nxt())
m0 = int(nxt())
m1 = int(nxt())
m2 = int(nxt())
m3 = int(nxt())
m4 = int(nxt())
except (ValueError, StopIteration) as exc:
raise ValueError(
f"{source}: malformed CRYSTAL ECP INPUT header "
f"(expected ZNUC M M0 M1 M2 M3 M4): {exc}"
) from exc
ecp = CrystalECP(
znuc=znuc,
m_local=m_local,
m_per_ell=(m0, m1, m2, m3, m4),
)
# Read the grouped records in source order.
groups: list[tuple[object, int]] = [("local", m_local)]
for ell, m_ell in enumerate((m0, m1, m2, m3, m4)):
groups.append((ell, m_ell))
for ell_label, m_count in groups:
for _ in range(m_count):
try:
alpha = float(nxt())
coef = float(nxt())
n_pow = int(nxt())
except (ValueError, StopIteration) as exc:
raise ValueError(
f"{source}: malformed CRYSTAL ECP record for "
f"ell={ell_label!r}: {exc}"
) from exc
ecp.terms.append(
CrystalECPTerm(
ell=ell_label,
alpha=alpha,
coefficient=coef,
n_pow=n_pow,
)
)
atom.ecp = ecp
# The INPUT block is fully consumed; nothing is pushed back, so the
# shell loop below reads the first shell header directly.
elif first_pair is not None:
pushback.append(first_pair)
# ---- Shell loop ---------------------------------------------------
for shell_idx in range(nshell):
itype = int(nxt())
lat = int(nxt())
ng = int(nxt())
che = float(nxt())
scal_lineno, scal_tok = next_pair()
# The CRYSTAL shell header is exactly five tokens:
# ITYPE LAT NG CHE SCAL
# Any further token on the *same physical line* is anomalous. The one
# known case is the Bredow ``16_S`` typo where SCAL ``1.0`` was
# mis-keyed as two tokens ``1 0`` (decimal point typed as a space),
# i.e. ``0 3 1 0.0 1 0``. Without line awareness the stray ``0`` is
# silently consumed as the d-shell exponent and the real exponent
# (0.5207… for pob-TZVP, 0.4107… for pob-TZVP-rev2) as the
# coefficient — yielding a physically invalid exp=0 function with no
# error raised. Tolerate that exact ``<int> <int>`` → ``<int>.<int>``
# pattern — mirroring
# scripts/basisset_dev/pob_basis_verify.py:parse_crystal_file (the
# canonical verify/regenerate tool) so the two parsers agree — and
# reject anything else extra on the header line.
extra = peek_pair()
if extra is not None and extra[0] == scal_lineno:
if "." not in scal_tok and extra[1].isdigit():
next_pair() # consume the stray fractional digit
scal_tok = f"{scal_tok}.{extra[1]}"
else:
raise ValueError(
f"{source}: shell {shell_idx + 1}: unexpected extra token "
f"{extra[1]!r} on the shell-header line (expected exactly "
f"5 tokens: ITYPE LAT NG CHE SCAL)."
)
try:
scal = float(scal_tok)
except ValueError as exc:
raise ValueError(
f"{source}: shell {shell_idx + 1}: malformed SCAL value "
f"{scal_tok!r} in shell header: {exc}"
) from exc
if itype != 0:
raise ValueError(
f"{source}: shell {shell_idx + 1}: CRYSTAL built-in basis "
f"types (ITYPE != 0) are not supported — only user-defined "
f"contracted Gaussians."
)
shell_type = _LAT_TO_SHELL.get(lat)
if shell_type is None:
raise ValueError(
f"{source}: shell {shell_idx + 1}: unknown LAT code {lat}. "
f"Known: {_LAT_TO_SHELL}."
)
shell = CrystalShell(
shell_type=shell_type,
occupancy=che,
scale_factor=scal,
)
for _ in range(ng):
expn = float(nxt())
# Apply the CRYSTAL exponent-scale convention.
if scal != 1.0:
expn = expn * (scal**2)
# A Gaussian primitive exponent must be strictly positive. A zero
# or negative exponent is physically meaningless and is the
# tell-tale of a mis-parsed shell header silently shifting the
# token stream (see the SCAL-typo handling above). Fail loudly
# rather than emit a bogus diffuse function.
if expn <= 0.0:
raise ValueError(
f"{source}: shell {shell_idx + 1}: non-positive Gaussian "
f"exponent {expn!r}; basis primitive exponents must be "
f"> 0 (often a sign the shell header mis-parsed)."
)
if shell_type == "SP":
s_coef = float(nxt())
p_coef = float(nxt())
shell.exponents.append(expn)
shell.coefficients.append(s_coef)
shell.coefficients_p.append(p_coef)
else:
coef = float(nxt())
shell.exponents.append(expn)
shell.coefficients.append(coef)
atom.shells.append(shell)
return atom
[docs]
def parse_crystal_atom_basis_file(path: str | Path) -> CrystalAtomBasis:
"""Convenience wrapper: parse a CRYSTAL per-element file from disk."""
p = Path(path)
return parse_crystal_atom_basis(p.read_text(), source=str(p))
# --------------------------------------------------------------------------
# CRYSTAL → libecpint bridge (Phase 14g)
# --------------------------------------------------------------------------
[docs]
@dataclass(frozen=True)
class LibecpintInlineArrays:
"""Flat primitive arrays in the exact shape ``libecpint::ECPIntegrator::
set_ecp_basis(...)`` expects for one ECP center.
Mapping convention (per CRYSTAL23 §"Effective core pseudo-potentials"
p. 84, eqs. 3.18 + 3.19, against libecpint's XML convention where the
local channel sits at ``lval = max_proj_ell + 1``):
* CRYSTAL ``M`` (``ell="local"`` term group)
→ libecpint ``ams[i] = local_ell = max_proj_ell + 1``.
This is the U_L channel that acts on all angular momenta (no
projector applied), matching the local-potential semantics in
CRYSTAL eq. 3.18.
* CRYSTAL ``M_ℓ`` (``ell ∈ {0..4}`` term groups, with non-zero count)
→ libecpint ``ams[i] = ℓ``.
Projected channels; libecpint applies P_ℓ during integration.
Both conventions yield the same V_ECP matrix elements when summed
against the basis. Verified against ecp10mdf.xml (K, Z=19) shell
layout: local at lval=4 with placeholder zero, projectors at lval=
0..3 for s/p/d/f, max projector ell = 3.
"""
# Flattened per-primitive arrays. All four lists have the same
# length == n_primitives. Pass these straight to libecpint's
# ``set_ecp_basis(necps=1, ..., shell_lengths=&n_primitives)``
# (one ECP per atom-center; the n_primitives counts every primitive
# across all l channels for that ECP).
exponents: tuple[float, ...]
coefficients: tuple[float, ...]
ams: tuple[int, ...] # angular momentum per primitive
ns: tuple[int, ...] # r^n power per primitive
local_ell: int # libecpint l-index for CRYSTAL's M-local block
max_proj_ell: int # highest ℓ with non-zero M_ℓ in CRYSTAL input
@property
def n_primitives(self) -> int:
return len(self.exponents)
[docs]
def crystal_ecp_to_libecpint_arrays(ecp: CrystalECP) -> LibecpintInlineArrays:
"""Flatten a :class:`CrystalECP` into libecpint primitive arrays.
The conversion is bookkeeping — no algebraic rewrite of the ECP.
Each :class:`CrystalECPTerm` becomes one primitive with its
(alpha, coefficient, n_pow) carried over verbatim; only the
``ell`` label is re-keyed onto libecpint's per-primitive ``am``
convention (see :class:`LibecpintInlineArrays` for the mapping).
Phase 14g pairs this with a C++ ``compute_ecp_matrix_inline``
that calls libecpint's ``set_ecp_basis(...)`` (the inline-
primitive form) instead of ``set_ecp_basis_from_library(...)``
(the XML form used today). That unblocks the 5th-period pob
basis sets (Rb–I, Cs–Po) and any future inline-ECP archives
(vDZP, mixed-row dhf-*).
Parameters
----------
ecp
Parsed CRYSTAL ECP block from :func:`parse_crystal_atom_basis`.
Returns
-------
LibecpintInlineArrays
Bundle ready to feed libecpint's inline-primitive API.
Raises
------
ValueError
If a term carries an unrecognised ``ell`` label (not
``"local"`` and not in 0..4).
"""
# Determine the libecpint local-channel l-index. The XML convention
# places the local at ``max_proj_ell + 1`` so it sits strictly above
# every projector. When no projectors are present (rare; only valid
# for trivial test cases), fall back to local_ell=0 — libecpint will
# still treat the single channel as the local one since it's
# max-l.
max_proj_ell = -1
for ell_idx, m_count in enumerate(ecp.m_per_ell):
if m_count > 0:
max_proj_ell = max(max_proj_ell, ell_idx)
local_ell = max_proj_ell + 1 if max_proj_ell >= 0 else 0
exponents: list[float] = []
coefficients: list[float] = []
ams: list[int] = []
ns: list[int] = []
for term in ecp.terms:
if term.ell == "local":
assigned_l = local_ell
elif isinstance(term.ell, int) and 0 <= term.ell <= 4:
assigned_l = term.ell
else:
raise ValueError(
f"crystal_ecp_to_libecpint_arrays: term carries "
f"unrecognised ell={term.ell!r}; expected "
f"'local' or int in 0..4."
)
exponents.append(term.alpha)
coefficients.append(term.coefficient)
ams.append(assigned_l)
ns.append(term.n_pow)
return LibecpintInlineArrays(
exponents=tuple(exponents),
coefficients=tuple(coefficients),
ams=tuple(ams),
ns=tuple(ns),
local_ell=local_ell,
max_proj_ell=max_proj_ell,
)
# --------------------------------------------------------------------------
# NWChem / .g94 emitter
# --------------------------------------------------------------------------
[docs]
def emit_g94(atoms: Sequence[CrystalAtomBasis], header_comment: str = "") -> str:
"""Convert a list of per-element basis specs into a single .g94
(Gaussian / NWChem format) string that libint can load."""
lines: list[str] = []
if header_comment:
for ln in header_comment.splitlines():
lines.append(f"! {ln}")
lines.append("")
for atom in atoms:
if atom.has_ecp:
raise NotImplementedError(
"emit_g94: atoms with ECPs cannot be written to a pure "
"basis-only .g94 file yet — libecpint integration pending."
)
lines.append("****")
lines.append(f"{atom.element_symbol:<2s} 0")
for shell in atom.shells:
# NWChem format uses a leading keyword + NG + 1.00 (unused).
lines.append(f"{shell.shell_type:<2s} {shell.n_primitives:2d} 1.00")
for i, exp in enumerate(shell.exponents):
if shell.shell_type == "SP":
lines.append(
f" {exp:18.10f} {shell.coefficients[i]:18.10f}"
f" {shell.coefficients_p[i]:18.10f}"
)
else:
lines.append(f" {exp:18.10f} {shell.coefficients[i]:18.10f}")
lines.append("****")
lines.append("")
return "\n".join(lines)
# CRYSTAL LAT-code ← string lookup for the emitter.
_SHELL_TO_LAT = {"S": 0, "SP": 1, "P": 2, "D": 3, "F": 4, "G": 5}
# Default formal occupancies per shell type (used when CrystalShell.occupancy
# is 0.0 because the data came from a .g94 file that doesn't carry it).
_DEFAULT_OCCUPANCY = {"S": 2, "SP": 2, "P": 6, "D": 10, "F": 14, "G": 18}
def _occ(shell: CrystalShell) -> float:
"""Return formal electron count per shell for CRYSTAL14 neutrality.
CRYSTAL14 requires the sum of CHE values to equal the nuclear
charge. Use proper electron counts: 2.0 for s/sp, 6.0 for p,
10.0 for d, 14.0 for f. Polarisation shells (single-primitive,
occupancy=0.0) stay at 0.0.
"""
if shell.occupancy > 0.0:
return shell.occupancy
# Single-primitive shells with occ=0.0 are polarisation functions.
if shell.n_primitives == 1:
return 0.0
return float(_DEFAULT_OCCUPANCY.get(shell.shell_type, 0))
# --------------------------------------------------------------------------
# CRYSTAL inline-basis emitter (for .d12 files)
# --------------------------------------------------------------------------
[docs]
def emit_crystal(atoms: Sequence[CrystalAtomBasis]) -> str:
"""Convert per-element basis specs into CRYSTAL inline-basis format.
Produces text suitable for pasting into a CRYSTAL ``.d12`` input
deck after the geometry block (``ENDGEOM``). The format is::
Z NSHELL
0 LAT NPG OCC SCALE
exp1 coeff1
exp2 coeff2
...
99 0
END
where LAT = 0(S), 1(SP), 2(P), 3(D), 4(F), 5(G). For SP shells
each primitive line carries both s and p coefficients.
ECP-bearing atoms (``has_ecp=True``) are emitted with ``200+Z``
as the header number; the ECP block itself is **not** emitted
(ECP inline emission needs a separate function — libecpint
integration pending). All-electron atoms emit the raw Z.
Parameters
----------
atoms
Sequence of :class:`CrystalAtomBasis` objects, typically
parsed from the Bredow-group per-element CRYSTAL files via
:func:`parse_crystal_atom_basis` or :func:`parse_crystal_atom_basis_file`.
Returns
-------
str
Multi-line CRYSTAL basis-set block ready for insertion into
a ``.d12`` deck.
"""
blocks: list[str] = []
for atom in atoms:
z_header = atom.Z + 200 if atom.has_ecp else atom.Z
nshell = len(atom.shells)
lines: list[str] = [f"{z_header} {nshell}"]
for shell in atom.shells:
lat = _SHELL_TO_LAT.get(shell.shell_type)
if lat is None:
raise ValueError(
f"{atom.element_symbol}: unknown shell type {shell.shell_type!r}"
)
npg = shell.n_primitives
occ = _occ(shell)
scale = 0.0
lines.append(f"0 {lat} {npg:3d} {occ:.1f} {scale:.1f}")
for i in range(npg):
exp = shell.exponents[i]
if shell.shell_type == "SP":
c_s = shell.coefficients[i]
c_p = shell.coefficients_p[i]
lines.append(f"{exp:20.10f} {c_s:20.10f} {c_p:20.10f}")
else:
c = shell.coefficients[i]
lines.append(f"{exp:20.10f} {c:20.10f}")
blocks.append("\n".join(lines))
# Join per-element blocks with a blank line, then close with END.
return "\n".join(blocks) + "\n 99 0"
# --------------------------------------------------------------------------
# Bredow-group archive fetcher
# --------------------------------------------------------------------------
# Canonical registry of the all-electron Bredow archives (light elements,
# H–Br). ECP-bearing archives are known but not fetched until libecpint
# support lands; see _BREDOW_ARCHIVES_ECP below.
_BREDOW_BASE = "https://www.chemie.uni-bonn.de/bredow/de/software"
# Two modes of distribution:
# "archive": URL returns a tar.gz / zip archive.
# "listing": URL returns an HTML page with per-element subpages; every
# element page embeds its file contents inside a <p> block.
# The fetcher auto-dispatches on the magic bytes of the response.
_BREDOW_ARCHIVES = {
# key = local basis name → (URL, pretty-name)
"pob-tzvp": (f"{_BREDOW_BASE}/pob-tzvp-tar.gz", "pob-TZVP"),
"pob-tzvp-rev2": (f"{_BREDOW_BASE}/pob-tzvp-rev2-tar.gz", "pob-TZVP-rev2"),
# pob-DZVP-rev2 is served only as an HTML listing with embedded file
# bodies — the fetcher transparently scrapes it.
"pob-dzvp-rev2": (f"{_BREDOW_BASE}/pob-dzvp-rev2", "pob-DZVP-rev2"),
}
_BREDOW_ARCHIVES_ECP = {
# Not fetched today; listed for transparency. libecpint integration is a
# follow-on phase.
"pob-dzvp-rb-i": f"{_BREDOW_BASE}/pob-dzvp-rb-i-tar.gz",
"pob-tzvp-rb-i": f"{_BREDOW_BASE}/pob-tzvp-rb-i-tar.gz",
"pob-tzvp-rev2-rb-i": f"{_BREDOW_BASE}/pob-tzvp-ref2-rb-i-tar.gz",
"pob-tzvp-rev2-cs-po": f"{_BREDOW_BASE}/pob-tzvp-rev2-cs-po-tar.gz",
"pob-tzv-rev2-la-lu": f"{_BREDOW_BASE}/pob-tzv-rev2-la-lu-tar.gz",
}
# Per-basis citation text surfaced to the user on successful fetch.
_BREDOW_CITATIONS = {
"pob-tzvp": (
"Peintinger, Vilela Oliveira, Bredow. J. Comput. Chem. 34, 451–459 "
"(2013). DOI: 10.1002/jcc.23153"
),
"pob-tzvp-rev2": (
"Vilela Oliveira, Laun, Peintinger, Bredow. J. Comput. Chem. 40, "
"2364–2376 (2019). DOI: 10.1002/jcc.26013"
),
"pob-dzvp-rev2": (
"Vilela Oliveira, Laun, Peintinger, Bredow. J. Comput. Chem. 40, "
"2364–2376 (2019). DOI: 10.1002/jcc.26013"
),
}
def _download(url: str, timeout: float = 60.0, retries: int = 3) -> bytes:
"""Fetch a URL to memory; tolerates both tar.gz archives and single files.
The Bredow Plone server is fast for tar.gz archives and slower for
per-element HTML pages. Retries transparently on transient timeouts.
"""
import time
last_err: Exception | None = None
for attempt in range(retries):
try:
with urllib.request.urlopen(url, timeout=timeout) as resp:
return resp.read()
except Exception as err:
last_err = err
time.sleep(1.0 * (attempt + 1))
assert last_err is not None
raise last_err
_ELEMENT_LINK_RE = re.compile(
r'href="(?P<url>[^"]*?/(?P<name>\d{1,3}_[a-z]+))"',
re.IGNORECASE,
)
# File contents inside a Plone page are wrapped in a single <p>...</p> block
# with <br /> line separators. We extract that and normalize it to plain
# text. The content block is identified by starting with "ZZ_SYM" or by
# having the characteristic "Z N" first data line — we match the former.
_PLONE_FILE_BLOCK_RE = re.compile(
r"<p>\s*(?:<strong>[^<]*</strong>\s*<br\s*/>)?(?P<body>.*?)</p>",
re.IGNORECASE | re.DOTALL,
)
def _unescape_plone_body(body: str) -> str:
"""Convert a Plone <p>…<br />…</p> blob to plain text lines."""
# Collapse <br/> in any form to newline, drop remaining tags.
t = re.sub(r"<br\s*/?>", "\n", body)
t = re.sub(r"</?strong>", "", t)
t = re.sub(r"<[^>]+>", "", t) # any stray tag
# HTML entities we might encounter.
t = t.replace(" ", " ").replace("&", "&")
# Unicode non-breaking spaces frequently appear in scraped text.
t = t.replace("\u00a0", " ")
return t.strip()
def _iter_archive_members(data: bytes) -> Iterator[tuple[str, bytes]]:
"""Yield (name, bytes) for every member of a tar.gz or zip archive.
Bredow's server serves some URLs without extensions (raw tar.gz bytes);
we autodetect the magic bytes.
"""
# Tarball (gzip-compressed).
if data[:2] == b"\x1f\x8b":
with tarfile.open(fileobj=io.BytesIO(data), mode="r:gz") as tf:
for member in tf.getmembers():
if not member.isfile():
continue
f = tf.extractfile(member)
if f is None:
continue
yield member.name, f.read()
return
# Zip.
if data[:4] == b"PK\x03\x04":
with zipfile.ZipFile(io.BytesIO(data)) as zf:
for name in zf.namelist():
if name.endswith("/"):
continue
yield name, zf.read(name)
return
raise RuntimeError("unknown archive format (not tar.gz or zip)")
def _iter_plone_listing_members(
data: bytes, base_url: str
) -> Iterator[tuple[str, bytes]]:
"""Walk a Plone listing page and fetch every per-element subpage,
returning the embedded file body as bytes.
``base_url`` is the listing URL — used to resolve relative links and
to de-duplicate cross-references.
"""
html = data.decode("utf-8", errors="replace")
seen: set[str] = set()
for match in _ELEMENT_LINK_RE.finditer(html):
url = match.group("url")
name = match.group("name")
# Normalize: ``06_c`` → ``06_C`` so the parser's filename regex picks it up.
parts = name.split("_", 1)
if len(parts) == 2:
name = f"{parts[0]}_{parts[1].capitalize()}"
# Stay on the same Bredow basis listing — reject stray nav links.
if "/bredow/" not in url:
continue
if url in seen:
continue
seen.add(url)
sub_html = _download(url)
sub_text = sub_html.decode("utf-8", errors="replace")
# The Plone layout wraps the per-element page in several <p>
# blocks; the first typically contains just the element header
# ("<strong>06_C</strong><br />"), the next contains the actual
# basis. Iterate and pick the first body that looks like CRYSTAL
# input (starts with ZZ NSHELL and has numeric data).
body = ""
for m in _PLONE_FILE_BLOCK_RE.finditer(sub_text):
candidate = _unescape_plone_body(m.group("body"))
if not candidate:
continue
first_line = candidate.splitlines()[0].split()
if (
len(first_line) >= 2
and first_line[0].isdigit()
and first_line[1].isdigit()
):
body = candidate
break
if not body:
continue
yield name, body.encode("utf-8")
_FILENAME_RE = re.compile(r"^\s*(?:.*/)?(\d{2,3})_([A-Z][a-z]?)\s*$")
[docs]
def fetch_bredow_basis_sets(
dest_dir: str | Path | None = None,
names: Iterable[str] | None = None,
*,
verbose: bool = True,
) -> dict[str, Path]:
"""Download the Bredow group all-electron basis sets and convert them
to .g94 files loadable by libint.
Parameters
----------
dest_dir
Target directory for the .g94 output files. Defaults to
``basis_library/basis/`` inside the vibe-qc source tree (the path
already on ``LIBINT_DATA_PATH`` in editable installs).
names
Subset of basis-set keys to fetch (from ``_BREDOW_ARCHIVES``).
``None`` fetches all supported light-element archives.
verbose
Print progress + citation info.
Returns
-------
dict mapping basis name → path of the generated .g94 file.
"""
if dest_dir is None:
# Default target: basis_library/custom/ — this directory is the
# committed-to-repo staging area. scripts/setup_basis_library.sh
# copies custom/*.g94 on top of libint's stock bases to populate
# basis_library/basis/, which is what LIBINT_DATA_PATH points at.
# Writing here keeps the generated files under version control.
here = Path(__file__).resolve().parent.parent.parent
dest_dir = here / "basis_library" / "custom"
dest = Path(dest_dir)
dest.mkdir(parents=True, exist_ok=True)
keys = list(names) if names is not None else list(_BREDOW_ARCHIVES)
results: dict[str, Path] = {}
for key in keys:
if key not in _BREDOW_ARCHIVES:
raise KeyError(
f"Unknown Bredow basis key {key!r}. Known: {sorted(_BREDOW_ARCHIVES)}"
)
url, _inner = _BREDOW_ARCHIVES[key]
if verbose:
print(f"[fetch] {key:<15s} ← {url}")
data = _download(url)
# Autodetect: archive (tar.gz / zip) or HTML listing.
if data[:2] == b"\x1f\x8b" or data[:4] == b"PK\x03\x04":
members: Iterable[tuple[str, bytes]] = list(_iter_archive_members(data))
elif data.lstrip()[:5].lower() in (b"<!doc", b"<html"):
members = list(_iter_plone_listing_members(data, url))
else:
raise RuntimeError(f"{key}: unrecognised server response format")
# Parse every per-element file into a CrystalAtomBasis, sorted by Z.
atoms_by_z: dict[int, CrystalAtomBasis] = {}
for member_name, content in members:
m = _FILENAME_RE.match(member_name)
if not m:
continue # skip top-level directory entries or READMEs
atom = parse_crystal_atom_basis(
content.decode("utf-8", errors="replace"),
source=member_name,
)
atoms_by_z[atom.Z] = atom
if not atoms_by_z:
raise RuntimeError(f"{key}: archive contained no parseable elements")
ordered = [atoms_by_z[z] for z in sorted(atoms_by_z)]
citation = _BREDOW_CITATIONS.get(key, "")
header = (
f"vibeqc-generated from Bredow-group archive '{key}'.\n"
f"Source URL: {url}\n"
f"Elements: {', '.join(a.element_symbol for a in ordered)}\n"
+ (f"Cite: {citation}\n" if citation else "")
)
g94 = emit_g94(ordered, header_comment=header)
out = dest / f"{key}.g94"
out.write_text(g94)
results[key] = out
if verbose:
print(f"[write] {out} ({len(ordered)} elements)")
if citation:
print(f" cite: {citation}")
return results
# --------------------------------------------------------------------------
# CLI: python -m vibeqc.basis_crystal [fetch | convert <file.g94> <in.crystal>]
# --------------------------------------------------------------------------
def _main(argv: list[str]) -> int:
if not argv or argv[0] in ("-h", "--help"):
print(
"usage:\n"
" python -m vibeqc.basis_crystal fetch "
"[--dest DIR] [name ...]\n"
" Download Bredow-group basis sets and convert to "
".g94 for libint.\n"
" python -m vibeqc.basis_crystal convert IN.crystal OUT.g94\n"
" Convert a CRYSTAL per-element file to a single-atom "
".g94 block."
)
return 0
cmd, rest = argv[0], argv[1:]
if cmd == "fetch":
dest = None
if rest and rest[0] == "--dest":
dest = rest[1]
rest = rest[2:]
fetch_bredow_basis_sets(dest_dir=dest, names=rest if rest else None)
return 0
if cmd == "convert":
if len(rest) != 2:
print("convert expects 2 arguments: input output")
return 2
atom = parse_crystal_atom_basis_file(rest[0])
Path(rest[1]).write_text(
emit_g94([atom], header_comment=f"converted from {rest[0]}")
)
return 0
print(f"unknown command {cmd!r}")
return 2
if __name__ == "__main__":
import sys
sys.exit(_main(sys.argv[1:]))