"""CRYSTAL-format basis-set parser and NWChem/.g94 emitter.
Targets the per-element CRYSTAL basis-set files distributed by the
Bredow group (Peintinger--Vilela Oliveira--Bredow, ``pob-*``): one file
per element, named ``ZZ_Sym`` (e.g. ``06_C``), plain-text CRYSTAL input.
Supported today
---------------
All-electron shells (S, SP, P, D, F, G). These cover the light-element
Bredow sets (pob-TZVP and pob-DZVP-rev2 for H–Br). The emitter produces
NWChem/.g94 files that libint loads without modification.
Not yet supported
-----------------
ECP blocks (``INPUT`` header on Rb–I and Cs–Po pob-* archives). Parsing
them is trivial; *applying* them in vibe-qc needs libecpint integration,
which is a separate follow-on. Such files raise ``NotImplementedError``
for now.
CRYSTAL per-element format summary
----------------------------------
Line 1::
Z NSHELL
or for ECP-using atoms::
(200 + Z) NSHELL
Each shell starts with::
ITYPE LAT NG CHE SCAL
where
* ``ITYPE`` — 0 = user-defined (the only case the Bredow files use).
* ``LAT`` — shell angular-momentum code:
0=S, 1=SP, 2=P, 3=D, 4=F, 5=G.
* ``NG`` — number of primitive Gaussians.
* ``CHE`` — formal occupancy (electrons) assigned to this shell.
* ``SCAL`` — scale factor applied to the exponents (1.0 = as-written).
Followed by ``NG`` lines of ``exponent coefficient`` (or ``exponent
s_coef p_coef`` for SP shells).
"""
from __future__ import annotations
import io
import re
import tarfile
import urllib.request
import zipfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable, Iterator, Sequence
# CRYSTAL LAT-code → angular momentum label understood by libint's .g94 loader.
# Libint's BasisSet parser treats "SP" as a general-contraction s+p pair.
_LAT_TO_SHELL = {0: "S", 1: "SP", 2: "P", 3: "D", 4: "F", 5: "G"}
# Element symbols, 1-based index so _ELEMENT_SYMBOLS[Z] returns the symbol.
# Light-element subset — extend as needed; pob-TZVP only requires through Br.
_ELEMENT_SYMBOLS = [
"", "H", "He",
"Li", "Be", "B", "C", "N", "O", "F", "Ne",
"Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar",
"K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
"Ga", "Ge", "As", "Se", "Br", "Kr",
"Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd",
"In", "Sn", "Sb", "Te", "I", "Xe",
"Cs", "Ba",
"La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er",
"Tm", "Yb", "Lu",
"Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg",
"Tl", "Pb", "Bi", "Po",
]
[docs]
@dataclass
class CrystalShell:
"""One contracted Gaussian shell in CRYSTAL format."""
shell_type: str # "S", "SP", "P", "D", "F", "G"
occupancy: float # CHE
scale_factor: float # SCAL (exponents multiplied by this)
exponents: list[float] = field(default_factory=list)
coefficients: list[float] = field(default_factory=list)
# For SP shells only: the p-side coefficients. Length matches exponents;
# empty for non-SP shells. ``coefficients`` holds the s-side.
coefficients_p: list[float] = field(default_factory=list)
@property
def n_primitives(self) -> int:
return len(self.exponents)
[docs]
@dataclass
class CrystalAtomBasis:
"""Parsed per-atom basis set (one Bredow file)."""
Z: int # atomic number (without ECP offset)
has_ecp: bool # True if file had 200+Z header
shells: list[CrystalShell] = field(default_factory=list)
@property
def element_symbol(self) -> str:
return _ELEMENT_SYMBOLS[self.Z] if 0 < self.Z < len(_ELEMENT_SYMBOLS) \
else f"Z{self.Z}"
# --------------------------------------------------------------------------
# Parser
# --------------------------------------------------------------------------
def _tokens(stream: Iterator[str]) -> Iterator[tuple[int, str]]:
"""Yield (line_no, token) pairs, skipping blank lines and CRYSTAL
comments (``!`` introduces a comment-to-end-of-line, matching CRYSTAL)."""
for lineno, raw in enumerate(stream, start=1):
# Strip trailing comments and whitespace.
cleaned = raw.split("!", 1)[0]
for tok in cleaned.split():
yield lineno, tok
[docs]
def parse_crystal_atom_basis(text: str, source: str = "<string>") -> CrystalAtomBasis:
"""Parse a single CRYSTAL per-element basis file.
``source`` is used only in error messages.
"""
tokens = _tokens(iter(text.splitlines()))
def nxt(expect_numeric: bool = False) -> str:
try:
_, tok = next(tokens)
except StopIteration as exc:
raise ValueError(f"{source}: unexpected end of file") from exc
return tok
# ---- Header: (Z or 200+Z) NSHELL ---------------------------------
z_raw = int(nxt())
nshell = int(nxt())
if z_raw >= 200:
atom = CrystalAtomBasis(Z=z_raw - 200, has_ecp=True)
elif z_raw >= 1:
atom = CrystalAtomBasis(Z=z_raw, has_ecp=False)
else:
raise ValueError(f"{source}: invalid atomic number {z_raw}")
# ---- Optional ECP block (``INPUT`` keyword) ----------------------
# Peek the next token; if it is 'INPUT' we must consume the whole ECP
# block before the shells.
lookahead_queue: list[str] = []
try:
tok = nxt()
except ValueError:
tok = ""
if tok.upper() == "INPUT":
# Not yet supported end-to-end. Swallow the block so downstream
# parsers can see clear shells, but fail fast if the caller
# expects this atom to be usable.
raise NotImplementedError(
f"{source}: CRYSTAL ECP blocks (Rb–I, Cs–Po pob-* archives) "
f"are not yet supported. Use the Rb-I files only after the "
f"libecpint integration lands."
)
else:
lookahead_queue.append(tok)
def pop() -> str:
if lookahead_queue:
return lookahead_queue.pop(0)
return nxt()
# ---- Shell loop ---------------------------------------------------
for shell_idx in range(nshell):
itype = int(pop())
lat = int(pop())
ng = int(pop())
che = float(pop())
scal = float(pop())
if itype != 0:
raise ValueError(
f"{source}: shell {shell_idx + 1}: CRYSTAL built-in basis "
f"types (ITYPE != 0) are not supported — only user-defined "
f"contracted Gaussians."
)
shell_type = _LAT_TO_SHELL.get(lat)
if shell_type is None:
raise ValueError(
f"{source}: shell {shell_idx + 1}: unknown LAT code {lat}. "
f"Known: {_LAT_TO_SHELL}."
)
shell = CrystalShell(
shell_type=shell_type,
occupancy=che,
scale_factor=scal,
)
for _ in range(ng):
expn = float(pop())
# Apply the CRYSTAL exponent-scale convention.
if scal != 1.0:
expn = expn * (scal ** 2)
if shell_type == "SP":
s_coef = float(pop())
p_coef = float(pop())
shell.exponents.append(expn)
shell.coefficients.append(s_coef)
shell.coefficients_p.append(p_coef)
else:
coef = float(pop())
shell.exponents.append(expn)
shell.coefficients.append(coef)
atom.shells.append(shell)
return atom
[docs]
def parse_crystal_atom_basis_file(path: str | Path) -> CrystalAtomBasis:
"""Convenience wrapper: parse a CRYSTAL per-element file from disk."""
p = Path(path)
return parse_crystal_atom_basis(p.read_text(), source=str(p))
# --------------------------------------------------------------------------
# NWChem / .g94 emitter
# --------------------------------------------------------------------------
[docs]
def emit_g94(atoms: Sequence[CrystalAtomBasis],
header_comment: str = "") -> str:
"""Convert a list of per-element basis specs into a single .g94
(Gaussian / NWChem format) string that libint can load."""
lines: list[str] = []
if header_comment:
for ln in header_comment.splitlines():
lines.append(f"! {ln}")
lines.append("")
for atom in atoms:
if atom.has_ecp:
raise NotImplementedError(
"emit_g94: atoms with ECPs cannot be written to a pure "
"basis-only .g94 file yet — libecpint integration pending."
)
lines.append("****")
lines.append(f"{atom.element_symbol:<2s} 0")
for shell in atom.shells:
# NWChem format uses a leading keyword + NG + 1.00 (unused).
lines.append(f"{shell.shell_type:<2s} {shell.n_primitives:2d} 1.00")
for i, exp in enumerate(shell.exponents):
if shell.shell_type == "SP":
lines.append(
f" {exp:18.10f} {shell.coefficients[i]:18.10f}"
f" {shell.coefficients_p[i]:18.10f}"
)
else:
lines.append(
f" {exp:18.10f} {shell.coefficients[i]:18.10f}"
)
lines.append("****")
lines.append("")
return "\n".join(lines)
# --------------------------------------------------------------------------
# Bredow-group archive fetcher
# --------------------------------------------------------------------------
# Canonical registry of the all-electron Bredow archives (light elements,
# H–Br). ECP-bearing archives are known but not fetched until libecpint
# support lands; see _BREDOW_ARCHIVES_ECP below.
_BREDOW_BASE = "https://www.chemie.uni-bonn.de/bredow/de/software"
# Two modes of distribution:
# "archive": URL returns a tar.gz / zip archive.
# "listing": URL returns an HTML page with per-element subpages; every
# element page embeds its file contents inside a <p> block.
# The fetcher auto-dispatches on the magic bytes of the response.
_BREDOW_ARCHIVES = {
# key = local basis name → (URL, pretty-name)
"pob-tzvp": (f"{_BREDOW_BASE}/pob-tzvp-tar.gz", "pob-TZVP"),
"pob-tzvp-rev2": (f"{_BREDOW_BASE}/pob-tzvp-rev2-tar.gz", "pob-TZVP-rev2"),
# pob-DZVP-rev2 is served only as an HTML listing with embedded file
# bodies — the fetcher transparently scrapes it.
"pob-dzvp-rev2": (f"{_BREDOW_BASE}/pob-dzvp-rev2", "pob-DZVP-rev2"),
}
_BREDOW_ARCHIVES_ECP = {
# Not fetched today; listed for transparency. libecpint integration is a
# follow-on phase.
"pob-dzvp-rb-i": f"{_BREDOW_BASE}/pob-dzvp-rb-i-tar.gz",
"pob-tzvp-rb-i": f"{_BREDOW_BASE}/pob-tzvp-rb-i-tar.gz",
"pob-tzvp-rev2-rb-i": f"{_BREDOW_BASE}/pob-tzvp-ref2-rb-i-tar.gz",
"pob-tzvp-rev2-cs-po": f"{_BREDOW_BASE}/pob-tzvp-rev2-cs-po-tar.gz",
"pob-tzv-rev2-la-lu": f"{_BREDOW_BASE}/pob-tzv-rev2-la-lu-tar.gz",
}
# Per-basis citation text surfaced to the user on successful fetch.
_BREDOW_CITATIONS = {
"pob-tzvp": (
"Peintinger, Vilela Oliveira, Bredow. J. Comput. Chem. 34, 451–459 "
"(2013). DOI: 10.1002/jcc.23153"
),
"pob-tzvp-rev2": (
"Vilela Oliveira, Laun, Peintinger, Bredow. J. Comput. Chem. 40, "
"2364–2376 (2019). DOI: 10.1002/jcc.26013"
),
"pob-dzvp-rev2": (
"Vilela Oliveira, Laun, Peintinger, Bredow. J. Comput. Chem. 40, "
"2364–2376 (2019). DOI: 10.1002/jcc.26013"
),
}
def _download(url: str, timeout: float = 60.0,
retries: int = 3) -> bytes:
"""Fetch a URL to memory; tolerates both tar.gz archives and single files.
The Bredow Plone server is fast for tar.gz archives and slower for
per-element HTML pages. Retries transparently on transient timeouts.
"""
import time
last_err: Exception | None = None
for attempt in range(retries):
try:
with urllib.request.urlopen(url, timeout=timeout) as resp:
return resp.read()
except Exception as err:
last_err = err
time.sleep(1.0 * (attempt + 1))
assert last_err is not None
raise last_err
_ELEMENT_LINK_RE = re.compile(
r'href="(?P<url>[^"]*?/(?P<name>\d{1,3}_[a-z]+))"',
re.IGNORECASE,
)
# File contents inside a Plone page are wrapped in a single <p>...</p> block
# with <br /> line separators. We extract that and normalise it to plain
# text. The content block is identified by starting with "ZZ_SYM" or by
# having the characteristic "Z N" first data line — we match the former.
_PLONE_FILE_BLOCK_RE = re.compile(
r"<p>\s*(?:<strong>[^<]*</strong>\s*<br\s*/>)?(?P<body>.*?)</p>",
re.IGNORECASE | re.DOTALL,
)
def _unescape_plone_body(body: str) -> str:
"""Convert a Plone <p>…<br />…</p> blob to plain text lines."""
# Collapse <br/> in any form to newline, drop remaining tags.
t = re.sub(r"<br\s*/?>", "\n", body)
t = re.sub(r"</?strong>", "", t)
t = re.sub(r"<[^>]+>", "", t) # any stray tag
# HTML entities we might encounter.
t = t.replace(" ", " ").replace("&", "&")
# Unicode non-breaking spaces frequently appear in scraped text.
t = t.replace("\u00a0", " ")
return t.strip()
def _iter_archive_members(data: bytes) -> Iterator[tuple[str, bytes]]:
"""Yield (name, bytes) for every member of a tar.gz or zip archive.
Bredow's server serves some URLs without extensions (raw tar.gz bytes);
we autodetect the magic bytes.
"""
# Tarball (gzip-compressed).
if data[:2] == b"\x1f\x8b":
with tarfile.open(fileobj=io.BytesIO(data), mode="r:gz") as tf:
for member in tf.getmembers():
if not member.isfile():
continue
f = tf.extractfile(member)
if f is None:
continue
yield member.name, f.read()
return
# Zip.
if data[:4] == b"PK\x03\x04":
with zipfile.ZipFile(io.BytesIO(data)) as zf:
for name in zf.namelist():
if name.endswith("/"):
continue
yield name, zf.read(name)
return
raise RuntimeError("unknown archive format (not tar.gz or zip)")
def _iter_plone_listing_members(data: bytes,
base_url: str) -> Iterator[tuple[str, bytes]]:
"""Walk a Plone listing page and fetch every per-element subpage,
returning the embedded file body as bytes.
``base_url`` is the listing URL — used to resolve relative links and
to de-duplicate cross-references.
"""
html = data.decode("utf-8", errors="replace")
seen: set[str] = set()
for match in _ELEMENT_LINK_RE.finditer(html):
url = match.group("url")
name = match.group("name")
# Normalise: ``06_c`` → ``06_C`` so the parser's filename regex picks it up.
parts = name.split("_", 1)
if len(parts) == 2:
name = f"{parts[0]}_{parts[1].capitalize()}"
# Stay on the same Bredow basis listing — reject stray nav links.
if "/bredow/" not in url:
continue
if url in seen:
continue
seen.add(url)
sub_html = _download(url)
sub_text = sub_html.decode("utf-8", errors="replace")
# The Plone layout wraps the per-element page in several <p>
# blocks; the first typically contains just the element header
# ("<strong>06_C</strong><br />"), the next contains the actual
# basis. Iterate and pick the first body that looks like CRYSTAL
# input (starts with ZZ NSHELL and has numeric data).
body = ""
for m in _PLONE_FILE_BLOCK_RE.finditer(sub_text):
candidate = _unescape_plone_body(m.group("body"))
if not candidate:
continue
first_line = candidate.splitlines()[0].split()
if (len(first_line) >= 2
and first_line[0].isdigit()
and first_line[1].isdigit()):
body = candidate
break
if not body:
continue
yield name, body.encode("utf-8")
_FILENAME_RE = re.compile(r"^\s*(?:.*/)?(\d{2,3})_([A-Z][a-z]?)\s*$")
[docs]
def fetch_bredow_basis_sets(
dest_dir: str | Path | None = None,
names: Iterable[str] | None = None,
*,
verbose: bool = True) -> dict[str, Path]:
"""Download the Bredow group all-electron basis sets and convert them
to .g94 files loadable by libint.
Parameters
----------
dest_dir
Target directory for the .g94 output files. Defaults to
``basis_library/basis/`` inside the vibe-qc source tree (the path
already on ``LIBINT_DATA_PATH`` in editable installs).
names
Subset of basis-set keys to fetch (from ``_BREDOW_ARCHIVES``).
``None`` fetches all supported light-element archives.
verbose
Print progress + citation info.
Returns
-------
dict mapping basis name → path of the generated .g94 file.
"""
if dest_dir is None:
# Default target: basis_library/custom/ — this directory is the
# committed-to-repo staging area. scripts/setup_basis_library.sh
# copies custom/*.g94 on top of libint's stock bases to populate
# basis_library/basis/, which is what LIBINT_DATA_PATH points at.
# Writing here keeps the generated files under version control.
here = Path(__file__).resolve().parent.parent.parent
dest_dir = here / "basis_library" / "custom"
dest = Path(dest_dir)
dest.mkdir(parents=True, exist_ok=True)
keys = list(names) if names is not None else list(_BREDOW_ARCHIVES)
results: dict[str, Path] = {}
for key in keys:
if key not in _BREDOW_ARCHIVES:
raise KeyError(
f"Unknown Bredow basis key {key!r}. "
f"Known: {sorted(_BREDOW_ARCHIVES)}"
)
url, _inner = _BREDOW_ARCHIVES[key]
if verbose:
print(f"[fetch] {key:<15s} ← {url}")
data = _download(url)
# Autodetect: archive (tar.gz / zip) or HTML listing.
if data[:2] == b"\x1f\x8b" or data[:4] == b"PK\x03\x04":
members: Iterable[tuple[str, bytes]] = list(
_iter_archive_members(data))
elif data.lstrip()[:5].lower() in (b"<!doc", b"<html"):
members = list(_iter_plone_listing_members(data, url))
else:
raise RuntimeError(f"{key}: unrecognised server response format")
# Parse every per-element file into a CrystalAtomBasis, sorted by Z.
atoms_by_z: dict[int, CrystalAtomBasis] = {}
for member_name, content in members:
m = _FILENAME_RE.match(member_name)
if not m:
continue # skip top-level directory entries or READMEs
atom = parse_crystal_atom_basis(
content.decode("utf-8", errors="replace"),
source=member_name,
)
atoms_by_z[atom.Z] = atom
if not atoms_by_z:
raise RuntimeError(f"{key}: archive contained no parseable elements")
ordered = [atoms_by_z[z] for z in sorted(atoms_by_z)]
citation = _BREDOW_CITATIONS.get(key, "")
header = (
f"vibeqc-generated from Bredow-group archive '{key}'.\n"
f"Source URL: {url}\n"
f"Elements: {', '.join(a.element_symbol for a in ordered)}\n"
+ (f"Cite: {citation}\n" if citation else "")
)
g94 = emit_g94(ordered, header_comment=header)
out = dest / f"{key}.g94"
out.write_text(g94)
results[key] = out
if verbose:
print(f"[write] {out} ({len(ordered)} elements)")
if citation:
print(f" cite: {citation}")
return results
# --------------------------------------------------------------------------
# CLI: python -m vibeqc.basis_crystal [fetch | convert <file.g94> <in.crystal>]
# --------------------------------------------------------------------------
def _main(argv: list[str]) -> int:
if not argv or argv[0] in ("-h", "--help"):
print(
"usage:\n"
" python -m vibeqc.basis_crystal fetch "
"[--dest DIR] [name ...]\n"
" Download Bredow-group basis sets and convert to "
".g94 for libint.\n"
" python -m vibeqc.basis_crystal convert IN.crystal OUT.g94\n"
" Convert a CRYSTAL per-element file to a single-atom "
".g94 block."
)
return 0
cmd, rest = argv[0], argv[1:]
if cmd == "fetch":
dest = None
if rest and rest[0] == "--dest":
dest = rest[1]
rest = rest[2:]
fetch_bredow_basis_sets(dest_dir=dest,
names=rest if rest else None)
return 0
if cmd == "convert":
if len(rest) != 2:
print("convert expects 2 arguments: input output")
return 2
atom = parse_crystal_atom_basis_file(rest[0])
Path(rest[1]).write_text(
emit_g94([atom], header_comment=f"converted from {rest[0]}")
)
return 0
print(f"unknown command {cmd!r}")
return 2
if __name__ == "__main__":
import sys
sys.exit(_main(sys.argv[1:]))