Source code for vibeqc.basis_crystal

"""CRYSTAL-format basis-set parser and NWChem/.g94 emitter.

Targets the per-element CRYSTAL basis-set files distributed by the
Bredow group (Peintinger--Vilela Oliveira--Bredow, ``pob-*``): one file
per element, named ``ZZ_Sym`` (e.g. ``06_C``), plain-text CRYSTAL input.

Supported today
---------------
All-electron shells (S, SP, P, D, F, G). These cover the light-element
Bredow sets (pob-TZVP and pob-DZVP-rev2 for H–Br). The emitter produces
NWChem/.g94 files that libint loads without modification.

Not yet supported
-----------------
ECP blocks (``INPUT`` header on Rb–I and Cs–Po pob-* archives). Parsing
them is trivial; *applying* them in vibe-qc needs libecpint integration,
which is a separate follow-on. Such files raise ``NotImplementedError``
for now.

CRYSTAL per-element format summary
----------------------------------
Line 1::
    Z  NSHELL
or for ECP-using atoms::
    (200 + Z)  NSHELL

Each shell starts with::
    ITYPE  LAT  NG  CHE  SCAL

where

* ``ITYPE`` — 0 = user-defined (the only case the Bredow files use).
* ``LAT``   — shell angular-momentum code:
      0=S, 1=SP, 2=P, 3=D, 4=F, 5=G.
* ``NG``    — number of primitive Gaussians.
* ``CHE``   — formal occupancy (electrons) assigned to this shell.
* ``SCAL``  — scale factor applied to the exponents (1.0 = as-written).

Followed by ``NG`` lines of ``exponent coefficient`` (or ``exponent
s_coef p_coef`` for SP shells).
"""

from __future__ import annotations

import io
import re
import tarfile
import urllib.request
import zipfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable, Iterator, Sequence

# CRYSTAL LAT-code → angular momentum label understood by libint's .g94 loader.
# Libint's BasisSet parser treats "SP" as a general-contraction s+p pair.
_LAT_TO_SHELL = {0: "S", 1: "SP", 2: "P", 3: "D", 4: "F", 5: "G"}

# Element symbols, 1-based index so _ELEMENT_SYMBOLS[Z] returns the symbol.
# Light-element subset — extend as needed; pob-TZVP only requires through Br.
_ELEMENT_SYMBOLS = [
    "",   "H",  "He",
    "Li", "Be", "B",  "C",  "N",  "O",  "F",  "Ne",
    "Na", "Mg", "Al", "Si", "P",  "S",  "Cl", "Ar",
    "K",  "Ca", "Sc", "Ti", "V",  "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
    "Ga", "Ge", "As", "Se", "Br", "Kr",
    "Rb", "Sr", "Y",  "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd",
    "In", "Sn", "Sb", "Te", "I",  "Xe",
    "Cs", "Ba",
    "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er",
    "Tm", "Yb", "Lu",
    "Hf", "Ta", "W",  "Re", "Os", "Ir", "Pt", "Au", "Hg",
    "Tl", "Pb", "Bi", "Po",
]


[docs] @dataclass class CrystalShell: """One contracted Gaussian shell in CRYSTAL format.""" shell_type: str # "S", "SP", "P", "D", "F", "G" occupancy: float # CHE scale_factor: float # SCAL (exponents multiplied by this) exponents: list[float] = field(default_factory=list) coefficients: list[float] = field(default_factory=list) # For SP shells only: the p-side coefficients. Length matches exponents; # empty for non-SP shells. ``coefficients`` holds the s-side. coefficients_p: list[float] = field(default_factory=list) @property def n_primitives(self) -> int: return len(self.exponents)
[docs] @dataclass class CrystalAtomBasis: """Parsed per-atom basis set (one Bredow file).""" Z: int # atomic number (without ECP offset) has_ecp: bool # True if file had 200+Z header shells: list[CrystalShell] = field(default_factory=list) @property def element_symbol(self) -> str: return _ELEMENT_SYMBOLS[self.Z] if 0 < self.Z < len(_ELEMENT_SYMBOLS) \ else f"Z{self.Z}"
# -------------------------------------------------------------------------- # Parser # -------------------------------------------------------------------------- def _tokens(stream: Iterator[str]) -> Iterator[tuple[int, str]]: """Yield (line_no, token) pairs, skipping blank lines and CRYSTAL comments (``!`` introduces a comment-to-end-of-line, matching CRYSTAL).""" for lineno, raw in enumerate(stream, start=1): # Strip trailing comments and whitespace. cleaned = raw.split("!", 1)[0] for tok in cleaned.split(): yield lineno, tok
[docs] def parse_crystal_atom_basis(text: str, source: str = "<string>") -> CrystalAtomBasis: """Parse a single CRYSTAL per-element basis file. ``source`` is used only in error messages. """ tokens = _tokens(iter(text.splitlines())) def nxt(expect_numeric: bool = False) -> str: try: _, tok = next(tokens) except StopIteration as exc: raise ValueError(f"{source}: unexpected end of file") from exc return tok # ---- Header: (Z or 200+Z) NSHELL --------------------------------- z_raw = int(nxt()) nshell = int(nxt()) if z_raw >= 200: atom = CrystalAtomBasis(Z=z_raw - 200, has_ecp=True) elif z_raw >= 1: atom = CrystalAtomBasis(Z=z_raw, has_ecp=False) else: raise ValueError(f"{source}: invalid atomic number {z_raw}") # ---- Optional ECP block (``INPUT`` keyword) ---------------------- # Peek the next token; if it is 'INPUT' we must consume the whole ECP # block before the shells. lookahead_queue: list[str] = [] try: tok = nxt() except ValueError: tok = "" if tok.upper() == "INPUT": # Not yet supported end-to-end. Swallow the block so downstream # parsers can see clear shells, but fail fast if the caller # expects this atom to be usable. raise NotImplementedError( f"{source}: CRYSTAL ECP blocks (Rb–I, Cs–Po pob-* archives) " f"are not yet supported. Use the Rb-I files only after the " f"libecpint integration lands." ) else: lookahead_queue.append(tok) def pop() -> str: if lookahead_queue: return lookahead_queue.pop(0) return nxt() # ---- Shell loop --------------------------------------------------- for shell_idx in range(nshell): itype = int(pop()) lat = int(pop()) ng = int(pop()) che = float(pop()) scal = float(pop()) if itype != 0: raise ValueError( f"{source}: shell {shell_idx + 1}: CRYSTAL built-in basis " f"types (ITYPE != 0) are not supported — only user-defined " f"contracted Gaussians." ) shell_type = _LAT_TO_SHELL.get(lat) if shell_type is None: raise ValueError( f"{source}: shell {shell_idx + 1}: unknown LAT code {lat}. " f"Known: {_LAT_TO_SHELL}." ) shell = CrystalShell( shell_type=shell_type, occupancy=che, scale_factor=scal, ) for _ in range(ng): expn = float(pop()) # Apply the CRYSTAL exponent-scale convention. if scal != 1.0: expn = expn * (scal ** 2) if shell_type == "SP": s_coef = float(pop()) p_coef = float(pop()) shell.exponents.append(expn) shell.coefficients.append(s_coef) shell.coefficients_p.append(p_coef) else: coef = float(pop()) shell.exponents.append(expn) shell.coefficients.append(coef) atom.shells.append(shell) return atom
[docs] def parse_crystal_atom_basis_file(path: str | Path) -> CrystalAtomBasis: """Convenience wrapper: parse a CRYSTAL per-element file from disk.""" p = Path(path) return parse_crystal_atom_basis(p.read_text(), source=str(p))
# -------------------------------------------------------------------------- # NWChem / .g94 emitter # --------------------------------------------------------------------------
[docs] def emit_g94(atoms: Sequence[CrystalAtomBasis], header_comment: str = "") -> str: """Convert a list of per-element basis specs into a single .g94 (Gaussian / NWChem format) string that libint can load.""" lines: list[str] = [] if header_comment: for ln in header_comment.splitlines(): lines.append(f"! {ln}") lines.append("") for atom in atoms: if atom.has_ecp: raise NotImplementedError( "emit_g94: atoms with ECPs cannot be written to a pure " "basis-only .g94 file yet — libecpint integration pending." ) lines.append("****") lines.append(f"{atom.element_symbol:<2s} 0") for shell in atom.shells: # NWChem format uses a leading keyword + NG + 1.00 (unused). lines.append(f"{shell.shell_type:<2s} {shell.n_primitives:2d} 1.00") for i, exp in enumerate(shell.exponents): if shell.shell_type == "SP": lines.append( f" {exp:18.10f} {shell.coefficients[i]:18.10f}" f" {shell.coefficients_p[i]:18.10f}" ) else: lines.append( f" {exp:18.10f} {shell.coefficients[i]:18.10f}" ) lines.append("****") lines.append("") return "\n".join(lines)
# -------------------------------------------------------------------------- # Bredow-group archive fetcher # -------------------------------------------------------------------------- # Canonical registry of the all-electron Bredow archives (light elements, # H–Br). ECP-bearing archives are known but not fetched until libecpint # support lands; see _BREDOW_ARCHIVES_ECP below. _BREDOW_BASE = "https://www.chemie.uni-bonn.de/bredow/de/software" # Two modes of distribution: # "archive": URL returns a tar.gz / zip archive. # "listing": URL returns an HTML page with per-element subpages; every # element page embeds its file contents inside a <p> block. # The fetcher auto-dispatches on the magic bytes of the response. _BREDOW_ARCHIVES = { # key = local basis name → (URL, pretty-name) "pob-tzvp": (f"{_BREDOW_BASE}/pob-tzvp-tar.gz", "pob-TZVP"), "pob-tzvp-rev2": (f"{_BREDOW_BASE}/pob-tzvp-rev2-tar.gz", "pob-TZVP-rev2"), # pob-DZVP-rev2 is served only as an HTML listing with embedded file # bodies — the fetcher transparently scrapes it. "pob-dzvp-rev2": (f"{_BREDOW_BASE}/pob-dzvp-rev2", "pob-DZVP-rev2"), } _BREDOW_ARCHIVES_ECP = { # Not fetched today; listed for transparency. libecpint integration is a # follow-on phase. "pob-dzvp-rb-i": f"{_BREDOW_BASE}/pob-dzvp-rb-i-tar.gz", "pob-tzvp-rb-i": f"{_BREDOW_BASE}/pob-tzvp-rb-i-tar.gz", "pob-tzvp-rev2-rb-i": f"{_BREDOW_BASE}/pob-tzvp-ref2-rb-i-tar.gz", "pob-tzvp-rev2-cs-po": f"{_BREDOW_BASE}/pob-tzvp-rev2-cs-po-tar.gz", "pob-tzv-rev2-la-lu": f"{_BREDOW_BASE}/pob-tzv-rev2-la-lu-tar.gz", } # Per-basis citation text surfaced to the user on successful fetch. _BREDOW_CITATIONS = { "pob-tzvp": ( "Peintinger, Vilela Oliveira, Bredow. J. Comput. Chem. 34, 451–459 " "(2013). DOI: 10.1002/jcc.23153" ), "pob-tzvp-rev2": ( "Vilela Oliveira, Laun, Peintinger, Bredow. J. Comput. Chem. 40, " "2364–2376 (2019). DOI: 10.1002/jcc.26013" ), "pob-dzvp-rev2": ( "Vilela Oliveira, Laun, Peintinger, Bredow. J. Comput. Chem. 40, " "2364–2376 (2019). DOI: 10.1002/jcc.26013" ), } def _download(url: str, timeout: float = 60.0, retries: int = 3) -> bytes: """Fetch a URL to memory; tolerates both tar.gz archives and single files. The Bredow Plone server is fast for tar.gz archives and slower for per-element HTML pages. Retries transparently on transient timeouts. """ import time last_err: Exception | None = None for attempt in range(retries): try: with urllib.request.urlopen(url, timeout=timeout) as resp: return resp.read() except Exception as err: last_err = err time.sleep(1.0 * (attempt + 1)) assert last_err is not None raise last_err _ELEMENT_LINK_RE = re.compile( r'href="(?P<url>[^"]*?/(?P<name>\d{1,3}_[a-z]+))"', re.IGNORECASE, ) # File contents inside a Plone page are wrapped in a single <p>...</p> block # with <br /> line separators. We extract that and normalise it to plain # text. The content block is identified by starting with "ZZ_SYM" or by # having the characteristic "Z N" first data line — we match the former. _PLONE_FILE_BLOCK_RE = re.compile( r"<p>\s*(?:<strong>[^<]*</strong>\s*<br\s*/>)?(?P<body>.*?)</p>", re.IGNORECASE | re.DOTALL, ) def _unescape_plone_body(body: str) -> str: """Convert a Plone <p>…<br />…</p> blob to plain text lines.""" # Collapse <br/> in any form to newline, drop remaining tags. t = re.sub(r"<br\s*/?>", "\n", body) t = re.sub(r"</?strong>", "", t) t = re.sub(r"<[^>]+>", "", t) # any stray tag # HTML entities we might encounter. t = t.replace("&nbsp;", " ").replace("&amp;", "&") # Unicode non-breaking spaces frequently appear in scraped text. t = t.replace("\u00a0", " ") return t.strip() def _iter_archive_members(data: bytes) -> Iterator[tuple[str, bytes]]: """Yield (name, bytes) for every member of a tar.gz or zip archive. Bredow's server serves some URLs without extensions (raw tar.gz bytes); we autodetect the magic bytes. """ # Tarball (gzip-compressed). if data[:2] == b"\x1f\x8b": with tarfile.open(fileobj=io.BytesIO(data), mode="r:gz") as tf: for member in tf.getmembers(): if not member.isfile(): continue f = tf.extractfile(member) if f is None: continue yield member.name, f.read() return # Zip. if data[:4] == b"PK\x03\x04": with zipfile.ZipFile(io.BytesIO(data)) as zf: for name in zf.namelist(): if name.endswith("/"): continue yield name, zf.read(name) return raise RuntimeError("unknown archive format (not tar.gz or zip)") def _iter_plone_listing_members(data: bytes, base_url: str) -> Iterator[tuple[str, bytes]]: """Walk a Plone listing page and fetch every per-element subpage, returning the embedded file body as bytes. ``base_url`` is the listing URL — used to resolve relative links and to de-duplicate cross-references. """ html = data.decode("utf-8", errors="replace") seen: set[str] = set() for match in _ELEMENT_LINK_RE.finditer(html): url = match.group("url") name = match.group("name") # Normalise: ``06_c`` → ``06_C`` so the parser's filename regex picks it up. parts = name.split("_", 1) if len(parts) == 2: name = f"{parts[0]}_{parts[1].capitalize()}" # Stay on the same Bredow basis listing — reject stray nav links. if "/bredow/" not in url: continue if url in seen: continue seen.add(url) sub_html = _download(url) sub_text = sub_html.decode("utf-8", errors="replace") # The Plone layout wraps the per-element page in several <p> # blocks; the first typically contains just the element header # ("<strong>06_C</strong><br />"), the next contains the actual # basis. Iterate and pick the first body that looks like CRYSTAL # input (starts with ZZ NSHELL and has numeric data). body = "" for m in _PLONE_FILE_BLOCK_RE.finditer(sub_text): candidate = _unescape_plone_body(m.group("body")) if not candidate: continue first_line = candidate.splitlines()[0].split() if (len(first_line) >= 2 and first_line[0].isdigit() and first_line[1].isdigit()): body = candidate break if not body: continue yield name, body.encode("utf-8") _FILENAME_RE = re.compile(r"^\s*(?:.*/)?(\d{2,3})_([A-Z][a-z]?)\s*$")
[docs] def fetch_bredow_basis_sets( dest_dir: str | Path | None = None, names: Iterable[str] | None = None, *, verbose: bool = True) -> dict[str, Path]: """Download the Bredow group all-electron basis sets and convert them to .g94 files loadable by libint. Parameters ---------- dest_dir Target directory for the .g94 output files. Defaults to ``basis_library/basis/`` inside the vibe-qc source tree (the path already on ``LIBINT_DATA_PATH`` in editable installs). names Subset of basis-set keys to fetch (from ``_BREDOW_ARCHIVES``). ``None`` fetches all supported light-element archives. verbose Print progress + citation info. Returns ------- dict mapping basis name → path of the generated .g94 file. """ if dest_dir is None: # Default target: basis_library/custom/ — this directory is the # committed-to-repo staging area. scripts/setup_basis_library.sh # copies custom/*.g94 on top of libint's stock bases to populate # basis_library/basis/, which is what LIBINT_DATA_PATH points at. # Writing here keeps the generated files under version control. here = Path(__file__).resolve().parent.parent.parent dest_dir = here / "basis_library" / "custom" dest = Path(dest_dir) dest.mkdir(parents=True, exist_ok=True) keys = list(names) if names is not None else list(_BREDOW_ARCHIVES) results: dict[str, Path] = {} for key in keys: if key not in _BREDOW_ARCHIVES: raise KeyError( f"Unknown Bredow basis key {key!r}. " f"Known: {sorted(_BREDOW_ARCHIVES)}" ) url, _inner = _BREDOW_ARCHIVES[key] if verbose: print(f"[fetch] {key:<15s}{url}") data = _download(url) # Autodetect: archive (tar.gz / zip) or HTML listing. if data[:2] == b"\x1f\x8b" or data[:4] == b"PK\x03\x04": members: Iterable[tuple[str, bytes]] = list( _iter_archive_members(data)) elif data.lstrip()[:5].lower() in (b"<!doc", b"<html"): members = list(_iter_plone_listing_members(data, url)) else: raise RuntimeError(f"{key}: unrecognised server response format") # Parse every per-element file into a CrystalAtomBasis, sorted by Z. atoms_by_z: dict[int, CrystalAtomBasis] = {} for member_name, content in members: m = _FILENAME_RE.match(member_name) if not m: continue # skip top-level directory entries or READMEs atom = parse_crystal_atom_basis( content.decode("utf-8", errors="replace"), source=member_name, ) atoms_by_z[atom.Z] = atom if not atoms_by_z: raise RuntimeError(f"{key}: archive contained no parseable elements") ordered = [atoms_by_z[z] for z in sorted(atoms_by_z)] citation = _BREDOW_CITATIONS.get(key, "") header = ( f"vibeqc-generated from Bredow-group archive '{key}'.\n" f"Source URL: {url}\n" f"Elements: {', '.join(a.element_symbol for a in ordered)}\n" + (f"Cite: {citation}\n" if citation else "") ) g94 = emit_g94(ordered, header_comment=header) out = dest / f"{key}.g94" out.write_text(g94) results[key] = out if verbose: print(f"[write] {out} ({len(ordered)} elements)") if citation: print(f" cite: {citation}") return results
# -------------------------------------------------------------------------- # CLI: python -m vibeqc.basis_crystal [fetch | convert <file.g94> <in.crystal>] # -------------------------------------------------------------------------- def _main(argv: list[str]) -> int: if not argv or argv[0] in ("-h", "--help"): print( "usage:\n" " python -m vibeqc.basis_crystal fetch " "[--dest DIR] [name ...]\n" " Download Bredow-group basis sets and convert to " ".g94 for libint.\n" " python -m vibeqc.basis_crystal convert IN.crystal OUT.g94\n" " Convert a CRYSTAL per-element file to a single-atom " ".g94 block." ) return 0 cmd, rest = argv[0], argv[1:] if cmd == "fetch": dest = None if rest and rest[0] == "--dest": dest = rest[1] rest = rest[2:] fetch_bredow_basis_sets(dest_dir=dest, names=rest if rest else None) return 0 if cmd == "convert": if len(rest) != 2: print("convert expects 2 arguments: input output") return 2 atom = parse_crystal_atom_basis_file(rest[0]) Path(rest[1]).write_text( emit_g94([atom], header_comment=f"converted from {rest[0]}") ) return 0 print(f"unknown command {cmd!r}") return 2 if __name__ == "__main__": import sys sys.exit(_main(sys.argv[1:]))