Skip to content

API Reference

vflank can be used as a library. The pieces compose directly — see the Developer Guide for runnable examples. Documentation is generated from the source docstrings.

Core

vflank.core.variant

vflank.core.variant

The :class:Variant value object and per-variant validation.

Decoupling the pipeline from pandas rows (the original script iterated df.iterrows() directly) makes the core logic testable with plain objects.

Variant dataclass

A single small variant in canonical internal form.

Coordinates are 1-based, fully-closed [start, end] (MAF convention). chrom is the bare chromosome; raw_chrom preserves the original display notation for headers/messages only.

Source code in src/vflank/core/variant.py
@dataclass(slots=True)
class Variant:
    """A single small variant in canonical internal form.

    Coordinates are 1-based, fully-closed ``[start, end]`` (MAF convention).
    ``chrom`` is the *bare* chromosome; ``raw_chrom`` preserves the original
    display notation for headers/messages only.
    """

    chrom: str
    start: int
    end: int
    ref: str
    alt: str
    gene: str = "UNKNOWN"
    protein: str = ""
    cdna: str = ""
    sample: str = "SAMPLE"
    raw_chrom: str = ""

    def __post_init__(self) -> None:
        if not self.raw_chrom:
            self.raw_chrom = self.chrom

validate_allele

validate_allele(allele)

True if an allele is a valid base string (or the '-'/'.' placeholders).

Source code in src/vflank/core/variant.py
def validate_allele(allele: str) -> bool:
    """True if an allele is a valid base string (or the '-'/'.' placeholders)."""
    if not allele or allele in (".", "-"):
        return True
    return all(b in VALID_BASES for b in allele)

validate_coordinates

validate_coordinates(start, end)

Return an error message if coordinates are invalid, else None.

Source code in src/vflank/core/variant.py
def validate_coordinates(start: int, end: int) -> str | None:
    """Return an error message if coordinates are invalid, else None."""
    if start < 1:
        return f"start={start} is < 1"
    if end < start:
        return f"end={end} < start={start}"
    return None

vflank.core.chrom

vflank.core.chrom

Chromosome notation detection and normalisation.

All functions here are pure (no I/O) so they are trivially unit-testable. The canonical internal form is the bare chromosome string (e.g. "7", "X", "MT"); callers convert to chr-prefixed form for a given FASTA/VCF at the last moment via :func:chrom_for_contigs.

normalise_chrom

normalise_chrom(raw_chrom)

Convert a raw chromosome value to a canonical bare chrom string.

Handles NaN/None/empty, any case of a chr prefix, bare integers, numeric sex/mito encodings (23->X, 24->Y, 25/26->MT), and mito aliases (M->MT).

Returns (bare_chrom, error). If error is not None the value could not be normalised and bare_chrom is None (the variant should be skipped).

Source code in src/vflank/core/chrom.py
def normalise_chrom(raw_chrom: object) -> tuple[str | None, str | None]:
    """Convert a raw chromosome value to a canonical bare chrom string.

    Handles NaN/None/empty, any case of a ``chr`` prefix, bare integers, numeric
    sex/mito encodings (23->X, 24->Y, 25/26->MT), and mito aliases (M->MT).

    Returns ``(bare_chrom, error)``. If ``error`` is not None the value could not
    be normalised and ``bare_chrom`` is None (the variant should be skipped).
    """
    if raw_chrom is None:
        return None, "chromosome is None"
    if isinstance(raw_chrom, float) and math.isnan(raw_chrom):
        return None, "chromosome is NaN (missing value)"

    # Pandas types any chromosome column containing a NaN as float, turning
    # "17" into 17.0 (and numpy.float64 subclasses float). Recover the integer
    # form so numeric chromosomes normalise instead of being rejected as "17.0".
    if isinstance(raw_chrom, float):
        raw_chrom = int(raw_chrom)

    chrom = str(raw_chrom).strip()
    if not chrom or chrom.lower() in ("nan", "none", ".", ""):
        return None, f"chromosome is empty or missing (got {raw_chrom!r})"

    # Same artifact surviving as a string, e.g. "17.0" read from a text cell.
    if chrom.endswith(".0") and chrom[:-2].isdigit():
        chrom = chrom[:-2]

    upper = chrom.upper()

    # Mitochondrial aliases first (before chr-stripping, to catch 'chrM').
    if upper in _MITO_ALIASES:
        return _MITO_ALIASES[upper], None

    # Case-insensitive chr-prefix stripping; preserve case of the remainder.
    bare = chrom[3:] if upper.startswith("CHR") else chrom

    # Numeric alternative encodings (23->X, 24->Y, 25/26->MT).
    bare = _NUMERIC_CHROM_MAP.get(bare, bare)

    # Uppercase for X/Y/MT consistency.
    if bare.upper() in ("X", "Y", "MT"):
        bare = bare.upper()

    if bare not in VALID_CHROMS:
        return None, (
            f"unrecognised chromosome value {raw_chrom!r} "
            f"(normalised to {bare!r}, not in {sorted(VALID_CHROMS)})"
        )
    return bare, None

contigs_have_chr

contigs_have_chr(contigs)

Return True if any contig in an iterable is chr-prefixed.

Probes chr1/1 first, then falls back through chr2-chr5. Defaults to True (assume prefixed) when undeterminable.

Source code in src/vflank/core/chrom.py
def contigs_have_chr(contigs) -> bool:
    """Return True if any contig in an iterable is ``chr``-prefixed.

    Probes ``chr1``/``1`` first, then falls back through ``chr2``-``chr5``.
    Defaults to True (assume prefixed) when undeterminable.
    """
    refs = set(contigs)
    if "chr1" in refs:
        return True
    if "1" in refs:
        return False
    for i in range(2, 6):
        if f"chr{i}" in refs:
            return True
        if str(i) in refs:
            return False
    return True

chrom_for_contigs

chrom_for_contigs(bare, has_chr)

Convert a bare chromosome to the notation used by a FASTA/VCF.

Source code in src/vflank/core/chrom.py
def chrom_for_contigs(bare: str, has_chr: bool) -> str:
    """Convert a bare chromosome to the notation used by a FASTA/VCF."""
    return f"chr{bare}" if has_chr else bare

detect_series_chr_style

detect_series_chr_style(values)

Inspect an iterable of raw chromosome values for reporting.

Returns True (chr-prefixed), False (bare), or None (unknown/mixed/empty).

Source code in src/vflank/core/chrom.py
def detect_series_chr_style(values) -> bool | None:
    """Inspect an iterable of raw chromosome values for reporting.

    Returns True (chr-prefixed), False (bare), or None (unknown/mixed/empty).
    """
    for val in values:
        if val is None:
            continue
        if isinstance(val, float) and math.isnan(val):
            continue
        v = str(val).strip()
        if not v:
            continue
        if v.lower().startswith("chr"):
            return True
        if v in VALID_CHROMS or v in _NUMERIC_CHROM_MAP:
            return False
    return None

vflank.core.flanks

vflank.core.flanks

Flank extraction and masking.

A :class:FlankSource is the strategy seam for where each flank base comes from. The reference-backed source implemented here covers modes A (reference) and B (reference + population mask). Mode C/D (BAM consensus) will add a ConsensusFlankSource implementing the same protocol.

FlankResult dataclass

Left/right flanks, raw and masked, for one variant.

Source code in src/vflank/core/flanks.py
@dataclass(slots=True)
class FlankResult:
    """Left/right flanks, raw and masked, for one variant."""

    left: str
    right: str
    masked_left: str
    masked_right: str
    covered: int | None = None   # flank positions at/above min_depth (BAM consensus)
    total: int | None = None     # total flank positions (BAM consensus)
    inserted: int | None = None  # flank positions masked for a patient insertion

    @property
    def n_masked(self) -> int:
        return self.masked_left.count("N") + self.masked_right.count("N")

    @property
    def n_corrected(self) -> int:
        """Flank positions where the masked seq is a real base differing from raw."""
        return sum(
            m != r and m != "N"
            for raw, msk in ((self.left, self.masked_left), (self.right, self.masked_right))
            for r, m in zip(raw, msk, strict=False)
        )

    def upper(self) -> FlankResult:
        """Return an uppercased copy (presentation convenience for callers)."""
        return FlankResult(
            self.left.upper(),
            self.right.upper(),
            self.masked_left.upper(),
            self.masked_right.upper(),
            self.covered,
            self.total,
            self.inserted,
        )

n_corrected property

n_corrected

Flank positions where the masked seq is a real base differing from raw.

upper

upper()

Return an uppercased copy (presentation convenience for callers).

Source code in src/vflank/core/flanks.py
def upper(self) -> FlankResult:
    """Return an uppercased copy (presentation convenience for callers)."""
    return FlankResult(
        self.left.upper(),
        self.right.upper(),
        self.masked_left.upper(),
        self.masked_right.upper(),
        self.covered,
        self.total,
        self.inserted,
    )

FlankSource

Bases: Protocol

Strategy for producing flanks around a variant.

Source code in src/vflank/core/flanks.py
class FlankSource(Protocol):
    """Strategy for producing flanks around a variant."""

    def fetch(self, variant: Variant) -> FlankResult: ...

ReferenceFlankSource

Flanks pulled from the reference FASTA, optionally masking common SNPs.

MAF coordinates are 1-based fully-closed [start, end]; pysam uses 0-based half-open [start, end). The left flank is the flank bases ending just before the variant; the right flank is the flank bases starting just after it. The variant interval itself is excluded from both flanks.

Source code in src/vflank/core/flanks.py
class ReferenceFlankSource:
    """Flanks pulled from the reference FASTA, optionally masking common SNPs.

    MAF coordinates are 1-based fully-closed ``[start, end]``; pysam uses 0-based
    half-open ``[start, end)``. The left flank is the ``flank`` bases ending just
    before the variant; the right flank is the ``flank`` bases starting just
    after it. The variant interval itself is excluded from both flanks.
    """

    def __init__(self, reference, gnomad=None, *, flank: int = 200, af_threshold: float = 0.001):
        self.reference = reference
        self.gnomad = gnomad
        self.flank = flank
        self.af_threshold = af_threshold

    def fetch(self, variant: Variant) -> FlankResult:
        left_start_0 = max(0, variant.start - self.flank - 1)
        left_end_0 = variant.start - 1
        right_start_0 = variant.end
        right_end_0 = variant.end + self.flank

        left = self.reference.fetch(variant.chrom, left_start_0, left_end_0)
        right = self.reference.fetch(variant.chrom, right_start_0, right_end_0)

        if self.gnomad is None:
            return FlankResult(left, right, left, right)

        left_snps = self.gnomad.get_positions(
            variant.chrom, left_start_0, left_end_0, self.af_threshold
        )
        right_snps = self.gnomad.get_positions(
            variant.chrom, right_start_0, right_end_0, self.af_threshold
        )
        return FlankResult(
            left,
            right,
            mask_sequence(left, left_start_0, left_snps),
            mask_sequence(right, right_start_0, right_snps),
        )

mask_sequence

mask_sequence(seq, region_start_0based, positions_1based)

Replace bases at the given 1-based genomic positions with 'N'.

seq[0] corresponds to genomic position region_start_0based + 1.

Source code in src/vflank/core/flanks.py
def mask_sequence(seq: str, region_start_0based: int, positions_1based: list[int]) -> str:
    """Replace bases at the given 1-based genomic positions with 'N'.

    ``seq[0]`` corresponds to genomic position ``region_start_0based + 1``.
    """
    if not positions_1based:
        return seq
    chars = list(seq)
    for pos in positions_1based:
        idx = pos - region_start_0based - 1
        if 0 <= idx < len(chars):
            chars[idx] = "N"
    return "".join(chars)

vflank.core.popfreq

vflank.core.popfreq

Population allele-frequency masking source (gnomAD), local-VCF backend.

Masking can draw on gnomAD genome and/or exome data (--pop-data): flanks often fall in non-coding regions where only genomes have data, while exomes add power in coding regions. both masks the union (a position is masked if it is a common SNP in either cohort).

The hot line-parsing kernel (:func:parse_common_snp_positions) is a pure function over an iterable of raw VCF lines, so it is unit-testable without pysam and is the natural seam to later swap for a Rust/noodles implementation.

GnomadStore

Lazy, per-(kind, chromosome) tabix cache over a directory of gnomAD VCFs.

Honours --pop-data (genome / exome / both). For both, queried positions are the union across the two cohorts. Tabix handles are opened on first use; each file's chr-notation is detected on open so queries use the file's own contig names.

Source code in src/vflank/core/popfreq.py
class GnomadStore:
    """Lazy, per-(kind, chromosome) tabix cache over a directory of gnomAD VCFs.

    Honours ``--pop-data`` (genome / exome / both). For ``both``, queried
    positions are the union across the two cohorts. Tabix handles are opened on
    first use; each file's chr-notation is detected on open so queries use the
    file's own contig names.
    """

    def __init__(self, pop_vcf_dir: Path, genome_build: str, pop_data: str = "genome") -> None:
        self.dir = pop_vcf_dir
        self.build = genome_build
        self.kinds = kinds_for(pop_data)  # raises ValueError on bad input
        self._cache: dict[tuple[str, str], Any] = {}  # (kind, bare) -> TabixFile|None
        self._has_chr: dict[tuple[str, str], bool] = {}

    def preflight(self, chroms: Iterable[str]) -> None:
        """Fail fast if a requested data kind is wholly absent for these chroms.

        This is what prevents a silent genome-only fallback when ``--pop-data
        exome``/``both`` is requested without the exome files present. Per-chrom
        gaps (some chromosomes missing) remain warnings at query time.
        """
        chroms = list(chroms)
        missing = [
            kind for kind in self.kinds
            if not any(resolve_vcf_for_chrom(self.dir, c, self.build, kind) for c in chroms)
        ]
        if missing:
            raise PopFreqError(
                f"No {'/'.join(missing)} gnomAD VCF(s) found in {self.dir} for "
                f"build {self.build} (expected e.g. "
                f"{example_filename(self.build, missing[0])}). "
                f"Download them or choose a different --pop-data."
            )

    def _get(self, kind: str, bare: str):
        key = (kind, bare)
        if key in self._cache:
            return self._cache[key]

        path = resolve_vcf_for_chrom(self.dir, bare, self.build, kind)
        if path is None:
            log.warning(
                "No %s VCF for chr%s in %s%s masking skipped for this chromosome",
                kind, bare, self.dir.name, kind,
            )
            self._cache[key] = None
            return None

        if not Path(str(path) + ".tbi").exists():
            log.warning(
                "TBI index missing: %s%s masking skipped for chr%s (fix: tabix -p vcf %s)",
                path.name, kind, bare, path,
            )
            self._cache[key] = None
            return None

        try:
            import pysam

            tbx = pysam.TabixFile(str(path))
        except Exception as exc:  # noqa: BLE001 - surface as a warning, keep going
            log.warning("Could not open %s: %s", path.name, exc)
            self._cache[key] = None
            return None

        self._cache[key] = tbx
        self._has_chr[key] = any(c.startswith("chr") for c in tbx.contigs)
        log.debug(
            "Opened %s %s (contigs: %s)",
            kind, path.name, "chr-prefixed" if self._has_chr[key] else "bare",
        )
        return tbx

    def get_positions(
        self, bare: str, start_0based: int, end_0based: int, af_threshold: float
    ) -> list[int]:
        """1-based positions of common SNPs in ``[start, end)`` — union over kinds."""
        positions: set[int] = set()
        for kind in self.kinds:
            tbx = self._get(kind, bare)
            if tbx is None:
                continue
            contig = f"chr{bare}" if self._has_chr.get((kind, bare)) else bare
            try:
                rows = tbx.fetch(contig, start_0based, end_0based)
            except (ValueError, KeyError):
                # Contig absent in this VCF (e.g. MT). Expected often enough to be
                # DEBUG, not WARNING — but never wholly silent.
                log.debug(
                    "Contig %r not in %s VCF for chr%s — no masking for this region",
                    contig, kind, bare,
                )
                continue
            positions.update(parse_common_snp_positions(rows, af_threshold))
        return sorted(positions)

    def close(self) -> None:
        for tbx in self._cache.values():
            if tbx is not None:
                tbx.close()

preflight

preflight(chroms)

Fail fast if a requested data kind is wholly absent for these chroms.

This is what prevents a silent genome-only fallback when --pop-data exome/both is requested without the exome files present. Per-chrom gaps (some chromosomes missing) remain warnings at query time.

Source code in src/vflank/core/popfreq.py
def preflight(self, chroms: Iterable[str]) -> None:
    """Fail fast if a requested data kind is wholly absent for these chroms.

    This is what prevents a silent genome-only fallback when ``--pop-data
    exome``/``both`` is requested without the exome files present. Per-chrom
    gaps (some chromosomes missing) remain warnings at query time.
    """
    chroms = list(chroms)
    missing = [
        kind for kind in self.kinds
        if not any(resolve_vcf_for_chrom(self.dir, c, self.build, kind) for c in chroms)
    ]
    if missing:
        raise PopFreqError(
            f"No {'/'.join(missing)} gnomAD VCF(s) found in {self.dir} for "
            f"build {self.build} (expected e.g. "
            f"{example_filename(self.build, missing[0])}). "
            f"Download them or choose a different --pop-data."
        )

get_positions

get_positions(bare, start_0based, end_0based, af_threshold)

1-based positions of common SNPs in [start, end) — union over kinds.

Source code in src/vflank/core/popfreq.py
def get_positions(
    self, bare: str, start_0based: int, end_0based: int, af_threshold: float
) -> list[int]:
    """1-based positions of common SNPs in ``[start, end)`` — union over kinds."""
    positions: set[int] = set()
    for kind in self.kinds:
        tbx = self._get(kind, bare)
        if tbx is None:
            continue
        contig = f"chr{bare}" if self._has_chr.get((kind, bare)) else bare
        try:
            rows = tbx.fetch(contig, start_0based, end_0based)
        except (ValueError, KeyError):
            # Contig absent in this VCF (e.g. MT). Expected often enough to be
            # DEBUG, not WARNING — but never wholly silent.
            log.debug(
                "Contig %r not in %s VCF for chr%s — no masking for this region",
                contig, kind, bare,
            )
            continue
        positions.update(parse_common_snp_positions(rows, af_threshold))
    return sorted(positions)

kinds_for

kinds_for(pop_data)

Map a --pop-data value to the data kinds it consults.

genome/exome -> that one; both -> both. Raises on anything else.

Source code in src/vflank/core/popfreq.py
def kinds_for(pop_data: str) -> tuple[str, ...]:
    """Map a ``--pop-data`` value to the data kinds it consults.

    ``genome``/``exome`` -> that one; ``both`` -> both. Raises on anything else.
    """
    if pop_data == "both":
        return POP_DATA_KINDS
    if pop_data in POP_DATA_KINDS:
        return (pop_data,)
    raise ValueError(f"invalid pop_data {pop_data!r}; expected genome|exome|both")

resolve_vcf_for_chrom

resolve_vcf_for_chrom(pop_vcf_dir, bare_chrom, genome_build, data_kind)

Return the first existing gnomAD file for this chrom/build/kind, else None.

Source code in src/vflank/core/popfreq.py
def resolve_vcf_for_chrom(
    pop_vcf_dir: Path, bare_chrom: str, genome_build: str, data_kind: str
) -> Path | None:
    """Return the first existing gnomAD file for this chrom/build/kind, else None."""
    for pattern in GNOMAD_PATTERNS.get(genome_build, {}).get(data_kind, []):
        candidate = pop_vcf_dir / pattern.format(chrom=bare_chrom)
        if candidate.exists():
            return candidate
    return None

example_filename

example_filename(genome_build, data_kind)

A representative expected filename (chr1), for error/help messages.

Source code in src/vflank/core/popfreq.py
def example_filename(genome_build: str, data_kind: str) -> str:
    """A representative expected filename (chr1), for error/help messages."""
    patterns = GNOMAD_PATTERNS.get(genome_build, {}).get(data_kind, [])
    return patterns[0].format(chrom="1") if patterns else f"gnomad.{data_kind}s.*.vcf.bgz"

build_chrom_vcf_map

build_chrom_vcf_map(pop_vcf_dir, genome_build, chroms, data_kind)

Resolve the VCF (of one kind) for each chromosome (for coverage reports).

Source code in src/vflank/core/popfreq.py
def build_chrom_vcf_map(
    pop_vcf_dir: Path, genome_build: str, chroms: list[str], data_kind: str
) -> dict[str, Path | None]:
    """Resolve the VCF (of one kind) for each chromosome (for coverage reports)."""
    return {c: resolve_vcf_for_chrom(pop_vcf_dir, c, genome_build, data_kind) for c in chroms}

parse_common_snp_positions

parse_common_snp_positions(rows, af_threshold)

Return 1-based positions of SNPs whose max AF/AF_grpmax >= threshold.

Only single-base substitutions are considered (REF and every ALT length 1). rows is an iterable of raw tab-delimited VCF data lines. Works for both genome and exome gnomAD VCFs (identical INFO AF fields).

Source code in src/vflank/core/popfreq.py
def parse_common_snp_positions(rows: Iterable[str], af_threshold: float) -> list[int]:
    """Return 1-based positions of SNPs whose max AF/AF_grpmax >= threshold.

    Only single-base substitutions are considered (REF and every ALT length 1).
    ``rows`` is an iterable of raw tab-delimited VCF data lines. Works for both
    genome and exome gnomAD VCFs (identical INFO AF fields).
    """
    positions: list[int] = []
    for row in rows:
        fields = row.split("\t")
        if len(fields) < 8:
            continue

        ref = fields[3]
        if len(ref) != 1:
            continue
        alts = fields[4].split(",")
        if not all(len(a) == 1 and a not in (".", "*") for a in alts):
            continue

        af_values: list[float] = []
        for token in fields[7].split(";"):
            if token.startswith("AF=") or token.startswith("AF_grpmax="):
                for v in token.split("=", 1)[1].split(","):
                    try:
                        f = float(v)
                    except ValueError:
                        continue
                    if f == f:  # reject NaN (NaN != NaN)
                        af_values.append(f)

        if af_values and max(af_values) >= af_threshold:
            positions.append(int(fields[1]))
    return positions

vflank.core.popfreq_api

vflank.core.popfreq_api

Population allele-frequency masking source via the gnomAD GraphQL API.

An alternative to :class:~vflank.core.popfreq.GnomadStore that needs no local VCF download — it queries https://gnomad.broadinstitute.org/api per flank region. Exposes the same duck-typed get_positions interface, so it drops in behind ReferenceFlankSource unchanged.

Trade-offs (see docs/research/gnomad-api.md): no download and both builds, but rate-limited to ~10 requests/IP/60s and not reproducible — best for small cohorts; prefer the VCF source for bulk/HPC/reproducible runs.

The parsing kernel (:func:parse_api_variants) is pure and unit-testable; HTTP and timing are injected so tests run offline.

GnomadApiSource

Masking source backed by the public gnomAD GraphQL API.

Region responses are cached (so the two flank queries of identical variants reuse one request), requests are throttled to respect the rate limit, and transient failures are retried with backoff before raising PopFreqError.

Source code in src/vflank/core/popfreq_api.py
class GnomadApiSource:
    """Masking source backed by the public gnomAD GraphQL API.

    Region responses are cached (so the two flank queries of identical variants
    reuse one request), requests are throttled to respect the rate limit, and
    transient failures are retried with backoff before raising ``PopFreqError``.
    """

    def __init__(
        self,
        genome_build: str,
        pop_data: str = "genome",
        *,
        url: str = API_URL,
        timeout: float = 30.0,
        min_interval: float = 6.0,   # ~10 requests / 60 s
        max_retries: int = 3,
        transport: Callable[[str, str, float], dict] | None = None,
        sleep_fn: Callable[[float], None] | None = None,
        clock: Callable[[], float] = monotonic,
    ) -> None:
        self.reference_genome, self.dataset = dataset_for_build(genome_build)
        self.kinds = kinds_for(pop_data)  # raises ValueError on bad input
        self.url = url
        self.timeout = timeout
        self.min_interval = min_interval
        self.max_retries = max_retries
        # Resolve defaults at init time (not as bound defaults) so tests can
        # monkeypatch the module-level transport/sleep.
        self._transport = transport if transport is not None else _http_transport
        self._sleep = sleep_fn if sleep_fn is not None else sleep
        self._clock = clock
        self._cache: dict[tuple[str, int, int], list[dict]] = {}
        self._last_call: float | None = None
        self.request_count = 0  # for monitoring / large-cohort warnings

    def _throttle(self) -> None:
        if self._last_call is not None:
            wait = self.min_interval - (self._clock() - self._last_call)
            if wait > 0:
                self._sleep(wait)
        self._last_call = self._clock()

    def _query_region(self, chrom: str, start: int, stop: int) -> list[dict]:
        query = build_query(chrom, start, stop, self.reference_genome, self.dataset, self.kinds)
        for attempt in range(self.max_retries + 1):
            self._throttle()
            try:
                self.request_count += 1
                body = self._transport(self.url, query, self.timeout)
            except _TransientApiError as exc:
                if attempt < self.max_retries:
                    self._sleep(2.0 * (attempt + 1))  # linear backoff
                    log.debug("gnomAD API transient error (%s), retrying", exc)
                    continue
                raise PopFreqError(
                    f"gnomAD API unreachable after {self.max_retries + 1} attempts: {exc}"
                ) from exc

            errors = body.get("errors")
            if errors:
                msg = errors[0].get("message", "unknown error")
                if "rate" in msg.lower() and attempt < self.max_retries:
                    self._sleep(2.0 * (attempt + 1))
                    log.debug("gnomAD API rate-limited, retrying")
                    continue
                raise PopFreqError(f"gnomAD API error: {msg}")

            variants = (((body.get("data") or {}).get("region") or {}).get("variants")) or []
            log.debug("gnomAD API %s:%d-%d -> %d variants", chrom, start, stop, len(variants))
            return variants
        return []  # unreachable; satisfies type checker

    def get_positions(
        self, bare: str, start_0based: int, end_0based: int, af_threshold: float
    ) -> list[int]:
        """1-based positions of common SNPs in ``[start, end)`` for this chrom."""
        if end_0based <= start_0based:
            return []
        start, stop = start_0based + 1, end_0based  # 0-based half-open -> 1-based inclusive
        key = (bare, start, stop)
        if key not in self._cache:
            self._cache[key] = self._query_region(bare, start, stop)
        return parse_api_variants(self._cache[key], self.kinds, af_threshold)

    def close(self) -> None:
        """No resources to release; present for interface symmetry with GnomadStore."""

get_positions

get_positions(bare, start_0based, end_0based, af_threshold)

1-based positions of common SNPs in [start, end) for this chrom.

Source code in src/vflank/core/popfreq_api.py
def get_positions(
    self, bare: str, start_0based: int, end_0based: int, af_threshold: float
) -> list[int]:
    """1-based positions of common SNPs in ``[start, end)`` for this chrom."""
    if end_0based <= start_0based:
        return []
    start, stop = start_0based + 1, end_0based  # 0-based half-open -> 1-based inclusive
    key = (bare, start, stop)
    if key not in self._cache:
        self._cache[key] = self._query_region(bare, start, stop)
    return parse_api_variants(self._cache[key], self.kinds, af_threshold)

close

close()

No resources to release; present for interface symmetry with GnomadStore.

Source code in src/vflank/core/popfreq_api.py
def close(self) -> None:
    """No resources to release; present for interface symmetry with GnomadStore."""

build_query

build_query(chrom, start, stop, reference_genome, dataset, kinds)

GraphQL query for SNP AFs in a region. start/stop are 1-based inclusive.

Source code in src/vflank/core/popfreq_api.py
def build_query(
    chrom: str, start: int, stop: int, reference_genome: str, dataset: str, kinds: Iterable[str]
) -> str:
    """GraphQL query for SNP AFs in a region. ``start``/``stop`` are 1-based inclusive."""
    seq = " ".join(f"{k} {{ af populations {{ id ac an }} }}" for k in kinds)
    return (
        f'{{ region(chrom: "{chrom}", start: {start}, stop: {stop}, '
        f"reference_genome: {reference_genome}) "
        f"{{ variants(dataset: {dataset}) {{ pos ref alt {seq} }} }} }}"
    )

parse_api_variants

parse_api_variants(variants, kinds, af_threshold)

1-based positions of SNPs whose max AF (over kinds) >= threshold.

Pure: variants is the GraphQL region.variants list. SNPs only.

Source code in src/vflank/core/popfreq_api.py
def parse_api_variants(
    variants: Iterable[dict], kinds: Iterable[str], af_threshold: float
) -> list[int]:
    """1-based positions of SNPs whose max AF (over kinds) >= threshold.

    Pure: ``variants`` is the GraphQL ``region.variants`` list. SNPs only.
    """
    kinds = tuple(kinds)
    positions: list[int] = []
    for v in variants:
        if len(v.get("ref", "")) != 1 or len(v.get("alt", "")) != 1:
            continue
        if _variant_af(v, kinds) >= af_threshold:
            positions.append(int(v["pos"]))
    return positions

vflank.core.reference_api

vflank.core.reference_api

Reference-sequence source backed by the UCSC REST API.

An alternative to :class:~vflank.io.reference.ReferenceFasta that needs no local FASTA download — it fetches each flank window from https://api.genome.ucsc.edu/getData/sequence. Exposes the surface the CLIs use on a reference (fetch / check_build / has_chr / close), so it drops in behind :class:~vflank.core.flanks.ReferenceFlankSource unchanged.

Why UCSC (see docs/research/genome-api.md): its genome ids are literally hg19 / hg38 (our --genome-build values) and its coordinates are 0-based half-open — identical to pysam — so the flank math in flanks.py needs no translation. Trade-offs: no download and both builds, but a ~1 req/s courtesy rate limit and network required — best for the hosted single-variant / small use; prefer a local FASTA for bulk/HPC/offline runs.

The parsing kernel (:func:parse_sequence_response) and URL builder (:func:build_url) are pure and unit-testable; HTTP and timing are injected so tests run offline.

ReferenceApiSource

Reference-sequence source backed by the UCSC getData/sequence API.

Window responses are cached, requests are throttled to respect UCSC's courtesy limit (~1 req/s), and transient failures are retried with backoff before raising ReferenceError. Drop-in for :class:ReferenceFasta across the surface the CLIs use.

Source code in src/vflank/core/reference_api.py
class ReferenceApiSource:
    """Reference-sequence source backed by the UCSC getData/sequence API.

    Window responses are cached, requests are throttled to respect UCSC's
    courtesy limit (~1 req/s), and transient failures are retried with backoff
    before raising ``ReferenceError``. Drop-in for :class:`ReferenceFasta` across
    the surface the CLIs use.
    """

    # UCSC serves chr-prefixed contigs; surfaced for the CLI's status line only.
    has_chr = True

    def __init__(
        self,
        genome_build: str,
        *,
        url: str = API_URL,
        timeout: float = 30.0,
        min_interval: float = 1.0,   # ~1 request / s (UCSC courtesy limit)
        max_retries: int = 3,
        transport: Callable[[str, float], dict] | None = None,
        sleep_fn: Callable[[float], None] | None = None,
        clock: Callable[[], float] = monotonic,
    ) -> None:
        self.genome = genome_for_build(genome_build)  # raises on bad build
        self.genome_build = genome_build
        self.url = url
        self.timeout = timeout
        self.min_interval = min_interval
        self.max_retries = max_retries
        # Resolve defaults at init time (not as bound defaults) so tests can
        # monkeypatch the module-level transport/sleep.
        self._transport = transport if transport is not None else _http_transport
        self._sleep = sleep_fn if sleep_fn is not None else sleep
        self._clock = clock
        self._cache: dict[tuple[str, int, int], str] = {}
        self._last_call: float | None = None
        self.request_count = 0  # for monitoring

    def _throttle(self) -> None:
        if self._last_call is not None:
            wait = self.min_interval - (self._clock() - self._last_call)
            if wait > 0:
                self._sleep(wait)
        self._last_call = self._clock()

    def _request(self, contig: str, start_0based: int, end_0based: int) -> str:
        url = build_url(self.genome, contig, start_0based, end_0based)
        for attempt in range(self.max_retries + 1):
            self._throttle()
            try:
                self.request_count += 1
                body = self._transport(url, self.timeout)
            except _TransientApiError as exc:
                if attempt < self.max_retries:
                    self._sleep(2.0 * (attempt + 1))  # linear backoff
                    log.debug("UCSC API transient error (%s), retrying", exc)
                    continue
                raise ReferenceError(
                    f"UCSC API unreachable after {self.max_retries + 1} attempts: {exc}"
                ) from exc
            dna = parse_sequence_response(body)
            log.debug("UCSC API %s:%d-%d -> %d bp", contig, start_0based, end_0based, len(dna))
            return dna
        return ""  # unreachable; satisfies the type checker

    def fetch(self, bare: str, start_0based: int, end_0based: int) -> str:
        """Reference bases for ``[start, end)`` (0-based half-open), bare chromosome."""
        if end_0based <= start_0based:
            return ""
        contig = ucsc_contig(bare)
        key = (contig, start_0based, end_0based)
        if key not in self._cache:
            self._cache[key] = self._request(contig, start_0based, end_0based)
        return self._cache[key]

    def check_build(self, declared: str) -> str | None:
        """No local sequence to fingerprint; the API serves the requested build.

        Returns ``None`` (no mismatch warning is possible). We trust ``declared``
        and pass it through as the UCSC ``genome``; a wrong build still surfaces
        downstream as a UCSC error rather than silent wrong sequence.
        """
        log.debug("Reference build %r trusted (UCSC API serves the requested genome)", declared)
        return None

    def close(self) -> None:
        """No resources to release; present for interface symmetry with ReferenceFasta."""

fetch

fetch(bare, start_0based, end_0based)

Reference bases for [start, end) (0-based half-open), bare chromosome.

Source code in src/vflank/core/reference_api.py
def fetch(self, bare: str, start_0based: int, end_0based: int) -> str:
    """Reference bases for ``[start, end)`` (0-based half-open), bare chromosome."""
    if end_0based <= start_0based:
        return ""
    contig = ucsc_contig(bare)
    key = (contig, start_0based, end_0based)
    if key not in self._cache:
        self._cache[key] = self._request(contig, start_0based, end_0based)
    return self._cache[key]

check_build

check_build(declared)

No local sequence to fingerprint; the API serves the requested build.

Returns None (no mismatch warning is possible). We trust declared and pass it through as the UCSC genome; a wrong build still surfaces downstream as a UCSC error rather than silent wrong sequence.

Source code in src/vflank/core/reference_api.py
def check_build(self, declared: str) -> str | None:
    """No local sequence to fingerprint; the API serves the requested build.

    Returns ``None`` (no mismatch warning is possible). We trust ``declared``
    and pass it through as the UCSC ``genome``; a wrong build still surfaces
    downstream as a UCSC error rather than silent wrong sequence.
    """
    log.debug("Reference build %r trusted (UCSC API serves the requested genome)", declared)
    return None

close

close()

No resources to release; present for interface symmetry with ReferenceFasta.

Source code in src/vflank/core/reference_api.py
def close(self) -> None:
    """No resources to release; present for interface symmetry with ReferenceFasta."""

ucsc_contig

ucsc_contig(bare)

Bare chromosome -> UCSC contig name.

UCSC hg19/hg38 use chr-prefixed names, and the mitochondrion is chrM (not chrMT). We cannot cheaply probe the contig list over the API, so we apply UCSC's known convention rather than guessing.

Source code in src/vflank/core/reference_api.py
def ucsc_contig(bare: str) -> str:
    """Bare chromosome -> UCSC contig name.

    UCSC hg19/hg38 use ``chr``-prefixed names, and the mitochondrion is ``chrM``
    (not ``chrMT``). We cannot cheaply probe the contig list over the API, so we
    apply UCSC's known convention rather than guessing.
    """
    if bare in ("MT", "M"):
        return "chrM"
    return f"chr{bare}"

build_url

build_url(genome, chrom, start_0based, end_0based)

UCSC getData/sequence URL. UCSC coords are 0-based half-open — same as ours.

Source code in src/vflank/core/reference_api.py
def build_url(genome: str, chrom: str, start_0based: int, end_0based: int) -> str:
    """UCSC getData/sequence URL. UCSC coords are 0-based half-open — same as ours."""
    query = urllib.parse.urlencode(
        {"genome": genome, "chrom": chrom, "start": start_0based, "end": end_0based}
    )
    return f"{API_URL}?{query}"

parse_sequence_response

parse_sequence_response(body)

Extract the sequence from a UCSC getData/sequence JSON body.

Pure. Raises :class:ReferenceError on an API error payload or a missing/ non-string dna field. A shorter-than-requested sequence is not an error here — like a pysam fetch off a contig end, truncation is a real condition the caller reports (see the flank-truncation check in the CLIs).

Source code in src/vflank/core/reference_api.py
def parse_sequence_response(body: dict) -> str:
    """Extract the sequence from a UCSC getData/sequence JSON body.

    Pure. Raises :class:`ReferenceError` on an API error payload or a missing/
    non-string ``dna`` field. A *shorter-than-requested* sequence is **not** an
    error here — like a pysam fetch off a contig end, truncation is a real
    condition the caller reports (see the flank-truncation check in the CLIs).
    """
    if isinstance(body.get("error"), str):
        raise ReferenceError(f"UCSC API error: {body['error']}")
    dna = body.get("dna")
    if not isinstance(dna, str):
        raise ReferenceError(
            f"UCSC API response missing a 'dna' string (got keys: {sorted(body)})"
        )
    return dna

vflank.core.fusion

vflank.core.fusion

Fusion / SV junction construction from breakpoint pairs.

Builds the chimeric junction sequence for a fusion so a ddPCR probe can span it.

Strand convention (matches iCallSV dellyVcf2Tab.py): 0 = plus/reference, 1 = minus/complement. The fused product reads 5'->3' as partner1 + partner2 with no separator; partner 1 ends at the junction, partner 2 starts at it. See docs/research/sv-vcf-input.md for the full derivation (validated against the unambiguous deletion case).

Breakpoint dataclass

A single SV breakpoint. pos is 1-based; strand is 0 (+) or 1 (−).

Source code in src/vflank/core/fusion.py
@dataclass(slots=True)
class Breakpoint:
    """A single SV breakpoint. ``pos`` is 1-based; ``strand`` is 0 (+) or 1 (−)."""

    chrom: str
    pos: int
    strand: int

Fusion dataclass

Two breakpoints forming a junction, with optional labels.

Source code in src/vflank/core/fusion.py
@dataclass(slots=True)
class Fusion:
    """Two breakpoints forming a junction, with optional labels."""

    bp1: Breakpoint
    bp2: Breakpoint
    name: str = ""
    sample: str = ""

build_junction

build_junction(reference, fusion, flank, gnomad=None, af_threshold=0.001, bam_source=None)

Construct the fusion junction sequence (partner1 + partner2).

flank is the bases taken from each partner, so the junction is up to 2*flank bp (shorter if a partner runs off a contig end). The probe is designed to span junction_index. masked_sequence is the gnomAD-masked junction, or — when bam_source is given — the per-sample patient consensus (built in genomic space before reverse-complement).

Source code in src/vflank/core/fusion.py
def build_junction(
    reference, fusion: Fusion, flank: int, gnomad=None, af_threshold: float = 0.001,
    bam_source=None,
) -> JunctionResult:
    """Construct the fusion junction sequence (partner1 + partner2).

    ``flank`` is the bases taken from each partner, so the junction is up to
    ``2*flank`` bp (shorter if a partner runs off a contig end). The probe is
    designed to span ``junction_index``. ``masked_sequence`` is the gnomAD-masked
    junction, or — when ``bam_source`` is given — the per-sample patient consensus
    (built in genomic space before reverse-complement).
    """
    raw1, masked1 = _segment(reference, fusion.bp1, flank, donor=True,
                             gnomad=gnomad, af_threshold=af_threshold, bam_source=bam_source)
    raw2, masked2 = _segment(reference, fusion.bp2, flank, donor=False,
                             gnomad=gnomad, af_threshold=af_threshold, bam_source=bam_source)
    sequence = raw1 + raw2
    masked_sequence = masked1 + masked2
    return JunctionResult(
        sequence=sequence,
        masked_sequence=masked_sequence,
        junction_index=len(raw1),
        n_masked=masked_sequence.count("N") - sequence.count("N"),
    )

vflank.core.skips

vflank.core.skips

Categorise per-variant skip reasons for a compact run summary.

The reasons are free-text (they come from several validation points), so this buckets them by keyword into a small, stable set of categories. Pure and testable.

categorize_skip

categorize_skip(reason)

Map a free-text skip reason to a stable category label.

Source code in src/vflank/core/skips.py
def categorize_skip(reason: str) -> str:
    """Map a free-text skip reason to a stable category label."""
    r = reason.lower()
    for keyword, label in _CATEGORIES:
        if keyword in r:
            return label
    return "other"

I/O

vflank.io.maf

vflank.io.maf

MAF loading, column remapping/validation, and row -> Variant parsing.

MafColumns dataclass

User-overridable mapping from MAF column names to canonical names.

Source code in src/vflank/io/maf.py
@dataclass
class MafColumns:
    """User-overridable mapping from MAF column names to canonical names."""

    chrom: str = MAF_CHR
    start: str = MAF_START
    end: str = MAF_END
    ref: str = MAF_REF
    alt: str = MAF_ALT
    gene: str = MAF_GENE
    protein: str = MAF_PROT
    cdna: str = MAF_CDNA
    sample: str = MAF_SAMPLE

read_maf

read_maf(path, *, n_rows=None)

Read a MAF into a DataFrame (tab-separated, '#'-comment aware).

path is a filesystem path or an open text/binary buffer.

Source code in src/vflank/io/maf.py
def read_maf(path: MafInput, *, n_rows: int | None = None):
    """Read a MAF into a DataFrame (tab-separated, '#'-comment aware).

    ``path`` is a filesystem path or an open text/binary buffer.
    """
    import pandas as pd

    try:
        return pd.read_csv(path, sep="\t", comment="#", nrows=n_rows, low_memory=False)
    except Exception as exc:  # noqa: BLE001
        raise MafError(f"Could not read MAF: {exc}") from exc

load_maf

load_maf(path, cols)

Read a MAF, remap required columns to canonical names, and validate.

Returns the DataFrame with canonical required-column names guaranteed present, and optional metadata columns filled with defaults if absent.

Source code in src/vflank/io/maf.py
def load_maf(path: MafInput, cols: MafColumns):
    """Read a MAF, remap required columns to canonical names, and validate.

    Returns the DataFrame with canonical required-column names guaranteed
    present, and optional metadata columns filled with defaults if absent.
    """
    df = read_maf(path)
    if df.empty:
        raise MafError("MAF file is empty.")

    remap = {}
    for user_col, canonical in [
        (cols.chrom, MAF_CHR), (cols.start, MAF_START), (cols.end, MAF_END),
        (cols.ref, MAF_REF), (cols.alt, MAF_ALT),
    ]:
        if user_col != canonical and user_col in df.columns:
            remap[user_col] = canonical
    if remap:
        df = df.rename(columns=remap)

    missing = [c for c in REQUIRED_MAF_COLS if c not in df.columns]
    if missing:
        raise MafError(
            f"Missing required columns: {', '.join(missing)}. "
            "Run `vflank small inspect` to see column names, then use "
            "--chrom-col / --start-col etc. to remap."
        )

    for col, default in {
        cols.gene: "UNKNOWN", cols.protein: "", cols.cdna: "", cols.sample: "SAMPLE",
    }.items():
        if col not in df.columns:
            df[col] = default

    return df

parse_variant_row

parse_variant_row(row, cols)

Convert a MAF row to a :class:Variant, or return a skip reason.

Returns (variant, None) on success or (None, reason) on a bad row.

Source code in src/vflank/io/maf.py
def parse_variant_row(row, cols: MafColumns) -> tuple[Variant | None, str | None]:
    """Convert a MAF row to a :class:`Variant`, or return a skip reason.

    Returns ``(variant, None)`` on success or ``(None, reason)`` on a bad row.
    """
    import pandas as pd

    ref = str(row[MAF_REF]) if pd.notna(row[MAF_REF]) else "-"
    alt = str(row[MAF_ALT]) if pd.notna(row[MAF_ALT]) else "-"
    gene = str(row.get(cols.gene, "UNKNOWN"))

    bare, chrom_err = normalise_chrom(row[MAF_CHR])
    if bare is None:
        # By contract, normalise_chrom sets chrom_err whenever bare is None.
        return None, f"{gene}{chrom_err}"

    raw = str(row[MAF_CHR]).strip()
    raw_display = raw if raw.upper().startswith("CHR") else bare

    try:
        start = int(float(row[MAF_START]))
        end = int(float(row[MAF_END]))
    except (ValueError, TypeError):
        return None, (
            f"{gene}: non-numeric position "
            f"(start={row[MAF_START]!r}, end={row[MAF_END]!r})"
        )

    coord_err = validate_coordinates(start, end)
    if coord_err:
        return None, f"{gene} {raw_display}:{start}{coord_err}"

    if not validate_allele(ref) or not validate_allele(alt):
        return None, f"{gene} {raw_display}:{start} — invalid allele (ref={ref!r}, alt={alt!r})"

    return (
        Variant(
            chrom=bare, start=start, end=end, ref=ref, alt=alt,
            gene=gene,
            protein=str(row.get(cols.protein, "")),
            cdna=str(row.get(cols.cdna, "")),
            sample=str(row.get(cols.sample, "SAMPLE")),
            raw_chrom=raw_display,
        ),
        None,
    )

vflank.io.reference

vflank.io.reference

Reference FASTA access with chr-notation detection and build fingerprinting.

The build-mismatch guard addresses the scariest silent failure in this domain: running hg19 coordinates against an hg38 FASTA (or vice versa) returns the wrong sequence with no error. We fingerprint by the length of chromosome 1.

ReferenceFasta

Thin wrapper over pysam.FastaFile keyed by bare chromosome.

Source code in src/vflank/io/reference.py
class ReferenceFasta:
    """Thin wrapper over ``pysam.FastaFile`` keyed by bare chromosome."""

    def __init__(self, path: Path | str):
        self.path = Path(path)
        if not self.path.exists():
            raise ReferenceError(f"Reference FASTA not found: {self.path}")
        fai = Path(str(self.path) + ".fai")
        if not fai.exists():
            raise ReferenceError(
                f"FASTA index not found: {fai}  (fix: samtools faidx {self.path})"
            )
        try:
            import pysam

            self.fa = pysam.FastaFile(str(self.path))
        except Exception as exc:  # noqa: BLE001
            raise ReferenceError(f"Could not open FASTA: {exc}") from exc

        self._refs = set(self.fa.references)
        self.has_chr = contigs_have_chr(self.fa.references)

    def contig(self, bare: str) -> str:
        """Resolve a bare chromosome to this FASTA's actual contig name.

        Auto-detection of ``chr`` prefixing is best-effort (it probes chr1-5).
        If the detected form is absent but the other form is present — which can
        happen with unusual or single-contig references — fall back to it rather
        than letting pysam raise a confusing KeyError for every variant.
        """
        primary = chrom_for_contigs(bare, self.has_chr)
        if primary in self._refs:
            return primary
        alternate = bare if self.has_chr else f"chr{bare}"
        return alternate if alternate in self._refs else primary

    def fetch(self, bare: str, start_0based: int, end_0based: int) -> str:
        return self.fa.fetch(self.contig(bare), start_0based, end_0based)

    def detect_build(self) -> str | None:
        """Infer 'hg19'/'hg38' from chr1 length, or None if undeterminable."""
        for name in ("chr1", "1"):
            if name in self.fa.references:
                return _CHR1_LENGTH_TO_BUILD.get(self.fa.get_reference_length(name))
        return None

    def check_build(self, declared: str) -> str | None:
        """Return a warning string if the declared build disagrees with the FASTA."""
        detected = self.detect_build()
        if detected is not None and detected != declared:
            return (
                f"Declared --genome-build {declared} but the FASTA looks like "
                f"{detected} (by chr1 length). Coordinates may map to the wrong "
                f"sequence. Double-check the reference."
            )
        return None

    def close(self) -> None:
        self.fa.close()

contig

contig(bare)

Resolve a bare chromosome to this FASTA's actual contig name.

Auto-detection of chr prefixing is best-effort (it probes chr1-5). If the detected form is absent but the other form is present — which can happen with unusual or single-contig references — fall back to it rather than letting pysam raise a confusing KeyError for every variant.

Source code in src/vflank/io/reference.py
def contig(self, bare: str) -> str:
    """Resolve a bare chromosome to this FASTA's actual contig name.

    Auto-detection of ``chr`` prefixing is best-effort (it probes chr1-5).
    If the detected form is absent but the other form is present — which can
    happen with unusual or single-contig references — fall back to it rather
    than letting pysam raise a confusing KeyError for every variant.
    """
    primary = chrom_for_contigs(bare, self.has_chr)
    if primary in self._refs:
        return primary
    alternate = bare if self.has_chr else f"chr{bare}"
    return alternate if alternate in self._refs else primary

detect_build

detect_build()

Infer 'hg19'/'hg38' from chr1 length, or None if undeterminable.

Source code in src/vflank/io/reference.py
def detect_build(self) -> str | None:
    """Infer 'hg19'/'hg38' from chr1 length, or None if undeterminable."""
    for name in ("chr1", "1"):
        if name in self.fa.references:
            return _CHR1_LENGTH_TO_BUILD.get(self.fa.get_reference_length(name))
    return None

check_build

check_build(declared)

Return a warning string if the declared build disagrees with the FASTA.

Source code in src/vflank/io/reference.py
def check_build(self, declared: str) -> str | None:
    """Return a warning string if the declared build disagrees with the FASTA."""
    detected = self.detect_build()
    if detected is not None and detected != declared:
        return (
            f"Declared --genome-build {declared} but the FASTA looks like "
            f"{detected} (by chr1 length). Coordinates may map to the wrong "
            f"sequence. Double-check the reference."
        )
    return None

vflank.io.fasta

vflank.io.fasta

FASTA record formatting and writing for the small-variant path.

safe_header

safe_header(s)

Replace characters unsafe in FASTA headers with underscores.

Source code in src/vflank/io/fasta.py
def safe_header(s: str) -> str:
    """Replace characters unsafe in FASTA headers with underscores."""
    if not s or s in ("nan", "None", "."):
        return "."
    return _UNSAFE_RE.sub("_", s.strip())

record_id

record_id(variant, ref, alt, sample=None)

The shared record key: [{SAMPLE}__]{GENE}__{HGVSp}__{HGVSc}__{CHROM}_{POS}_{REF}_{ALT}.

Used for both FASTA headers and Primer3 SEQUENCE_ID so records cross-reference.

Source code in src/vflank/io/fasta.py
def record_id(variant: Variant, ref: str, alt: str, sample: str | None = None) -> str:
    """The shared record key: ``[{SAMPLE}__]{GENE}__{HGVSp}__{HGVSc}__{CHROM}_{POS}_{REF}_{ALT}``.

    Used for both FASTA headers and Primer3 ``SEQUENCE_ID`` so records cross-reference.
    """
    key = f"{variant.chrom}_{variant.start}_{ref}_{alt}"
    fields = [variant.gene, variant.protein, variant.cdna, key]
    if sample is not None:
        fields.insert(0, sample)
    return "__".join(safe_header(s) for s in fields)

format_records

format_records(variant, flanks, ref, alt, sample=None)

Two FASTA records per variant: raw and masked.

Keyed on the variant identity (CHR_POS_REF_ALT). When sample is given (BAM-consensus mode, where the sequence is patient-specific), the sample is prefixed so per-(variant, sample) records stay distinct:

[{SAMPLE}]{GENE}{HGVSc}{POS}{ALT} {left}[REF/ALT]{right} Masked__[{SAMPLE}__]{GENE}__{HGVSp}__{HGVSc}__{CHROM}{REF} {masked_left}[REF/ALT]{masked_right}

Source code in src/vflank/io/fasta.py
def format_records(
    variant: Variant, flanks: FlankResult, ref: str, alt: str, sample: str | None = None
) -> list[str]:
    """Two FASTA records per variant: raw and masked.

    Keyed on the variant identity (CHR_POS_REF_ALT). When ``sample`` is given
    (BAM-consensus mode, where the sequence is patient-specific), the sample is
    prefixed so per-(variant, sample) records stay distinct:

    >[{SAMPLE}__]{GENE}__{HGVSp}__{HGVSc}__{CHROM}_{POS}_{REF}_{ALT}
    {left}[REF/ALT]{right}
    >Masked__[{SAMPLE}__]{GENE}__{HGVSp}__{HGVSc}__{CHROM}_{POS}_{REF}_{ALT}
    {masked_left}[REF/ALT]{masked_right}
    """
    base = record_id(variant, ref, alt, sample)
    return [
        f">{base}\n{flanks.left}[{ref}/{alt}]{flanks.right}\n",
        f">Masked__{base}\n{flanks.masked_left}[{ref}/{alt}]{flanks.masked_right}\n",
    ]

vflank.io.breakpoints

vflank.io.breakpoints

Read SV breakpoints from the simple iCallSV / iAnnotateSV TSV.

Columns are matched by header name, not position (SvColumns), so a file works regardless of column order or extra columns, as long as the named columns are present. Mirrors io/maf.MafColumns.

SvColumns dataclass

Logical field -> header column name (all overridable).

Source code in src/vflank/io/breakpoints.py
@dataclass
class SvColumns:
    """Logical field -> header column name (all overridable)."""

    chr1: str = "chr1"
    pos1: str = "pos1"
    str1: str = "str1"
    chr2: str = "chr2"
    pos2: str = "pos2"
    str2: str = "str2"
    name: str = "name"      # optional
    sample: str = "sample"  # optional

read_sv_table

read_sv_table(path)

Read the TSV into a DataFrame (tab-separated, '#'-comment aware).

path is a filesystem path or an open text/binary buffer.

Source code in src/vflank/io/breakpoints.py
def read_sv_table(path: SvInput):
    """Read the TSV into a DataFrame (tab-separated, '#'-comment aware).

    ``path`` is a filesystem path or an open text/binary buffer.
    """
    import pandas as pd

    try:
        return pd.read_csv(path, sep="\t", comment="#", low_memory=False)
    except Exception as exc:  # noqa: BLE001
        raise SvError(f"Could not read SV table: {exc}") from exc

load_sv_table

load_sv_table(path, cols)

Read and validate that the required columns exist (by name).

Source code in src/vflank/io/breakpoints.py
def load_sv_table(path: SvInput, cols: SvColumns):
    """Read and validate that the required columns exist (by name)."""
    df = read_sv_table(path)
    if df.empty:
        raise SvError("SV table is empty.")

    missing = [getattr(cols, f) for f in _REQUIRED_FIELDS if getattr(cols, f) not in df.columns]
    if missing:
        raise SvError(
            f"Missing required column(s): {', '.join(missing)}. "
            f"Header has: {', '.join(map(str, df.columns))}. "
            "Use the column overrides to map differently-named columns."
        )
    return df

parse_fusion_row

parse_fusion_row(row, cols)

Convert one row to a :class:Fusion, or return a skip reason.

Source code in src/vflank/io/breakpoints.py
def parse_fusion_row(row, cols: SvColumns) -> tuple[Fusion | None, str | None]:
    """Convert one row to a :class:`Fusion`, or return a skip reason."""
    c1, err1 = normalise_chrom(row[cols.chr1])
    if c1 is None:
        return None, f"breakpoint 1 — {err1}"
    c2, err2 = normalise_chrom(row[cols.chr2])
    if c2 is None:
        return None, f"breakpoint 2 — {err2}"

    try:
        p1 = int(float(row[cols.pos1]))
        p2 = int(float(row[cols.pos2]))
    except (ValueError, TypeError):
        return None, f"non-numeric position (pos1={row[cols.pos1]!r}, pos2={row[cols.pos2]!r})"
    if p1 < 1 or p2 < 1:
        return None, f"position < 1 (pos1={p1}, pos2={p2})"

    s1 = _parse_strand(row[cols.str1])
    s2 = _parse_strand(row[cols.str2])
    if s1 is None or s2 is None:
        return None, f"strand must be 0 or 1 (str1={row[cols.str1]!r}, str2={row[cols.str2]!r})"

    return (
        Fusion(
            Breakpoint(c1, p1, s1),
            Breakpoint(c2, p2, s2),
            name=_optional(row, cols.name),
            sample=_optional(row, cols.sample),
        ),
        None,
    )

vflank.io.report

vflank.io.report

Write a machine-readable TSV run report alongside the FASTA output.

Aggregate stats and the skip breakdown go in #-comment header lines; the per-variant table follows as proper TSV. Columns are taken from the row keys (insertion order), so callers control the columns per run mode.

write_report

write_report(path, summary_rows, stats, skip_breakdown)

Write the run report TSV. Raises OSError on write failure (never silent).

Source code in src/vflank/io/report.py
def write_report(
    path: Path,
    summary_rows: list[dict],
    stats: dict[str, object],
    skip_breakdown: dict[str, int],
) -> None:
    """Write the run report TSV. Raises OSError on write failure (never silent)."""
    lines: list[str] = ["# vflank small run report"]
    for key, value in stats.items():
        lines.append(f"# {key}\t{value}")
    for category, count in skip_breakdown.items():
        lines.append(f"# skip:{category}\t{count}")

    columns = list(summary_rows[0].keys()) if summary_rows else []
    lines.append("\t".join(columns))
    for r in summary_rows:
        lines.append("\t".join(str(r.get(c, "")) for c in columns))

    try:
        path.write_text("\n".join(lines) + "\n")
    except OSError as exc:
        raise OSError(f"Could not write report: {exc}") from exc