From 08113355a55d65befcb9454955a09025aae80386 Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Fri, 12 Jun 2026 14:52:58 -0400
Subject: [PATCH 01/19] Add pyramid_builder: auto-build missing OME-NGFF
 multiscale levels
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New module ``mesh_n_bone/util/pyramid_builder.py`` that pre-computes
missing s_k arrays for the downsample multires strategy when the
input zarr exposes only s0. Replaces the slow per-LOD in-worker
fallback that reads the entire s0 once per LOD.

Key design:
- Per-axis factors (per_lod_factors_for_anisotropy) — anisotropic voxels
  downsample 2x along the finest axes first until ~isotropic, then 2x
  uniformly. Standard FIB-SEM practice.
- ROI alignment with two modes (align_roi_voxels):
    snap   — round inward to max-factor multiples; drops up to
             max_factor-1 voxels per ROI edge.
    halo   — round outward; reads beyond the ROI into surrounding
             dataset to complete cubes at unaligned edges.
- OME-NGFF v0.4 metadata matching cellmap-analyze conventions:
  tr_k = tr_0 + 0.5 * vs_0 * (F_k - 1), per axis.
- Single-pass super-chunk worker (downsample_super_chunk): reads each
  s0 super-chunk ONCE and emits one output chunk at every LOD.
- Local s0 is symlinked into the pyramid group for a complete OME
  multiscale layout; remote sources fall back to discovery-side merging.

Tests (16, all passing):
- per_lod_factors_for_anisotropy: isotropic, 2.5x z anisotropy, 5x z
  extreme.
- align_roi_voxels: aligned ROI identity, snap drops edges, halo rounds
  outward, anisotropic factor.
- build_multiscales_metadata: isotropic + anisotropic translation
  formulas; axes block.
- Fence-post correctness: per-block super-chunk downsample matches a
  global single-pass downsample voxel-by-voxel, isotropic + anisotropic.
- End-to-end build: writes OME .zattrs + s_k arrays; existing factors
  are skipped; local s0 is symlinked.

No wiring into Meshify yet — that's a follow-up commit on this branch.
---
 src/mesh_n_bone/util/pyramid_builder.py | 432 ++++++++++++++++++++++++
 tests/test_pyramid_builder.py           | 368 ++++++++++++++++++++
 2 files changed, 800 insertions(+)
 create mode 100644 src/mesh_n_bone/util/pyramid_builder.py
 create mode 100644 tests/test_pyramid_builder.py

diff --git a/src/mesh_n_bone/util/pyramid_builder.py b/src/mesh_n_bone/util/pyramid_builder.py
new file mode 100644
index 0000000..cbabdc0
--- /dev/null
+++ b/src/mesh_n_bone/util/pyramid_builder.py
@@ -0,0 +1,432 @@
+"""Auto-build missing OME-NGFF multiscale levels for the input zarr.
+
+When ``Meshify(multires_strategy="downsample")`` runs on an input that
+only exposes ``s0`` (no pre-computed coarser scales), every LOD beyond 0
+has to downsample s0 in-worker, which serializes I/O and re-reads the
+full volume per LOD. This module pre-computes the missing ``s_k``
+arrays once, in parallel chunks, so each LOD reads pre-built
+properly-chunked data.
+
+Output layout: ``{output_directory}/_intermediate_scales.zarr`` is an
+OME-NGFF v0.4 multiscales group containing ``s0`` (symlinked to the
+source on local filesystems; copy or skip for remote) plus ``s1..s_n``.
+
+Per-axis factors handle anisotropic voxels. Standard practice:
+downsample by 2x along axes whose voxel size is within ~50% of the
+finest, until the volume is approximately isotropic, then by 2x
+uniformly. ``per_lod_factors_for_anisotropy`` implements this.
+
+Fence-post correctness: each ``s_k`` voxel is the mode of a per-axis
+``factor_k_axis`` cube of s0 voxels. If the cube straddles the ROI
+edge, two modes are available:
+
+- ``snap`` (default): snap the ROI origin/extent to multiples of the
+  max per-axis factor times s0 voxel size, dropping a small fringe of
+  data (up to ``max_factor - 1`` s0 voxels per edge).
+- ``halo``: keep the ROI exact, read beyond the ROI when a cube needs
+  it (clipped to dataset bounds). No data loss.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import math
+import os
+import shutil
+from typing import Any
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Per-LOD factor calculation for anisotropic voxels
+# ---------------------------------------------------------------------------
+
+
+def per_lod_factors_for_anisotropy(
+    voxel_size_zyx: np.ndarray, num_lods: int,
+    anisotropy_tolerance: float = 1.5,
+) -> list[tuple[int, int, int]]:
+    """Compute per-axis downsample factors for each LOD.
+
+    At each step, an axis is downsampled by 2 if its current voxel size
+    is at or below ``anisotropy_tolerance * min(voxel_size)``. If no
+    axis qualifies (already isotropic), all axes downsample by 2.
+
+    Returns a list of cumulative (Fz, Fy, Fx) factors per LOD, including
+    LOD 0 which is always (1, 1, 1).
+
+    Example: voxel_size=[8, 8, 20], num_lods=4
+        LOD 0: (1, 1, 1)  voxel=[8, 8, 20]
+        LOD 1: (2, 2, 1)  voxel=[16, 16, 20]  → near-isotropic
+        LOD 2: (4, 4, 2)  voxel=[32, 32, 40]
+        LOD 3: (8, 8, 4)  voxel=[64, 64, 80]
+    """
+    cur_vs = np.asarray(voxel_size_zyx, dtype=float).copy()
+    cur_factor = np.array([1, 1, 1], dtype=int)
+    out = [tuple(cur_factor.tolist())]
+    for _ in range(1, num_lods):
+        min_vs = cur_vs.min()
+        step = np.ones(3, dtype=int)
+        for ax in range(3):
+            if cur_vs[ax] <= anisotropy_tolerance * min_vs:
+                step[ax] = 2
+        if np.all(step == 1):
+            step[:] = 2
+        cur_factor = cur_factor * step
+        cur_vs = cur_vs * step
+        out.append(tuple(cur_factor.tolist()))
+    return out
+
+
+# ---------------------------------------------------------------------------
+# ROI alignment
+# ---------------------------------------------------------------------------
+
+
+def align_roi_voxels(
+    roi_origin_voxels: np.ndarray, roi_shape_voxels: np.ndarray,
+    max_per_axis_factor: np.ndarray, mode: str,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Align an ROI to ``max_per_axis_factor`` voxel boundaries.
+
+    Returns ``(aligned_origin, aligned_shape, read_origin, read_shape)``
+    where ``aligned_*`` is the OUTPUT region (always 2^k-aligned to the
+    s0 grid) and ``read_*`` is the s0 region the pyramid worker reads:
+
+    - ``mode="snap"``: read_* = aligned_*. Voxels in the original ROI
+      but outside the snapped region are not processed.
+    - ``mode="halo"``: aligned_* = original ROI snapped INWARD to factor
+      multiples; read_* extends beyond aligned_* OUTWARD by up to
+      ``max_per_axis_factor - 1`` voxels so every output cube reads its
+      full s0 input. Caller is responsible for clipping ``read_*`` to
+      the dataset bounds.
+    """
+    roi_origin = np.asarray(roi_origin_voxels, dtype=np.int64)
+    roi_shape = np.asarray(roi_shape_voxels, dtype=np.int64)
+    factor = np.asarray(max_per_axis_factor, dtype=np.int64)
+
+    if mode == "snap":
+        new_origin = ((roi_origin + factor - 1) // factor) * factor
+        end = roi_origin + roi_shape
+        new_end = (end // factor) * factor
+        new_shape = np.maximum(new_end - new_origin, 0)
+        return new_origin, new_shape, new_origin.copy(), new_shape.copy()
+
+    if mode == "halo":
+        # Output region: aligned-inward (so output cubes fit inside the ROI's
+        # aligned envelope), but we extend READS outward to complete the
+        # boundary cubes. Final output covers the ROI rounded to factor
+        # boundaries OUTWARD (so the ROI is fully covered).
+        out_origin = (roi_origin // factor) * factor
+        out_end = ((roi_origin + roi_shape + factor - 1) // factor) * factor
+        out_shape = out_end - out_origin
+        return out_origin, out_shape, out_origin.copy(), out_shape.copy()
+
+    raise ValueError(f"unknown alignment mode: {mode!r}")
+
+
+# ---------------------------------------------------------------------------
+# OME-NGFF multiscales metadata
+# ---------------------------------------------------------------------------
+
+
+def build_multiscales_metadata(
+    s0_voxel_size_zyx: list[float],
+    s0_translation_zyx: list[float],
+    per_lod_factors: list[tuple[int, int, int]],
+    axes: list[str] = ("z", "y", "x"),
+    unit: str = "nanometer",
+    version: str = "0.4",
+) -> dict[str, Any]:
+    """Build an OME-NGFF multiscales metadata dict for a pyramid.
+
+    Translation per level uses the OME convention from cellmap-analyze
+    (voxel centers): ``tr_k_axis = tr_0_axis + 0.5 * vs0_axis * (F_k_axis - 1)``
+    where ``F_k_axis`` is the cumulative downsample factor at LOD k.
+    """
+    s0_vs = np.asarray(s0_voxel_size_zyx, dtype=float)
+    s0_tr = np.asarray(s0_translation_zyx, dtype=float)
+    datasets = []
+    for k, factor in enumerate(per_lod_factors):
+        f = np.asarray(factor, dtype=float)
+        vs_k = (s0_vs * f).tolist()
+        tr_k = (s0_tr + 0.5 * s0_vs * (f - 1.0)).tolist()
+        datasets.append({
+            "path": f"s{k}",
+            "coordinateTransformations": [
+                {"type": "scale", "scale": vs_k},
+                {"type": "translation", "translation": tr_k},
+            ],
+        })
+    return {
+        "multiscales": [{
+            "version": version,
+            "name": "",
+            "axes": [{"name": a, "type": "space", "unit": unit} for a in axes],
+            "datasets": datasets,
+        }],
+    }
+
+
+def write_multiscales_metadata(
+    group_path: str, metadata: dict[str, Any], zarr_format: int = 2,
+) -> None:
+    """Persist multiscales attrs at the zarr group root.
+
+    Writes ``.zattrs`` for zarr v2 layout or ``zarr.json`` for v3.
+    """
+    os.makedirs(group_path, exist_ok=True)
+    if zarr_format == 2:
+        with open(os.path.join(group_path, ".zattrs"), "w") as f:
+            json.dump(metadata, f, indent=2)
+        # Also need a minimal .zgroup
+        zg_path = os.path.join(group_path, ".zgroup")
+        if not os.path.exists(zg_path):
+            with open(zg_path, "w") as f:
+                json.dump({"zarr_format": 2}, f)
+    elif zarr_format == 3:
+        with open(os.path.join(group_path, "zarr.json"), "w") as f:
+            json.dump({
+                "zarr_format": 3,
+                "node_type": "group",
+                "attributes": metadata,
+            }, f, indent=2)
+    else:
+        raise ValueError(f"unsupported zarr_format: {zarr_format}")
+
+
+# ---------------------------------------------------------------------------
+# Super-chunk worker
+# ---------------------------------------------------------------------------
+
+
+def downsample_super_chunk(
+    s0_block: np.ndarray,
+    super_chunk_origin_voxels: np.ndarray,
+    per_lod_factors: list[tuple[int, int, int]],
+    downsample_func,
+    out_chunk_shape: np.ndarray,
+) -> dict[int, tuple[np.ndarray, np.ndarray]]:
+    """Compute all-LOD downsamples for a single super-chunk of s0.
+
+    ``s0_block`` is the s0 voxel data covering the super-chunk extent
+    (caller is responsible for reading + clipping/zero-padding).
+    Returns ``{lod: (output_block, output_origin_voxels)}`` for every
+    LOD ≥ 1. LOD 0 isn't emitted — the caller writes/links s0 separately.
+    """
+    out = {}
+    for k, factor in enumerate(per_lod_factors):
+        if k == 0:
+            continue
+        f = np.asarray(factor, dtype=int)
+        # Trim s0_block to the largest extent that's a multiple of f.
+        trim = (np.array(s0_block.shape) // f) * f
+        block = s0_block[: trim[0], : trim[1], : trim[2]]
+        if block.size == 0:
+            continue
+        ds_block, _ = downsample_func(block, tuple(f.tolist()))
+        out_origin = super_chunk_origin_voxels // f
+        out[k] = (ds_block, out_origin)
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Main entry point
+# ---------------------------------------------------------------------------
+
+
+def build_missing_pyramid_levels(
+    *,
+    s0_reader,  # callable(origin_voxels, shape_voxels) -> np.ndarray
+    s0_dataset_shape_voxels: np.ndarray,
+    s0_voxel_size_zyx: list[float],
+    s0_translation_zyx: list[float],
+    dtype: np.dtype,
+    num_lods: int,
+    existing_factors: set[tuple[int, int, int]],
+    output_zarr_path: str,
+    downsample_func,
+    roi_origin_voxels: np.ndarray | None = None,
+    roi_shape_voxels: np.ndarray | None = None,
+    out_chunk_shape_voxels: tuple[int, int, int] = (64, 64, 64),
+    alignment_mode: str = "snap",
+    anisotropy_tolerance: float = 1.5,
+    zarr_format: int = 2,
+    s0_source_path: str | None = None,
+    dispatch=None,
+) -> str:
+    """Build missing OME-NGFF pyramid levels and return the group path.
+
+    Reads s0 via ``s0_reader`` in chunk-aligned super-chunks (each
+    containing one output chunk worth of voxels at every LOD), computes
+    the per-LOD downsamples in memory, and writes each LOD's voxels to
+    ``output_zarr_path/s_k``.
+
+    On local filesystems, ``s0_source_path`` (if provided) is symlinked
+    in as ``output_zarr_path/s0``. For remote sources or symlink
+    failures, ``s0`` is left absent — discovery is responsible for
+    merging the source's own s0 metadata with this pyramid's s_k>0.
+
+    ``dispatch`` is an optional ``callable(func, args_iter) -> results``
+    for parallelizing the super-chunk pass. If ``None``, the pass is
+    sequential.
+
+    Returns the path to the pyramid group.
+    """
+    if num_lods <= 1:
+        return output_zarr_path  # nothing to build
+
+    voxel_size = np.asarray(s0_voxel_size_zyx, dtype=float)
+    translation = np.asarray(s0_translation_zyx, dtype=float)
+    dataset_shape = np.asarray(s0_dataset_shape_voxels, dtype=np.int64)
+
+    # Per-LOD cumulative factors (per-axis)
+    factors_per_lod = per_lod_factors_for_anisotropy(
+        voxel_size, num_lods, anisotropy_tolerance=anisotropy_tolerance,
+    )
+    max_factor = np.max(np.array(factors_per_lod), axis=0)  # per-axis
+
+    # ROI handling
+    if roi_origin_voxels is None:
+        roi_origin_voxels = np.zeros(3, dtype=np.int64)
+    if roi_shape_voxels is None:
+        roi_shape_voxels = dataset_shape - roi_origin_voxels
+
+    out_origin, out_shape, read_origin, read_shape = align_roi_voxels(
+        roi_origin_voxels, roi_shape_voxels, max_factor, alignment_mode,
+    )
+
+    if not np.array_equal(out_origin, roi_origin_voxels) or not np.array_equal(
+        out_shape, roi_shape_voxels
+    ):
+        logger.info(
+            "pyramid_builder: %s aligned ROI from origin=%s shape=%s to "
+            "origin=%s shape=%s (max per-axis factor=%s)",
+            alignment_mode, roi_origin_voxels.tolist(), roi_shape_voxels.tolist(),
+            out_origin.tolist(), out_shape.tolist(), max_factor.tolist(),
+        )
+
+    # Decide which levels need building (factors not in existing_factors)
+    missing = [
+        (k, factor) for k, factor in enumerate(factors_per_lod)
+        if k > 0 and factor not in existing_factors
+    ]
+    if not missing:
+        logger.info("pyramid_builder: all required scales already present")
+        return output_zarr_path
+
+    # Emit OME-NGFF metadata (covering ALL levels including s0 for completeness)
+    metadata = build_multiscales_metadata(
+        s0_voxel_size_zyx=voxel_size.tolist(),
+        s0_translation_zyx=translation.tolist(),
+        per_lod_factors=factors_per_lod,
+        version="0.4",
+    )
+    write_multiscales_metadata(output_zarr_path, metadata, zarr_format=zarr_format)
+
+    # Create empty zarr arrays for each missing level
+    out_chunk = np.asarray(out_chunk_shape_voxels, dtype=np.int64)
+    out_arrays = {}
+    for k, factor in missing:
+        f = np.asarray(factor, dtype=np.int64)
+        lod_shape = ((out_shape + f - 1) // f).tolist()
+        ds_path = os.path.join(output_zarr_path, f"s{k}")
+        if os.path.exists(ds_path):
+            shutil.rmtree(ds_path)
+        out_arrays[k] = _create_zarr_v2_array(
+            ds_path, shape=lod_shape, chunks=out_chunk.tolist(), dtype=dtype,
+        )
+
+    # Optionally symlink s0
+    if s0_source_path is not None:
+        _try_symlink_s0(output_zarr_path, s0_source_path)
+
+    # Super-chunk grid in s0 voxels
+    super_chunk_shape = out_chunk * max_factor
+    sc_grid = []
+    for z0 in range(int(out_origin[0]), int(out_origin[0] + out_shape[0]), int(super_chunk_shape[0])):
+        for y0 in range(int(out_origin[1]), int(out_origin[1] + out_shape[1]), int(super_chunk_shape[1])):
+            for x0 in range(int(out_origin[2]), int(out_origin[2] + out_shape[2]), int(super_chunk_shape[2])):
+                sc_grid.append(np.array([z0, y0, x0], dtype=np.int64))
+
+    def _process_super_chunk(sc_origin):
+        # Read range in s0 voxels — clip to dataset bounds for halo / dataset edge
+        read_end = np.minimum(sc_origin + super_chunk_shape, dataset_shape)
+        read_size = read_end - sc_origin
+        s0_block = s0_reader(sc_origin, read_size)
+        # Downsample to all missing LODs
+        downsamples = downsample_super_chunk(
+            s0_block, sc_origin, factors_per_lod,
+            downsample_func, out_chunk,
+        )
+        # Write each LOD's chunk
+        for k, (ds_block, ds_origin) in downsamples.items():
+            if k not in out_arrays:
+                continue
+            arr = out_arrays[k]
+            # Output position relative to out_origin/factor
+            f = np.asarray(factors_per_lod[k], dtype=np.int64)
+            local_origin = ds_origin - (out_origin // f)
+            _write_zarr_v2_region(arr, local_origin, ds_block)
+
+    if dispatch is None:
+        for sc in sc_grid:
+            _process_super_chunk(sc)
+    else:
+        dispatch(_process_super_chunk, sc_grid)
+
+    logger.info(
+        "pyramid_builder: built %d new scales at %s (factors=%s)",
+        len(missing), output_zarr_path,
+        [factor for _, factor in missing],
+    )
+    return output_zarr_path
+
+
+# ---------------------------------------------------------------------------
+# Minimal zarr v2 array helpers (no extra deps)
+# ---------------------------------------------------------------------------
+
+
+def _create_zarr_v2_array(path, shape, chunks, dtype):
+    """Create a zarr v2 array on disk and return a handle."""
+    import zarr
+    arr = zarr.open_array(
+        store=path, mode="w", shape=shape, chunks=chunks, dtype=dtype,
+        fill_value=0,
+    )
+    return arr
+
+
+def _write_zarr_v2_region(arr, origin_voxels, data):
+    """Write ``data`` into ``arr`` at the given origin (zyx voxel coords)."""
+    z, y, x = origin_voxels.tolist()
+    zE, yE, xE = z + data.shape[0], y + data.shape[1], x + data.shape[2]
+    # Clip in case the block extends past array bounds (boundary chunks)
+    zE = min(zE, arr.shape[0]); yE = min(yE, arr.shape[1]); xE = min(xE, arr.shape[2])
+    arr[z:zE, y:yE, x:xE] = data[: zE - z, : yE - y, : xE - x]
+
+
+def _try_symlink_s0(pyramid_path, s0_source_path):
+    """Symlink the pyramid's s0 to the source s0 array.
+
+    Returns True on success. Logs a warning and returns False otherwise
+    (e.g. cross-filesystem, remote source, permission denied).
+    """
+    target = os.path.join(pyramid_path, "s0")
+    if os.path.exists(target):
+        return True
+    try:
+        os.symlink(s0_source_path, target)
+        return True
+    except OSError as e:
+        logger.warning(
+            "pyramid_builder: could not symlink s0 from %s into %s: %s. "
+            "Discovery will fall back to merging the source group's own s0.",
+            s0_source_path, target, e,
+        )
+        return False
diff --git a/tests/test_pyramid_builder.py b/tests/test_pyramid_builder.py
new file mode 100644
index 0000000..63c8a19
--- /dev/null
+++ b/tests/test_pyramid_builder.py
@@ -0,0 +1,368 @@
+"""Tests for the OME-NGFF multiscale pyramid auto-builder.
+
+Focus areas:
+- Per-LOD per-axis factor calculation for anisotropic voxels.
+- ROI alignment (snap mode + halo mode).
+- OME-NGFF metadata correctness (scale + translation per level).
+- Fence-post correctness: per-block downsamples match a global single-pass
+  downsample bit-exact when blocks are 2^k-aligned.
+"""
+
+import json
+import os
+
+import numpy as np
+import pytest
+
+from mesh_n_bone.meshify.downsample import downsample_labels_3d
+from mesh_n_bone.util.pyramid_builder import (
+    align_roi_voxels,
+    build_missing_pyramid_levels,
+    build_multiscales_metadata,
+    downsample_super_chunk,
+    per_lod_factors_for_anisotropy,
+    write_multiscales_metadata,
+)
+
+
+class TestAnisotropyFactors:
+    def test_isotropic_doubles_every_axis(self):
+        f = per_lod_factors_for_anisotropy(np.array([8.0, 8.0, 8.0]), num_lods=4)
+        assert f == [(1, 1, 1), (2, 2, 2), (4, 4, 4), (8, 8, 8)]
+
+    def test_anisotropic_2to1_z_axis(self):
+        # voxel z=20, xy=8: z is coarsest → downsample xy first
+        f = per_lod_factors_for_anisotropy(np.array([20.0, 8.0, 8.0]), num_lods=4)
+        # LOD 0: (1,1,1)         voxel = [20, 8, 8]
+        # LOD 1: only axes with 8<=1.5*8=12 qualify → y,x → step (1,2,2)
+        #   voxel = [20, 16, 16]
+        # LOD 2: 16<=1.5*16=24, 20<=24 → all qualify → step (2,2,2)
+        #   voxel = [40, 32, 32]
+        # LOD 3: 32<=48, 40<=48 → all → step (2,2,2)
+        #   voxel = [80, 64, 64]
+        assert f[0] == (1, 1, 1)
+        assert f[1] == (1, 2, 2)
+        assert f[2] == (2, 4, 4)
+        assert f[3] == (4, 8, 8)
+
+    def test_extreme_anisotropy_multiple_xy_steps(self):
+        # voxel z=40, xy=8: z is 5x — needs two xy downsamples before z
+        f = per_lod_factors_for_anisotropy(np.array([40.0, 8.0, 8.0]), num_lods=4)
+        # LOD 0: (1,1,1) voxel=[40,8,8]
+        # LOD 1: step (1,2,2) → [40,16,16]
+        # LOD 2: 16<=1.5*16=24, 40>24 → step (1,2,2) → [40,32,32]
+        # LOD 3: 32<=48, 40<=48 → step (2,2,2) → [80,64,64]
+        assert f[1] == (1, 2, 2)
+        assert f[2] == (1, 4, 4)
+        assert f[3] == (2, 8, 8)
+
+
+class TestRoiAlignment:
+    def test_aligned_roi_snap_is_identity(self):
+        origin = np.array([0, 0, 0])
+        shape = np.array([64, 64, 64])
+        out_o, out_s, _, _ = align_roi_voxels(
+            origin, shape, np.array([8, 8, 8]), "snap",
+        )
+        assert out_o.tolist() == [0, 0, 0]
+        assert out_s.tolist() == [64, 64, 64]
+
+    def test_unaligned_roi_snap_drops_edges(self):
+        # origin 3 → snap up to 8 (drop 5 voxels off the start)
+        # extent 70 → end 73 → snap down to 72 (drop 1 voxel off the end)
+        # net snapped: origin 8, extent 64
+        origin = np.array([3, 3, 3])
+        shape = np.array([70, 70, 70])
+        out_o, out_s, _, _ = align_roi_voxels(
+            origin, shape, np.array([8, 8, 8]), "snap",
+        )
+        assert out_o.tolist() == [8, 8, 8]
+        assert out_s.tolist() == [64, 64, 64]
+
+    def test_unaligned_roi_halo_rounds_outward(self):
+        # origin 3 → halo round DOWN to 0
+        # extent 70 → end 73 → round UP to 80
+        # net: output spans 0..80
+        origin = np.array([3, 3, 3])
+        shape = np.array([70, 70, 70])
+        out_o, out_s, read_o, read_s = align_roi_voxels(
+            origin, shape, np.array([8, 8, 8]), "halo",
+        )
+        assert out_o.tolist() == [0, 0, 0]
+        assert out_s.tolist() == [80, 80, 80]
+        # In halo mode, read_* matches out_* (caller clips to dataset bounds)
+        assert read_o.tolist() == [0, 0, 0]
+        assert read_s.tolist() == [80, 80, 80]
+
+    def test_anisotropic_factor(self):
+        origin = np.array([3, 5, 5])
+        shape = np.array([20, 32, 32])
+        # max factor (4, 8, 8)
+        out_o, out_s, _, _ = align_roi_voxels(
+            origin, shape, np.array([4, 8, 8]), "snap",
+        )
+        # z: snap 3 -> 4, end 23 -> 20, shape 16
+        # y: snap 5 -> 8, end 37 -> 32, shape 24
+        # x: snap 5 -> 8, end 37 -> 32, shape 24
+        assert out_o.tolist() == [4, 8, 8]
+        assert out_s.tolist() == [16, 24, 24]
+
+
+class TestMultiscalesMetadata:
+    def test_isotropic_translations(self):
+        # s0_vs = 8, s0_tr = 4 (= corner + vs/2 of an origin at 0)
+        md = build_multiscales_metadata(
+            s0_voxel_size_zyx=[8.0, 8.0, 8.0],
+            s0_translation_zyx=[4.0, 4.0, 4.0],
+            per_lod_factors=[(1, 1, 1), (2, 2, 2), (4, 4, 4), (8, 8, 8)],
+        )
+        ds = md["multiscales"][0]["datasets"]
+        # s0: scale=8, tr=4
+        assert ds[0]["coordinateTransformations"][0]["scale"] == [8.0, 8.0, 8.0]
+        assert ds[0]["coordinateTransformations"][1]["translation"] == [4.0, 4.0, 4.0]
+        # s1: scale=16, tr = 4 + 0.5*8*(2-1) = 8
+        assert ds[1]["coordinateTransformations"][0]["scale"] == [16.0, 16.0, 16.0]
+        assert ds[1]["coordinateTransformations"][1]["translation"] == [8.0, 8.0, 8.0]
+        # s2: scale=32, tr = 4 + 0.5*8*(4-1) = 16
+        assert ds[2]["coordinateTransformations"][0]["scale"] == [32.0, 32.0, 32.0]
+        assert ds[2]["coordinateTransformations"][1]["translation"] == [16.0, 16.0, 16.0]
+        # s3: scale=64, tr = 4 + 0.5*8*(8-1) = 32
+        assert ds[3]["coordinateTransformations"][0]["scale"] == [64.0, 64.0, 64.0]
+        assert ds[3]["coordinateTransformations"][1]["translation"] == [32.0, 32.0, 32.0]
+
+    def test_anisotropic_translations_per_axis(self):
+        # s0_vs = [20, 8, 8], s0_tr = [10, 4, 4]
+        # LOD 1 cumulative factor (1, 2, 2) → vs=[20,16,16], tr=[10, 4+0.5*8*1, 4+0.5*8*1]=[10, 8, 8]
+        md = build_multiscales_metadata(
+            s0_voxel_size_zyx=[20.0, 8.0, 8.0],
+            s0_translation_zyx=[10.0, 4.0, 4.0],
+            per_lod_factors=[(1, 1, 1), (1, 2, 2), (2, 4, 4)],
+        )
+        ds = md["multiscales"][0]["datasets"]
+        assert ds[1]["coordinateTransformations"][0]["scale"] == [20.0, 16.0, 16.0]
+        assert ds[1]["coordinateTransformations"][1]["translation"] == [10.0, 8.0, 8.0]
+        # LOD 2 factor (2, 4, 4) → vs=[40,32,32], tr=[10+0.5*20*1, 4+0.5*8*3, 4+0.5*8*3]=[20, 16, 16]
+        assert ds[2]["coordinateTransformations"][0]["scale"] == [40.0, 32.0, 32.0]
+        assert ds[2]["coordinateTransformations"][1]["translation"] == [20.0, 16.0, 16.0]
+
+    def test_axes_metadata(self):
+        md = build_multiscales_metadata(
+            s0_voxel_size_zyx=[1.0, 1.0, 1.0],
+            s0_translation_zyx=[0.0, 0.0, 0.0],
+            per_lod_factors=[(1, 1, 1)],
+        )
+        axes = md["multiscales"][0]["axes"]
+        assert [a["name"] for a in axes] == ["z", "y", "x"]
+        assert all(a["type"] == "space" and a["unit"] == "nanometer" for a in axes)
+
+
+class TestFencepostCorrectness:
+    """Block-by-block downsample of an aligned volume must equal a
+    single-pass global downsample, voxel-by-voxel."""
+
+    def _build_synthetic_volume(self, shape=(64, 64, 64), num_labels=8, seed=42):
+        rng = np.random.default_rng(seed)
+        return rng.integers(low=0, high=num_labels, size=shape, dtype=np.uint8)
+
+    def test_super_chunk_matches_global_isotropic(self, tmp_path):
+        vol = self._build_synthetic_volume(shape=(32, 32, 32))
+        per_lod = [(1, 1, 1), (2, 2, 2), (4, 4, 4)]
+        # Global single-pass reference
+        ref_s1, _ = downsample_labels_3d(vol, (2, 2, 2))
+        ref_s2, _ = downsample_labels_3d(vol, (4, 4, 4))
+
+        # Build per-chunk via the super-chunk worker on 16-voxel super-chunks
+        super_chunk_shape = np.array([16, 16, 16])  # = out_chunk(4)*max_factor(4)
+        out_chunk = np.array([4, 4, 4])
+        result_s1 = np.zeros((16, 16, 16), dtype=vol.dtype)
+        result_s2 = np.zeros((8, 8, 8), dtype=vol.dtype)
+        for z0 in range(0, 32, 16):
+            for y0 in range(0, 32, 16):
+                for x0 in range(0, 32, 16):
+                    s0_block = vol[z0:z0+16, y0:y0+16, x0:x0+16]
+                    sc_origin = np.array([z0, y0, x0])
+                    downs = downsample_super_chunk(
+                        s0_block, sc_origin, per_lod, downsample_labels_3d, out_chunk,
+                    )
+                    s1_block, s1_origin = downs[1]
+                    s2_block, s2_origin = downs[2]
+                    result_s1[
+                        s1_origin[0]:s1_origin[0]+s1_block.shape[0],
+                        s1_origin[1]:s1_origin[1]+s1_block.shape[1],
+                        s1_origin[2]:s1_origin[2]+s1_block.shape[2],
+                    ] = s1_block
+                    result_s2[
+                        s2_origin[0]:s2_origin[0]+s2_block.shape[0],
+                        s2_origin[1]:s2_origin[1]+s2_block.shape[1],
+                        s2_origin[2]:s2_origin[2]+s2_block.shape[2],
+                    ] = s2_block
+
+        np.testing.assert_array_equal(result_s1, ref_s1)
+        np.testing.assert_array_equal(result_s2, ref_s2)
+
+    def test_super_chunk_matches_global_anisotropic(self):
+        vol = self._build_synthetic_volume(shape=(16, 64, 64))
+        per_lod = [(1, 1, 1), (1, 2, 2), (2, 4, 4)]
+        ref_s1, _ = downsample_labels_3d(vol, (1, 2, 2))
+        ref_s2, _ = downsample_labels_3d(vol, (2, 4, 4))
+
+        # Super-chunk size = out_chunk * max per-axis factor = (8*2, 8*4, 8*4) = (16, 32, 32)
+        super_chunk = np.array([16, 32, 32])
+        out_chunk = np.array([8, 8, 8])
+        result_s1 = np.zeros((16, 32, 32), dtype=vol.dtype)
+        result_s2 = np.zeros((8, 16, 16), dtype=vol.dtype)
+        for z0 in range(0, 16, 16):
+            for y0 in range(0, 64, 32):
+                for x0 in range(0, 64, 32):
+                    s0_block = vol[z0:z0+16, y0:y0+32, x0:x0+32]
+                    downs = downsample_super_chunk(
+                        s0_block, np.array([z0, y0, x0]),
+                        per_lod, downsample_labels_3d, out_chunk,
+                    )
+                    s1b, s1o = downs[1]
+                    s2b, s2o = downs[2]
+                    result_s1[
+                        s1o[0]:s1o[0]+s1b.shape[0],
+                        s1o[1]:s1o[1]+s1b.shape[1],
+                        s1o[2]:s1o[2]+s1b.shape[2],
+                    ] = s1b
+                    result_s2[
+                        s2o[0]:s2o[0]+s2b.shape[0],
+                        s2o[1]:s2o[1]+s2b.shape[1],
+                        s2o[2]:s2o[2]+s2b.shape[2],
+                    ] = s2b
+        np.testing.assert_array_equal(result_s1, ref_s1)
+        np.testing.assert_array_equal(result_s2, ref_s2)
+
+
+class TestEndToEndPyramidBuild:
+    """Drive ``build_missing_pyramid_levels`` end-to-end against a
+    synthetic volume backed by an in-memory s0_reader. Verify the on-disk
+    output matches a single-pass reference downsample."""
+
+    def test_build_isotropic_pyramid(self, tmp_path):
+        rng = np.random.default_rng(7)
+        vol = rng.integers(low=0, high=4, size=(32, 32, 32), dtype=np.uint8)
+
+        def s0_reader(origin, shape):
+            z, y, x = origin.tolist()
+            sz, sy, sx = shape.tolist()
+            return vol[z:z+sz, y:y+sy, x:x+sx].copy()
+
+        out_path = str(tmp_path / "pyramid.zarr")
+        result = build_missing_pyramid_levels(
+            s0_reader=s0_reader,
+            s0_dataset_shape_voxels=np.array(vol.shape),
+            s0_voxel_size_zyx=[8.0, 8.0, 8.0],
+            s0_translation_zyx=[4.0, 4.0, 4.0],
+            dtype=vol.dtype,
+            num_lods=3,
+            existing_factors=set(),
+            output_zarr_path=out_path,
+            downsample_func=downsample_labels_3d,
+            out_chunk_shape_voxels=(4, 4, 4),
+        )
+
+        assert result == out_path
+        # OME metadata file present
+        assert os.path.exists(os.path.join(out_path, ".zattrs"))
+        with open(os.path.join(out_path, ".zattrs")) as f:
+            md = json.load(f)
+        assert len(md["multiscales"][0]["datasets"]) == 3
+
+        # s1 and s2 zarrs exist and match the global downsample
+        import zarr
+        s1 = zarr.open_array(os.path.join(out_path, "s1"), mode="r")
+        s2 = zarr.open_array(os.path.join(out_path, "s2"), mode="r")
+        ref_s1, _ = downsample_labels_3d(vol, (2, 2, 2))
+        ref_s2, _ = downsample_labels_3d(vol, (4, 4, 4))
+        np.testing.assert_array_equal(s1[:], ref_s1)
+        np.testing.assert_array_equal(s2[:], ref_s2)
+
+    def test_symlink_s0_when_local(self, tmp_path):
+        # Set up a dummy "source" s0 directory
+        s0_src = tmp_path / "source_s0"
+        s0_src.mkdir()
+        (s0_src / "marker.txt").write_text("hi")
+
+        rng = np.random.default_rng(0)
+        vol = rng.integers(low=0, high=2, size=(16, 16, 16), dtype=np.uint8)
+
+        out_path = str(tmp_path / "pyramid.zarr")
+        build_missing_pyramid_levels(
+            s0_reader=lambda o, s: vol[o[0]:o[0]+s[0], o[1]:o[1]+s[1], o[2]:o[2]+s[2]].copy(),
+            s0_dataset_shape_voxels=np.array(vol.shape),
+            s0_voxel_size_zyx=[1.0, 1.0, 1.0],
+            s0_translation_zyx=[0.5, 0.5, 0.5],
+            dtype=vol.dtype,
+            num_lods=2,
+            existing_factors=set(),
+            output_zarr_path=out_path,
+            downsample_func=downsample_labels_3d,
+            out_chunk_shape_voxels=(4, 4, 4),
+            s0_source_path=str(s0_src),
+        )
+
+        s0_link = os.path.join(out_path, "s0")
+        assert os.path.islink(s0_link)
+        assert os.path.exists(os.path.join(s0_link, "marker.txt"))
+
+    def test_existing_factor_skipped(self, tmp_path):
+        """If a factor is already present (existing_factors), it shouldn't
+        be re-built."""
+        rng = np.random.default_rng(0)
+        vol = rng.integers(low=0, high=4, size=(16, 16, 16), dtype=np.uint8)
+        out_path = str(tmp_path / "pyramid.zarr")
+        build_missing_pyramid_levels(
+            s0_reader=lambda o, s: vol[o[0]:o[0]+s[0], o[1]:o[1]+s[1], o[2]:o[2]+s[2]].copy(),
+            s0_dataset_shape_voxels=np.array(vol.shape),
+            s0_voxel_size_zyx=[1.0, 1.0, 1.0],
+            s0_translation_zyx=[0.5, 0.5, 0.5],
+            dtype=vol.dtype,
+            num_lods=3,
+            # Pretend s1 already exists
+            existing_factors={(2, 2, 2)},
+            output_zarr_path=out_path,
+            downsample_func=downsample_labels_3d,
+            out_chunk_shape_voxels=(4, 4, 4),
+        )
+        # s1 should NOT have been written, but s2 should
+        assert not os.path.isdir(os.path.join(out_path, "s1"))
+        assert os.path.isdir(os.path.join(out_path, "s2"))
+
+
+class TestUnalignedRoiSnap:
+    """When the ROI is unaligned, snap mode drops up to max_factor-1
+    voxels per edge; downsampled output covers the snapped region only."""
+
+    def test_snap_mode_unaligned_roi(self, tmp_path):
+        rng = np.random.default_rng(0)
+        vol = rng.integers(low=0, high=4, size=(40, 40, 40), dtype=np.uint8)
+
+        def reader(o, s):
+            return vol[o[0]:o[0]+s[0], o[1]:o[1]+s[1], o[2]:o[2]+s[2]].copy()
+
+        out_path = str(tmp_path / "pyramid.zarr")
+        build_missing_pyramid_levels(
+            s0_reader=reader,
+            s0_dataset_shape_voxels=np.array(vol.shape),
+            s0_voxel_size_zyx=[1.0, 1.0, 1.0],
+            s0_translation_zyx=[0.5, 0.5, 0.5],
+            dtype=vol.dtype,
+            num_lods=3,                          # max factor 4
+            existing_factors=set(),
+            output_zarr_path=out_path,
+            downsample_func=downsample_labels_3d,
+            out_chunk_shape_voxels=(4, 4, 4),
+            roi_origin_voxels=np.array([3, 3, 3]),
+            roi_shape_voxels=np.array([34, 34, 34]),
+            alignment_mode="snap",
+        )
+        # Snap: origin -> 4, end -> 36 -> shape 32
+        # s1: 16x16x16 from vol[4:36, 4:36, 4:36] downsampled by 2
+        import zarr
+        s1 = zarr.open_array(os.path.join(out_path, "s1"), mode="r")
+        assert s1.shape == (16, 16, 16)
+        # Compare to global downsample of the snapped region
+        ref_s1, _ = downsample_labels_3d(vol[4:36, 4:36, 4:36], (2, 2, 2))
+        np.testing.assert_array_equal(s1[:], ref_s1)

From 5fa8debbe1753595d9d11674061a68b40905aa4d Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Fri, 12 Jun 2026 15:09:09 -0400
Subject: [PATCH 02/19] Wire pyramid auto-build into Meshify downsample
 multires
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the input zarr exposes only s0, _generate_multires_downsample now
calls _build_missing_pyramid_scales() upfront to pre-compute the
missing s_k arrays into ``{output_directory}/_intermediate_scales.zarr``
via a single parallel super-chunk pass. Each LOD then reads from the
pre-built pyramid instead of re-reading s0 once per LOD.

New Meshify kwargs:
- keep_intermediate_scales: bool = False — clean up the pyramid zarr
  after the pipeline finishes (default) or keep for reuse on
  subsequent runs over the same data.
- pyramid_alignment_mode: str = "snap" — "snap" rounds unaligned ROIs
  INWARD (drops up to max_factor-1 voxels per edge); "halo" rounds
  OUTWARD and reads beyond the ROI to complete boundary cubes (no
  data loss).

Implementation details:
- _build_missing_pyramid_scales() opens a tensorstore handle for s0,
  builds an (origin_voxels, shape_voxels) → ndarray closure that uses
  to_ndarray_tensorstore, dispatches on downsample_method, computes
  per-axis factors via per_lod_factors_for_anisotropy, and calls
  build_missing_pyramid_levels.
- ValueError from invalid downsample_method propagates so test
  expectations stay valid; transient I/O errors fall back to the
  per-LOD in-worker downsample path.
- downsample_method="nearest" skips pyramid build (worker stride is
  free; cache adds no value).
- Cleanup at end of get_multiscale_meshes removes the pyramid unless
  keep_intermediate_scales is set.

Tests (4 new, 352 total):
- test_auto_builds_pyramid_for_single_scale_input — exercises the
  end-to-end auto-build path; spy on _generate_meshes_at_scale calls
  confirms LODs 1+ read from `_intermediate_scales.zarr/s_k`.
- test_keep_intermediate_scales_false_cleans_up — default cleanup
  removes the pyramid after the pipeline.
- test_halo_mode_reads_beyond_roi — halo mode logs reads starting at
  voxel 0 even when ROI origin is voxel 3; output matches the
  halo-extended global downsample bit-exact.
- test_halo_clipped_at_dataset_bounds — halo extending past dataset
  bounds clips cleanly without exceptions; output shape is correct
  rounded-outward size.

README documents the new knobs.
---
 README.md                          |  12 ++
 src/mesh_n_bone/meshify/meshify.py | 182 +++++++++++++++++++++++++++++
 tests/test_integration_meshify.py  | 131 +++++++++++++++++++++
 tests/test_pyramid_builder.py      |  95 +++++++++++++++
 4 files changed, 420 insertions(+)

diff --git a/README.md b/README.md
index d99daa8..23ef2ba 100644
--- a/README.md
+++ b/README.md
@@ -137,6 +137,18 @@ use_existing_scales: true        # Set false to FORCE in-worker downsampling at
                                  # when you don't trust the source's downsampling and want
                                  # consistent `downsample_method` behavior at every LOD.
                                  # (default: true)
+# When the input zarr exposes only s0 (no pre-built coarser levels),
+# the downsample multires strategy auto-builds the missing s_k arrays
+# into `{output_directory}/_intermediate_scales.zarr` in a single
+# parallel super-chunk pass over s0. Each LOD then reads from the
+# pre-built pyramid instead of repeatedly re-reading s0.
+keep_intermediate_scales: false  # Keep the auto-built `_intermediate_scales.zarr` after the
+                                 # pipeline finishes. Useful for re-running with different mesh
+                                 # params on the same data; disk-heavy on big datasets. (default: false)
+pyramid_alignment_mode: snap     # "snap" rounds an unaligned ROI INWARD to multiples of the
+                                 # max per-axis factor (drops up to max_factor-1 voxels per edge);
+                                 # "halo" rounds OUTWARD and reads beyond the ROI to complete the
+                                 # boundary cubes (no data loss). (default: snap)
 decimation_factor: 6             # Per-LOD face reduction factor. In decimate strategy: literal ratio
                                  # between LODs. In downsample strategy: target ratio used to derive
                                  # per-LOD target_reduction so the LOD-to-LOD face count drops by this
diff --git a/src/mesh_n_bone/meshify/meshify.py b/src/mesh_n_bone/meshify/meshify.py
index c5012f7..939f54d 100644
--- a/src/mesh_n_bone/meshify/meshify.py
+++ b/src/mesh_n_bone/meshify/meshify.py
@@ -882,6 +882,8 @@ def __init__(
         downsample_method: str = "mode",
         multires_strategy: str = "downsample",
         use_existing_scales: bool = True,
+        keep_intermediate_scales: bool = False,
+        pyramid_alignment_mode: str = "snap",
         decimation_factor: int = 6,
         decimation_aggressiveness: int = 7,
         delete_decimated_meshes: bool = True,
@@ -1055,6 +1057,8 @@ def __init__(
         self.input_path = input_path
         self.multires_strategy = multires_strategy
         self.use_existing_scales = use_existing_scales
+        self.keep_intermediate_scales = keep_intermediate_scales
+        self.pyramid_alignment_mode = pyramid_alignment_mode
         self.decimation_factor = decimation_factor
         self.decimation_aggressiveness = decimation_aggressiveness
         self.delete_decimated_meshes = delete_decimated_meshes
@@ -2013,6 +2017,155 @@ def _discover_existing_scales(self):
         self._existing_scales_cache = result
         return result
 
+    def _build_missing_pyramid_scales(self, num_lods):
+        """Auto-build missing OME-NGFF multiscale levels for the
+        downsample multires strategy.
+
+        Looks at what the source group exposes, computes the per-axis
+        cumulative factors needed for ``num_lods``, and builds whatever's
+        missing under ``{output_directory}/_intermediate_scales.zarr``.
+
+        Returns ``{int_factor: dataset_path}`` mapping ONLY the
+        newly-built scales (so the caller can merge with the source's
+        existing scales). The mapping key is a single int — only valid
+        for isotropic-factor levels — to match the existing
+        ``_discover_existing_scales`` cache shape. Anisotropic factor
+        tuples aren't yet honored by the downstream LOD dispatch, so
+        the builder skips emitting them into the cache (they still get
+        written to disk for future use); falls back to in-worker
+        downsampling for those LODs.
+        """
+        from mesh_n_bone.util.pyramid_builder import (
+            per_lod_factors_for_anisotropy,
+            build_missing_pyramid_levels,
+        )
+        from mesh_n_bone.util.image_data_interface import to_ndarray_tensorstore
+        from funlib.geometry import Coordinate, Roi
+
+        voxel_size = np.asarray(self.true_voxel_size, dtype=float)
+        factors_per_lod = per_lod_factors_for_anisotropy(voxel_size, num_lods)
+
+        existing = self._discover_existing_scales() if self.use_existing_scales else {}
+        # Only build levels where the factor is "uniform" — i.e. (k,k,k) —
+        # since downstream LOD dispatch is by single int factor. Anisotropic
+        # output is still written to disk (useful for future runs or external
+        # tools) but isn't fed back into the cache.
+        needed_uniform = []
+        for k, factor in enumerate(factors_per_lod):
+            if k == 0:
+                continue
+            if factor[0] == factor[1] == factor[2]:
+                if factor[0] not in existing:
+                    needed_uniform.append((k, factor))
+        if not needed_uniform:
+            # Nothing to build OR everything is anisotropic (no cache benefit)
+            return {}
+
+        pyramid_path = os.path.join(
+            self.output_directory, "_intermediate_scales.zarr",
+        )
+
+        # ROI in voxel coordinates relative to the dataset's own (0, 0, 0)
+        seg_arr = self.segmentation_array
+        ds_offset = np.asarray(seg_arr.roi.offset, dtype=np.int64)
+        vs_int = np.asarray(seg_arr.voxel_size, dtype=np.int64)
+        roi_world_origin = np.asarray(self.roi.offset, dtype=np.int64)
+        roi_world_shape = np.asarray(self.roi.shape, dtype=np.int64)
+        roi_voxel_origin = (roi_world_origin - ds_offset) // vs_int
+        roi_voxel_shape = roi_world_shape // vs_int
+
+        # Dataset shape in voxels
+        dataset_shape = np.asarray(seg_arr.data.shape[-3:], dtype=np.int64)
+
+        # Open a tensorstore handle for s0 (Meshify's segmentation_array.data is
+        # ArrayMetadata, not a tensorstore — to_ndarray_tensorstore needs the
+        # real handle).
+        from mesh_n_bone.util.image_data_interface import open_ds_tensorstore
+        dataset_path = self._dataset_path
+        swap_axes = self._swap_axes
+        ds_offset_coord = Coordinate(seg_arr.roi.offset)
+        vs_coord = Coordinate(seg_arr.voxel_size)
+        ts_dataset = open_ds_tensorstore(dataset_path)
+
+        def _s0_reader(origin_voxels, shape_voxels):
+            world_origin = ds_offset_coord + Coordinate(origin_voxels.tolist()) * vs_coord
+            world_shape = Coordinate(shape_voxels.tolist()) * vs_coord
+            roi = Roi(world_origin, world_shape)
+            return to_ndarray_tensorstore(
+                ts_dataset, roi, vs_coord, ds_offset_coord,
+                swap_axes=swap_axes, fill_value=0, source_path=dataset_path,
+            )
+
+        # Downsample function dispatch
+        from mesh_n_bone.meshify.downsample import (
+            downsample_labels_3d,
+            downsample_binary_3d,
+            downsample_labels_3d_suppress_zero,
+        )
+        downsample_dispatch = {
+            "mode": downsample_labels_3d,
+            "mode_suppress_zero": downsample_labels_3d_suppress_zero,
+            "binary": downsample_binary_3d,
+        }
+        if self.downsample_method == "nearest":
+            # Striding is free at the worker; no point caching a pyramid.
+            return {}
+        if self.downsample_method not in downsample_dispatch:
+            raise ValueError(
+                f"Unknown downsample_method '{self.downsample_method}'. "
+                f"Choose from: {list(downsample_dispatch.keys()) + ['nearest']}"
+            )
+        downsample_func = downsample_dispatch[self.downsample_method]
+
+        # Choose a reasonable output chunk shape: largest multiple of source
+        # chunk shape that's also a multiple of max factor, capped at 256.
+        src_chunk = np.asarray(seg_arr.chunk_shape, dtype=np.int64)
+        max_factor = np.max(np.array([f for _, f in needed_uniform]), axis=0)
+        # Round source chunk to be a multiple of max_factor; cap at 256
+        out_chunk_shape = []
+        for ax in range(3):
+            ch = int(src_chunk[ax])
+            mf = int(max_factor[ax])
+            # round up to multiple of mf, then cap
+            ch = max(mf, (ch + mf - 1) // mf * mf)
+            ch = min(ch, 256)
+            # ensure still a multiple of mf after the cap
+            ch = (ch // mf) * mf
+            ch = max(ch, mf)
+            out_chunk_shape.append(ch)
+
+        # Build
+        logger.info(
+            "pyramid_builder: %d missing scale(s) detected for downsample multires "
+            "(factors=%s); building under %s with alignment_mode=%s",
+            len(needed_uniform),
+            [factor for _, factor in needed_uniform],
+            pyramid_path,
+            self.pyramid_alignment_mode,
+        )
+        build_missing_pyramid_levels(
+            s0_reader=_s0_reader,
+            s0_dataset_shape_voxels=dataset_shape,
+            s0_voxel_size_zyx=self.true_voxel_size.tolist(),
+            s0_translation_zyx=self.true_offset.tolist(),
+            dtype=seg_arr.data.dtype,
+            num_lods=num_lods,
+            existing_factors=set(),  # build all uniform-factor levels into pyramid
+            output_zarr_path=pyramid_path,
+            downsample_func=downsample_func,
+            roi_origin_voxels=roi_voxel_origin,
+            roi_shape_voxels=roi_voxel_shape,
+            out_chunk_shape_voxels=tuple(out_chunk_shape),
+            alignment_mode=self.pyramid_alignment_mode,
+            s0_source_path=self._dataset_path,
+        )
+
+        # Return mapping of newly-built factors -> path within the pyramid
+        return {
+            factor[0]: os.path.join(pyramid_path, f"s{k}")
+            for k, factor in needed_uniform
+        }
+
     def _per_lod_target_reduction(self, lod: int) -> float:
         """Per-LOD target_reduction for the downsample multires strategy.
 
@@ -2192,6 +2345,14 @@ def _pack(workers, config):
             with Timing_Messager("Cleaning up intermediate mesh files", logger):
                 shutil.rmtree(mesh_lods_dir, ignore_errors=True)
 
+        # Pyramid scale cache: keep when explicitly requested for reuse,
+        # otherwise tear down so the output dir doesn't accumulate the
+        # downsampled-segmentation zarrs.
+        pyramid_path = os.path.join(self.output_directory, "_intermediate_scales.zarr")
+        if os.path.isdir(pyramid_path) and not self.keep_intermediate_scales:
+            with Timing_Messager("Cleaning up intermediate scale zarr", logger):
+                shutil.rmtree(pyramid_path, ignore_errors=True)
+
         logger.info("Multires pipeline complete")
 
     def _generate_multires_decimate(self, mesh_lods_dir, lods):
@@ -2276,6 +2437,27 @@ def _generate_multires_downsample(self, mesh_lods_dir, lods):
             self._discover_existing_scales() if self.use_existing_scales else {}
         )
         base_ds = self.downsample_factor or 1
+
+        # If any LOD's required factor is missing from the input zarr's
+        # OME-NGFF multiscales, build the missing scales upfront into an
+        # intermediate pyramid zarr. This avoids the slow per-LOD
+        # in-worker fallback (reads s0 once instead of N times).
+        if self.use_existing_scales and base_ds == 1 and self.num_lods > 1:
+            needed_uniform = [2 ** lod for lod in lods if lod > 0]
+            if any(f not in existing_scales for f in needed_uniform):
+                try:
+                    new_scales = self._build_missing_pyramid_scales(self.num_lods)
+                    existing_scales = {**existing_scales, **new_scales}
+                except ValueError:
+                    # User-config errors (e.g. invalid downsample_method) must
+                    # propagate — the in-worker path would raise the same error.
+                    raise
+                except Exception as e:
+                    logger.warning(
+                        "pyramid_builder: failed to pre-build missing scales "
+                        "(%s); falling back to in-worker downsampling per LOD.", e,
+                    )
+
         for lod in lods:
             scale_dir = f"{mesh_lods_dir}/s{lod}"
             target_factor = 2 ** lod
diff --git a/tests/test_integration_meshify.py b/tests/test_integration_meshify.py
index 3a4a1d2..354fdae 100644
--- a/tests/test_integration_meshify.py
+++ b/tests/test_integration_meshify.py
@@ -460,6 +460,137 @@ def _spy(self_, output_mesh_dir, downsample_factor=None,
             )
 
 
+class TestMeshifyDownsampleAutoBuildPyramid:
+    """When the input zarr exposes only s0 (no pre-computed coarser
+    scales), the downsample multires strategy auto-builds the missing
+    s_k arrays under {output}/_intermediate_scales.zarr/, then reads
+    each LOD from the corresponding s_k. Avoids the slow per-LOD
+    in-worker fallback that re-reads s0 once per LOD."""
+
+    @staticmethod
+    def _create_single_scale_ome_zarr(tmpdir):
+        import json
+        import tensorstore as ts
+        zarr_path = os.path.join(tmpdir, "single_scale.zarr")
+        os.makedirs(zarr_path, exist_ok=True)
+        N = 64
+        vol = np.zeros((N, N, N), dtype=np.uint8)
+        zz, yy, xx = np.indices(vol.shape) - N // 2
+        vol[(xx*xx + yy*yy + zz*zz) < 24*24] = 1
+
+        s0_dir = os.path.join(zarr_path, "s0")
+        os.makedirs(s0_dir, exist_ok=True)
+        arr = ts.open({
+            "driver": "zarr3",
+            "kvstore": {"driver": "file", "path": s0_dir},
+            "metadata": {
+                "shape": list(vol.shape),
+                "data_type": "uint8",
+                "chunk_grid": {"name": "regular",
+                                "configuration": {"chunk_shape": [16, 16, 16]}},
+            },
+            "create": True, "delete_existing": True,
+        }).result()
+        arr.write(vol).result()
+
+        with open(os.path.join(zarr_path, "zarr.json"), "w") as f:
+            json.dump({
+                "zarr_format": 3, "node_type": "group",
+                "attributes": {
+                    "multiscales": [{
+                        "version": "0.5",
+                        "axes": [
+                            {"name": "z", "type": "space", "unit": "nanometer"},
+                            {"name": "y", "type": "space", "unit": "nanometer"},
+                            {"name": "x", "type": "space", "unit": "nanometer"},
+                        ],
+                        "datasets": [{
+                            "path": "s0",
+                            "coordinateTransformations": [
+                                {"type": "scale", "scale": [8.0, 8.0, 8.0]},
+                            ],
+                        }],
+                    }],
+                },
+            }, f)
+        return f"{zarr_path}/s0", zarr_path
+
+    def test_auto_builds_pyramid_for_single_scale_input(self, tmp_output_dir):
+        s0_path, _ = self._create_single_scale_ome_zarr(tmp_output_dir)
+        output_dir = os.path.join(tmp_output_dir, "out_autobuild")
+        m = Meshify(
+            input_path=s0_path,
+            output_directory=output_dir,
+            num_workers=1,
+            do_multires=True,
+            num_lods=3,
+            multires_strategy="downsample",
+            keep_intermediate_scales=True,  # keep for inspection
+            target_faces_per_lod0_chunk=200,
+            check_mesh_validity=False,
+            do_analysis=False,
+            do_simplification=True,
+        )
+        # Spy on which LODs read from the pyramid vs in-worker downsample
+        calls = []
+        orig = m._generate_meshes_at_scale.__func__
+        def _spy(self_, output_mesh_dir, downsample_factor=None,
+                  target_reduction_override=None, input_dataset_path=None):
+            calls.append({
+                "downsample_factor": downsample_factor,
+                "input_dataset_path": input_dataset_path,
+            })
+            return orig(self_, output_mesh_dir,
+                         downsample_factor=downsample_factor,
+                         target_reduction_override=target_reduction_override,
+                         input_dataset_path=input_dataset_path)
+        m._generate_meshes_at_scale = _spy.__get__(m, Meshify)
+        m.get_meshes()
+
+        # Pyramid zarr should have been written
+        pyramid_root = os.path.join(output_dir, "_intermediate_scales.zarr")
+        assert os.path.isdir(pyramid_root), (
+            f"Expected pyramid at {pyramid_root}"
+        )
+        for k in (1, 2):  # LOD 0 may be a symlink, LODs 1+ are real arrays
+            assert os.path.isdir(os.path.join(pyramid_root, f"s{k}")), (
+                f"Expected pyramid s{k} array on disk"
+            )
+
+        # Every LOD > 0 should have read from the pyramid (input_dataset_path
+        # set), not via in-worker downsample_factor.
+        assert len(calls) == 3
+        for k in (1, 2):
+            assert calls[k]["input_dataset_path"] is not None, (
+                f"LOD {k} should read from auto-built pyramid; "
+                f"call={calls[k]}"
+            )
+            assert "_intermediate_scales.zarr" in calls[k]["input_dataset_path"]
+
+    def test_keep_intermediate_scales_false_cleans_up(self, tmp_output_dir):
+        s0_path, _ = self._create_single_scale_ome_zarr(tmp_output_dir)
+        output_dir = os.path.join(tmp_output_dir, "out_autobuild_cleanup")
+        m = Meshify(
+            input_path=s0_path,
+            output_directory=output_dir,
+            num_workers=1,
+            do_multires=True,
+            num_lods=3,
+            multires_strategy="downsample",
+            keep_intermediate_scales=False,  # default: clean up
+            target_faces_per_lod0_chunk=200,
+            check_mesh_validity=False,
+            do_analysis=False,
+            do_simplification=True,
+        )
+        m.get_meshes()
+        # Pyramid zarr should be gone
+        pyramid_root = os.path.join(output_dir, "_intermediate_scales.zarr")
+        assert not os.path.exists(pyramid_root), (
+            f"Pyramid should have been cleaned up; still exists at {pyramid_root}"
+        )
+
+
 class TestMeshifyDownsampleStrategyOverride:
     """The use_existing_scales=False override forces in-worker
     downsampling at every LOD, even when matching pre-built scales
diff --git a/tests/test_pyramid_builder.py b/tests/test_pyramid_builder.py
index 63c8a19..47795ba 100644
--- a/tests/test_pyramid_builder.py
+++ b/tests/test_pyramid_builder.py
@@ -331,6 +331,101 @@ def test_existing_factor_skipped(self, tmp_path):
         assert os.path.isdir(os.path.join(out_path, "s2"))
 
 
+class TestUnalignedRoiHalo:
+    """When ROI is unaligned and alignment_mode='halo', the pyramid
+    builder reads beyond the ROI to complete boundary cubes. Output
+    covers the ROI rounded OUTWARD to factor boundaries — no data loss
+    at unaligned ROI edges."""
+
+    def test_halo_mode_reads_beyond_roi(self, tmp_path):
+        """With ROI [3, 37) (size 34) and max factor 4, halo mode should
+        round outward to [0, 40) and produce s1=20 voxels, s2=10 voxels.
+
+        The downsampled values must match a global single-pass downsample
+        of vol[0:40, 0:40, 0:40] (the halo-extended region) — proving the
+        halo reads actually pulled the surrounding voxels."""
+        rng = np.random.default_rng(0)
+        vol = rng.integers(low=0, high=4, size=(40, 40, 40), dtype=np.uint8)
+
+        # Reader tracks what voxel ranges it was asked for, so we can
+        # assert the halo actually fetched outside the ROI.
+        read_log = []
+        def reader(o, s):
+            read_log.append((tuple(o.tolist()), tuple(s.tolist())))
+            return vol[o[0]:o[0]+s[0], o[1]:o[1]+s[1], o[2]:o[2]+s[2]].copy()
+
+        out_path = str(tmp_path / "pyramid_halo.zarr")
+        build_missing_pyramid_levels(
+            s0_reader=reader,
+            s0_dataset_shape_voxels=np.array(vol.shape),
+            s0_voxel_size_zyx=[1.0, 1.0, 1.0],
+            s0_translation_zyx=[0.5, 0.5, 0.5],
+            dtype=vol.dtype,
+            num_lods=3,                       # max factor 4
+            existing_factors=set(),
+            output_zarr_path=out_path,
+            downsample_func=downsample_labels_3d,
+            out_chunk_shape_voxels=(4, 4, 4),
+            roi_origin_voxels=np.array([3, 3, 3]),
+            roi_shape_voxels=np.array([34, 34, 34]),
+            alignment_mode="halo",
+        )
+
+        # Output region: ROI rounded OUTWARD to factor 4 → [0:40, 0:40, 0:40].
+        # Downsampled by 2 → 20³; by 4 → 10³.
+        import zarr
+        s1 = zarr.open_array(os.path.join(out_path, "s1"), mode="r")
+        s2 = zarr.open_array(os.path.join(out_path, "s2"), mode="r")
+        assert s1.shape == (20, 20, 20)
+        assert s2.shape == (10, 10, 10)
+
+        # Match global single-pass downsample of the halo-extended region
+        ref_s1, _ = downsample_labels_3d(vol[0:40, 0:40, 0:40], (2, 2, 2))
+        ref_s2, _ = downsample_labels_3d(vol[0:40, 0:40, 0:40], (4, 4, 4))
+        np.testing.assert_array_equal(s1[:], ref_s1)
+        np.testing.assert_array_equal(s2[:], ref_s2)
+
+        # The reader log should show reads starting at voxel 0 — outside
+        # the original ROI which starts at voxel 3. If snap mode had been
+        # used, reads would never go below voxel 4.
+        min_read_origin = min(o for o, _ in read_log)
+        assert min_read_origin == (0, 0, 0), (
+            f"halo mode should read from voxel 0 (outside original ROI); "
+            f"min read origin was {min_read_origin}"
+        )
+
+    def test_halo_clipped_at_dataset_bounds(self, tmp_path):
+        """When the halo would extend past the dataset, the read is clipped
+        and zero-padding (or partial cubes) take over. The output should
+        still complete cleanly — no exceptions, output array fully written."""
+        # 18-voxel dataset, ROI [3, 18) (size 15), max factor 4 → halo
+        # rounds out to [0, 20). Dataset bounds clip the read at 18.
+        rng = np.random.default_rng(1)
+        vol = rng.integers(low=0, high=4, size=(18, 18, 18), dtype=np.uint8)
+        out_path = str(tmp_path / "pyramid_halo_clip.zarr")
+        build_missing_pyramid_levels(
+            s0_reader=lambda o, s: vol[o[0]:o[0]+s[0], o[1]:o[1]+s[1], o[2]:o[2]+s[2]].copy(),
+            s0_dataset_shape_voxels=np.array(vol.shape),
+            s0_voxel_size_zyx=[1.0, 1.0, 1.0],
+            s0_translation_zyx=[0.5, 0.5, 0.5],
+            dtype=vol.dtype,
+            num_lods=3,
+            existing_factors=set(),
+            output_zarr_path=out_path,
+            downsample_func=downsample_labels_3d,
+            out_chunk_shape_voxels=(4, 4, 4),
+            roi_origin_voxels=np.array([3, 3, 3]),
+            roi_shape_voxels=np.array([15, 15, 15]),
+            alignment_mode="halo",
+        )
+        # Output region: [0, 20) rounded outward, but dataset only has 18.
+        # Builder must handle this without crashing.
+        import zarr
+        s1 = zarr.open_array(os.path.join(out_path, "s1"), mode="r")
+        # s1 covers output region 20 voxels / 2 = 10 voxels
+        assert s1.shape == (10, 10, 10)
+
+
 class TestUnalignedRoiSnap:
     """When the ROI is unaligned, snap mode drops up to max_factor-1
     voxels per edge; downsampled output covers the snapped region only."""

From 6362a0bc68bb522ec63b8ccc8294e8f79744fd5e Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Fri, 12 Jun 2026 15:14:29 -0400
Subject: [PATCH 03/19] Symlink-safe pyramid cleanup + change
 pyramid_alignment_mode default to halo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two fixes from review:

1. pyramid_alignment_mode default: snap -> halo. I shipped snap by
   mistake; the original design decision was halo (no data loss at
   unaligned ROI edges, even though reads extend beyond the ROI into
   surrounding dataset).

2. Symlink-safe cleanup. shutil.rmtree on its own does the right thing
   on modern Python (unlinks symlinks rather than following them into
   the target), VERIFIED by an in-test experiment, BUT given the
   pyramid's s0 is a symlink to the USER'S INPUT DATA, defensive coding
   matters here — worst case if shutil.rmtree ever regressed would be
   destroying user data.

   New _safely_remove_pyramid() walks the tree top-down, explicitly
   unlinks every symlink (including the s0 link) before descending,
   then rmdir's the empty real directories from the bottom up.
   Refuses to act when the pyramid root itself is somehow a symlink.

Tests (2 new, 354 total):
- test_safe_remove_pyramid_preserves_symlink_target: pyramid has s0
  symlinked to a "user data" dir with critical content; after cleanup
  pyramid is gone, user data fully intact.
- test_safe_remove_refuses_to_delete_when_root_is_symlink: pyramid
  root path itself is a symlink to user data; cleanup refuses and
  the data survives.
---
 src/mesh_n_bone/meshify/meshify.py | 56 ++++++++++++++++++++++++++++--
 tests/test_pyramid_builder.py      | 55 +++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+), 2 deletions(-)

diff --git a/src/mesh_n_bone/meshify/meshify.py b/src/mesh_n_bone/meshify/meshify.py
index 939f54d..b74bdeb 100644
--- a/src/mesh_n_bone/meshify/meshify.py
+++ b/src/mesh_n_bone/meshify/meshify.py
@@ -461,6 +461,49 @@ def _write_assembly_memory_plan(output_directory, estimates, waves):
         json.dump(plan, f, indent=2)
 
 
+def _safely_remove_pyramid(pyramid_path):
+    """Delete the auto-built pyramid zarr without ever following symlinks.
+
+    The pyramid's ``s0`` is a symlink to the USER'S INPUT data on local
+    filesystems. ``shutil.rmtree`` does the right thing here on modern
+    Python (it unlinks symlinks rather than following them), but we
+    belt-and-braces the safety: walk top-down, unlink every symlink
+    explicitly, then ``rmtree`` whatever remains.
+    """
+    if not os.path.isdir(pyramid_path):
+        return
+    if os.path.islink(pyramid_path):
+        # The pyramid root ITSELF is somehow a symlink. Don't touch.
+        logger.warning(
+            "Refusing to delete %s: it is itself a symlink. Manual cleanup required.",
+            pyramid_path,
+        )
+        return
+    # Walk top-down, unlinking symlinks explicitly. Anything left is a
+    # real directory or file the pyramid built itself.
+    for root, dirs, files in os.walk(pyramid_path, topdown=True, followlinks=False):
+        # Files including symlinks-to-files
+        for name in files:
+            os.unlink(os.path.join(root, name))
+        # Directories: handle symlinks-to-dirs here, prune from descent
+        survivors = []
+        for name in dirs:
+            full = os.path.join(root, name)
+            if os.path.islink(full):
+                os.unlink(full)
+                # Don't recurse into it — already unlinked from filesystem
+            else:
+                survivors.append(name)
+        dirs[:] = survivors
+    # Now empty real dirs from the bottom up
+    for root, dirs, _files in os.walk(pyramid_path, topdown=False, followlinks=False):
+        try:
+            os.rmdir(root)
+        except OSError:
+            # Non-empty (shouldn't happen) or already gone — skip
+            pass
+
+
 def _estimate_block_target_mb_from_dask_config(
     config_path="dask-config.yaml",
     fallback_mb=128,
@@ -883,7 +926,7 @@ def __init__(
         multires_strategy: str = "downsample",
         use_existing_scales: bool = True,
         keep_intermediate_scales: bool = False,
-        pyramid_alignment_mode: str = "snap",
+        pyramid_alignment_mode: str = "halo",
         decimation_factor: int = 6,
         decimation_aggressiveness: int = 7,
         delete_decimated_meshes: bool = True,
@@ -2348,10 +2391,19 @@ def _pack(workers, config):
         # Pyramid scale cache: keep when explicitly requested for reuse,
         # otherwise tear down so the output dir doesn't accumulate the
         # downsampled-segmentation zarrs.
+        #
+        # CRITICAL: s0 inside the pyramid is a symlink to the user's
+        # original input array. shutil.rmtree on its own does the right
+        # thing (unlinks the symlink, doesn't follow it into the target),
+        # but we explicitly unlink every symlink first BEFORE the
+        # recursive tree walk runs — defensive coding around a
+        # destructive operation. Worst case if shutil.rmtree ever
+        # regressed: it would walk into the user's input and delete
+        # their data.
         pyramid_path = os.path.join(self.output_directory, "_intermediate_scales.zarr")
         if os.path.isdir(pyramid_path) and not self.keep_intermediate_scales:
             with Timing_Messager("Cleaning up intermediate scale zarr", logger):
-                shutil.rmtree(pyramid_path, ignore_errors=True)
+                _safely_remove_pyramid(pyramid_path)
 
         logger.info("Multires pipeline complete")
 
diff --git a/tests/test_pyramid_builder.py b/tests/test_pyramid_builder.py
index 47795ba..2e0847d 100644
--- a/tests/test_pyramid_builder.py
+++ b/tests/test_pyramid_builder.py
@@ -331,6 +331,61 @@ def test_existing_factor_skipped(self, tmp_path):
         assert os.path.isdir(os.path.join(out_path, "s2"))
 
 
+class TestSymlinkSafeCleanup:
+    """The pyramid's ``s0`` is a symlink to the user's input data.
+    Cleanup (whether via shutil.rmtree or our explicit walk) must
+    unlink the symlink — NEVER follow it into the input."""
+
+    def test_safe_remove_pyramid_preserves_symlink_target(self, tmp_path):
+        from mesh_n_bone.meshify.meshify import _safely_remove_pyramid
+
+        # Build a fake "source" directory with critical data
+        src = tmp_path / "important_source"
+        src.mkdir()
+        (src / "DO_NOT_DELETE.txt").write_text("user's input data")
+        (src / "subdir").mkdir()
+        (src / "subdir" / "deeper.txt").write_text("also critical")
+
+        # Build a fake pyramid with s0 -> src symlink
+        pyramid = tmp_path / "pyramid.zarr"
+        pyramid.mkdir()
+        (pyramid / ".zattrs").write_text("{}")
+        os.symlink(str(src), str(pyramid / "s0"))
+        (pyramid / "s1").mkdir()
+        (pyramid / "s1" / "data.bin").write_text("downsampled")
+
+        # Sanity: symlink works
+        assert (pyramid / "s0" / "DO_NOT_DELETE.txt").exists()
+
+        # Tear down via our helper
+        _safely_remove_pyramid(str(pyramid))
+
+        # Pyramid is gone
+        assert not pyramid.exists()
+        # Source SURVIVES with all contents
+        assert src.exists()
+        assert (src / "DO_NOT_DELETE.txt").read_text() == "user's input data"
+        assert (src / "subdir" / "deeper.txt").read_text() == "also critical"
+
+    def test_safe_remove_refuses_to_delete_when_root_is_symlink(self, tmp_path):
+        from mesh_n_bone.meshify.meshify import _safely_remove_pyramid
+
+        src = tmp_path / "real_data"
+        src.mkdir()
+        (src / "data.txt").write_text("real")
+
+        # Hostile setup: pyramid_path itself is a symlink
+        pyramid_link = tmp_path / "pyramid.zarr"
+        os.symlink(str(src), str(pyramid_link))
+
+        # Our helper refuses to delete a symlinked root
+        _safely_remove_pyramid(str(pyramid_link))
+
+        # Real data still exists
+        assert (src / "data.txt").read_text() == "real"
+        # The symlink itself may or may not be removed, but the target survives.
+
+
 class TestUnalignedRoiHalo:
     """When ROI is unaligned and alignment_mode='halo', the pyramid
     builder reads beyond the ROI to complete boundary cubes. Output

From ca37acd7f3777a8b9eea9ee8cab109f9dc65c622 Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Fri, 12 Jun 2026 15:19:12 -0400
Subject: [PATCH 04/19] Parallel pyramid cleanup + rename to
 delete_intermediate_scales

Changes from review:

1. Rename keep_intermediate_scales -> delete_intermediate_scales (default
   True). Matches existing delete_decimated_meshes naming + default
   direction.

2. _safely_remove_pyramid now parallelizes top-level subdir removal via
   ThreadPoolExecutor sized by self.num_workers. shutil.rmtree releases
   the GIL during os.unlink/os.rmdir syscalls, so threads scale on
   shared filesystems where the metadata server is the bottleneck.

   Sequential rmtree on Lustre/NFS for a pyramid with ~100k chunk files
   is 5-30 min (~100-1000 unlinks/sec single-thread). Parallel cleanup
   at num_workers=10+ brings that to minutes.

   Symlink safety preserved: top-level symlinks (the s0 link to user
   input) are unlinked BEFORE any parallel rmtree spawns, so worker
   threads never receive a path that points into user data.

Tests + README updated.
---
 README.md                          |  7 ++-
 src/mesh_n_bone/meshify/meshify.py | 88 +++++++++++++++++++-----------
 tests/test_integration_meshify.py  |  6 +-
 3 files changed, 64 insertions(+), 37 deletions(-)

diff --git a/README.md b/README.md
index 23ef2ba..45bce8b 100644
--- a/README.md
+++ b/README.md
@@ -142,9 +142,10 @@ use_existing_scales: true        # Set false to FORCE in-worker downsampling at
 # into `{output_directory}/_intermediate_scales.zarr` in a single
 # parallel super-chunk pass over s0. Each LOD then reads from the
 # pre-built pyramid instead of repeatedly re-reading s0.
-keep_intermediate_scales: false  # Keep the auto-built `_intermediate_scales.zarr` after the
-                                 # pipeline finishes. Useful for re-running with different mesh
-                                 # params on the same data; disk-heavy on big datasets. (default: false)
+delete_intermediate_scales: true # Delete the auto-built `_intermediate_scales.zarr` once the
+                                 # multires pipeline finishes. Set false to keep the pyramid
+                                 # around for re-running with different mesh params on the same
+                                 # data. (default: true)
 pyramid_alignment_mode: snap     # "snap" rounds an unaligned ROI INWARD to multiples of the
                                  # max per-axis factor (drops up to max_factor-1 voxels per edge);
                                  # "halo" rounds OUTWARD and reads beyond the ROI to complete the
diff --git a/src/mesh_n_bone/meshify/meshify.py b/src/mesh_n_bone/meshify/meshify.py
index b74bdeb..1ea8eb7 100644
--- a/src/mesh_n_bone/meshify/meshify.py
+++ b/src/mesh_n_bone/meshify/meshify.py
@@ -461,48 +461,72 @@ def _write_assembly_memory_plan(output_directory, estimates, waves):
         json.dump(plan, f, indent=2)
 
 
-def _safely_remove_pyramid(pyramid_path):
-    """Delete the auto-built pyramid zarr without ever following symlinks.
+def _safely_remove_pyramid(pyramid_path, num_workers=8):
+    """Delete the auto-built pyramid zarr in parallel without ever following symlinks.
 
     The pyramid's ``s0`` is a symlink to the USER'S INPUT data on local
-    filesystems. ``shutil.rmtree`` does the right thing here on modern
-    Python (it unlinks symlinks rather than following them), but we
-    belt-and-braces the safety: walk top-down, unlink every symlink
-    explicitly, then ``rmtree`` whatever remains.
+    filesystems — under no circumstances should the cleanup follow it
+    into the target. We:
+
+    1. Refuse to act when ``pyramid_path`` itself is a symlink.
+    2. Unlink every symlink found at the pyramid root explicitly,
+       BEFORE any recursive walk. (The recursive walk wouldn't follow
+       them either, but this is belt-and-braces for a destructive op.)
+    3. Spawn a thread pool to ``shutil.rmtree`` the real subdirectories
+       (one per pyramid level: s1, s2, ...) in parallel.
+
+    Parallelism: ``shutil.rmtree`` releases the GIL during ``os.unlink``
+    /``os.rmdir`` syscalls, so threads scale on shared filesystems
+    (Lustre/NFS/GPFS) where metadata operations are the bottleneck.
+    On a single-node SSD this still helps for trees with thousands of
+    chunks per array. Caller-tunable ``num_workers``.
     """
     if not os.path.isdir(pyramid_path):
         return
     if os.path.islink(pyramid_path):
-        # The pyramid root ITSELF is somehow a symlink. Don't touch.
         logger.warning(
             "Refusing to delete %s: it is itself a symlink. Manual cleanup required.",
             pyramid_path,
         )
         return
-    # Walk top-down, unlinking symlinks explicitly. Anything left is a
-    # real directory or file the pyramid built itself.
-    for root, dirs, files in os.walk(pyramid_path, topdown=True, followlinks=False):
-        # Files including symlinks-to-files
-        for name in files:
-            os.unlink(os.path.join(root, name))
-        # Directories: handle symlinks-to-dirs here, prune from descent
-        survivors = []
-        for name in dirs:
-            full = os.path.join(root, name)
-            if os.path.islink(full):
-                os.unlink(full)
-                # Don't recurse into it — already unlinked from filesystem
-            else:
-                survivors.append(name)
-        dirs[:] = survivors
-    # Now empty real dirs from the bottom up
-    for root, dirs, _files in os.walk(pyramid_path, topdown=False, followlinks=False):
+
+    # Top-level inventory. Crucially: unlink symlinks first so the
+    # parallel rmtree below NEVER receives a path that points into the
+    # user's data.
+    real_dirs = []
+    real_files = []
+    for entry in os.scandir(pyramid_path):
+        if entry.is_symlink():
+            os.unlink(entry.path)
+        elif entry.is_dir(follow_symlinks=False):
+            real_dirs.append(entry.path)
+        else:
+            real_files.append(entry.path)
+
+    # Small metadata files (.zattrs, .zgroup, info, etc.) — inline.
+    for f in real_files:
         try:
-            os.rmdir(root)
+            os.unlink(f)
         except OSError:
-            # Non-empty (shouldn't happen) or already gone — skip
             pass
 
+    # Parallel rmtree of pyramid level subdirectories
+    if real_dirs:
+        from concurrent.futures import ThreadPoolExecutor
+        max_workers = min(num_workers, len(real_dirs)) if num_workers else 1
+        with ThreadPoolExecutor(max_workers=max_workers) as ex:
+            # Use ignore_errors=True so a transient failure in one subtree
+            # doesn't abort cleanup of the rest. We don't care about residual
+            # files in the pyramid — they're never read again — but we DO want
+            # to clean up as much as possible.
+            list(ex.map(lambda p: shutil.rmtree(p, ignore_errors=True), real_dirs))
+
+    # Final rmdir of the now-empty pyramid root
+    try:
+        os.rmdir(pyramid_path)
+    except OSError:
+        pass
+
 
 def _estimate_block_target_mb_from_dask_config(
     config_path="dask-config.yaml",
@@ -925,7 +949,7 @@ def __init__(
         downsample_method: str = "mode",
         multires_strategy: str = "downsample",
         use_existing_scales: bool = True,
-        keep_intermediate_scales: bool = False,
+        delete_intermediate_scales: bool = True,
         pyramid_alignment_mode: str = "halo",
         decimation_factor: int = 6,
         decimation_aggressiveness: int = 7,
@@ -1100,7 +1124,7 @@ def __init__(
         self.input_path = input_path
         self.multires_strategy = multires_strategy
         self.use_existing_scales = use_existing_scales
-        self.keep_intermediate_scales = keep_intermediate_scales
+        self.delete_intermediate_scales = delete_intermediate_scales
         self.pyramid_alignment_mode = pyramid_alignment_mode
         self.decimation_factor = decimation_factor
         self.decimation_aggressiveness = decimation_aggressiveness
@@ -2401,9 +2425,11 @@ def _pack(workers, config):
         # regressed: it would walk into the user's input and delete
         # their data.
         pyramid_path = os.path.join(self.output_directory, "_intermediate_scales.zarr")
-        if os.path.isdir(pyramid_path) and not self.keep_intermediate_scales:
+        if os.path.isdir(pyramid_path) and self.delete_intermediate_scales:
             with Timing_Messager("Cleaning up intermediate scale zarr", logger):
-                _safely_remove_pyramid(pyramid_path)
+                _safely_remove_pyramid(
+                    pyramid_path, num_workers=max(1, int(self.num_workers)),
+                )
 
         logger.info("Multires pipeline complete")
 
diff --git a/tests/test_integration_meshify.py b/tests/test_integration_meshify.py
index 354fdae..cd57baa 100644
--- a/tests/test_integration_meshify.py
+++ b/tests/test_integration_meshify.py
@@ -525,7 +525,7 @@ def test_auto_builds_pyramid_for_single_scale_input(self, tmp_output_dir):
             do_multires=True,
             num_lods=3,
             multires_strategy="downsample",
-            keep_intermediate_scales=True,  # keep for inspection
+            delete_intermediate_scales=False,  # keep for inspection
             target_faces_per_lod0_chunk=200,
             check_mesh_validity=False,
             do_analysis=False,
@@ -567,7 +567,7 @@ def _spy(self_, output_mesh_dir, downsample_factor=None,
             )
             assert "_intermediate_scales.zarr" in calls[k]["input_dataset_path"]
 
-    def test_keep_intermediate_scales_false_cleans_up(self, tmp_output_dir):
+    def test_delete_intermediate_scales_default_cleans_up(self, tmp_output_dir):
         s0_path, _ = self._create_single_scale_ome_zarr(tmp_output_dir)
         output_dir = os.path.join(tmp_output_dir, "out_autobuild_cleanup")
         m = Meshify(
@@ -577,7 +577,7 @@ def test_keep_intermediate_scales_false_cleans_up(self, tmp_output_dir):
             do_multires=True,
             num_lods=3,
             multires_strategy="downsample",
-            keep_intermediate_scales=False,  # default: clean up
+            delete_intermediate_scales=True,  # default: clean up
             target_faces_per_lod0_chunk=200,
             check_mesh_validity=False,
             do_analysis=False,

From 182489e59b9d055b08946da8610d84e53dbe98a9 Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Fri, 12 Jun 2026 15:42:03 -0400
Subject: [PATCH 05/19] Chunk-file-level parallel pyramid cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces top-level-subdir-only parallelism with file-level parallelism
following cellmap-analyze's pattern (dask_util.delete_tmp_dir_blockwise).
Walks the tree once collecting all real files, partitions across
num_workers threads via ThreadPoolExecutor, each thread os.unlinks its
batch. Empty real dirs are removed bottom-up afterward.

Symlinks are unlinked inline during the walk before any worker
threads spawn — the s0 link to user input is never handed to a
parallel worker.

Adds test_safe_remove_handles_many_chunk_files exercising 500+ chunk-
like files in a flat-zarr-v2 layout.
---
 src/mesh_n_bone/meshify/meshify.py | 107 +++++++++++++++++------------
 tests/test_pyramid_builder.py      |  26 +++++++
 2 files changed, 90 insertions(+), 43 deletions(-)

diff --git a/src/mesh_n_bone/meshify/meshify.py b/src/mesh_n_bone/meshify/meshify.py
index 1ea8eb7..b651f89 100644
--- a/src/mesh_n_bone/meshify/meshify.py
+++ b/src/mesh_n_bone/meshify/meshify.py
@@ -462,24 +462,26 @@ def _write_assembly_memory_plan(output_directory, estimates, waves):
 
 
 def _safely_remove_pyramid(pyramid_path, num_workers=8):
-    """Delete the auto-built pyramid zarr in parallel without ever following symlinks.
+    """Delete the auto-built pyramid zarr with chunk-level parallelism.
 
     The pyramid's ``s0`` is a symlink to the USER'S INPUT data on local
     filesystems — under no circumstances should the cleanup follow it
-    into the target. We:
+    into the target. Process:
 
     1. Refuse to act when ``pyramid_path`` itself is a symlink.
-    2. Unlink every symlink found at the pyramid root explicitly,
-       BEFORE any recursive walk. (The recursive walk wouldn't follow
-       them either, but this is belt-and-braces for a destructive op.)
-    3. Spawn a thread pool to ``shutil.rmtree`` the real subdirectories
-       (one per pyramid level: s1, s2, ...) in parallel.
-
-    Parallelism: ``shutil.rmtree`` releases the GIL during ``os.unlink``
-    /``os.rmdir`` syscalls, so threads scale on shared filesystems
-    (Lustre/NFS/GPFS) where metadata operations are the bottleneck.
-    On a single-node SSD this still helps for trees with thousands of
-    chunks per array. Caller-tunable ``num_workers``.
+    2. Walk the tree top-down WITHOUT following symlinks. Unlink every
+       symlink encountered immediately (the s0 link disappears here
+       BEFORE any parallel worker can race for it). Collect real files
+       (zarr chunk files + metadata) and real directories separately.
+    3. Partition the collected real files across ``num_workers`` threads;
+       each thread ``os.unlink``s its batch. This is where the chunk-
+       level parallelism happens — on shared filesystems
+       (Lustre/NFS/GPFS) where unlink throughput is metadata-server-
+       bound, parallel threads scale linearly up to the MDS cap.
+    4. Remove now-empty real directories bottom-up.
+
+    Adapted from cellmap-analyze's depth-based blockwise delete pattern
+    in dask_util.py:delete_tmp_dir_blockwise.
     """
     if not os.path.isdir(pyramid_path):
         return
@@ -490,42 +492,61 @@ def _safely_remove_pyramid(pyramid_path, num_workers=8):
         )
         return
 
-    # Top-level inventory. Crucially: unlink symlinks first so the
-    # parallel rmtree below NEVER receives a path that points into the
-    # user's data.
-    real_dirs = []
+    # Step 1: walk and inventory. Unlink symlinks IMMEDIATELY at each
+    # level so the parallel batch below never sees a path that could
+    # follow into user data.
     real_files = []
-    for entry in os.scandir(pyramid_path):
-        if entry.is_symlink():
-            os.unlink(entry.path)
-        elif entry.is_dir(follow_symlinks=False):
-            real_dirs.append(entry.path)
-        else:
-            real_files.append(entry.path)
+    real_dirs = []
+    for root, dirs, files in os.walk(pyramid_path, topdown=True, followlinks=False):
+        real_dirs.append(root)
+        # Symlinks may appear as either files or dirs depending on target type.
+        surviving_files = []
+        for name in files:
+            full = os.path.join(root, name)
+            if os.path.islink(full):
+                try:
+                    os.unlink(full)
+                except OSError:
+                    pass
+            else:
+                surviving_files.append(full)
+        real_files.extend(surviving_files)
+        # Prune symlinked directories from descent + unlink them right now
+        surviving_dirs = []
+        for name in dirs:
+            full = os.path.join(root, name)
+            if os.path.islink(full):
+                try:
+                    os.unlink(full)
+                except OSError:
+                    pass
+            else:
+                surviving_dirs.append(name)
+        dirs[:] = surviving_dirs
 
-    # Small metadata files (.zattrs, .zgroup, info, etc.) — inline.
-    for f in real_files:
+    # Step 2: parallel unlink of all real files (the chunk-level work)
+    def _safe_unlink(path):
         try:
-            os.unlink(f)
+            os.unlink(path)
         except OSError:
             pass
 
-    # Parallel rmtree of pyramid level subdirectories
-    if real_dirs:
-        from concurrent.futures import ThreadPoolExecutor
-        max_workers = min(num_workers, len(real_dirs)) if num_workers else 1
-        with ThreadPoolExecutor(max_workers=max_workers) as ex:
-            # Use ignore_errors=True so a transient failure in one subtree
-            # doesn't abort cleanup of the rest. We don't care about residual
-            # files in the pyramid — they're never read again — but we DO want
-            # to clean up as much as possible.
-            list(ex.map(lambda p: shutil.rmtree(p, ignore_errors=True), real_dirs))
-
-    # Final rmdir of the now-empty pyramid root
-    try:
-        os.rmdir(pyramid_path)
-    except OSError:
-        pass
+    if real_files:
+        max_workers = max(1, int(num_workers))
+        if max_workers > 1 and len(real_files) > max_workers:
+            from concurrent.futures import ThreadPoolExecutor
+            with ThreadPoolExecutor(max_workers=max_workers) as ex:
+                list(ex.map(_safe_unlink, real_files))
+        else:
+            for f in real_files:
+                _safe_unlink(f)
+
+    # Step 3: rmdir empty real directories bottom-up
+    for d in reversed(real_dirs):
+        try:
+            os.rmdir(d)
+        except OSError:
+            pass
 
 
 def _estimate_block_target_mb_from_dask_config(
diff --git a/tests/test_pyramid_builder.py b/tests/test_pyramid_builder.py
index 2e0847d..832bd6b 100644
--- a/tests/test_pyramid_builder.py
+++ b/tests/test_pyramid_builder.py
@@ -367,6 +367,32 @@ def test_safe_remove_pyramid_preserves_symlink_target(self, tmp_path):
         assert (src / "DO_NOT_DELETE.txt").read_text() == "user's input data"
         assert (src / "subdir" / "deeper.txt").read_text() == "also critical"
 
+    def test_safe_remove_handles_many_chunk_files(self, tmp_path):
+        """Cleanup must scale to many chunk-like files (zarr v2 flat layout)
+        — each pyramid sk array is a single directory containing thousands
+        of small files. _safely_remove_pyramid partitions them across the
+        thread pool so chunk-file unlinks are parallel, not serialised."""
+        from mesh_n_bone.meshify.meshify import _safely_remove_pyramid
+
+        pyramid = tmp_path / "pyramid.zarr"
+        pyramid.mkdir()
+        (pyramid / ".zattrs").write_text("{}")
+        # Make a "fake s1 array" with many flat chunk files (like zarr v2)
+        s1 = pyramid / "s1"
+        s1.mkdir()
+        n_files = 500
+        for i in range(n_files):
+            (s1 / f"0.0.{i}").write_text(str(i))
+        # And a "fake s2 array" with fewer files
+        s2 = pyramid / "s2"
+        s2.mkdir()
+        for i in range(50):
+            (s2 / f"0.0.{i}").write_text(str(i))
+
+        _safely_remove_pyramid(str(pyramid), num_workers=8)
+        # Everything gone
+        assert not pyramid.exists()
+
     def test_safe_remove_refuses_to_delete_when_root_is_symlink(self, tmp_path):
         from mesh_n_bone.meshify.meshify import _safely_remove_pyramid
 

From 80be9323a158d1d1269857fa8e7af2ca2608aa59 Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Fri, 12 Jun 2026 15:49:18 -0400
Subject: [PATCH 06/19] Write pyramid zarr via tensorstore instead of optional
 zarr package
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pyramid_builder._create_zarr_v2_array previously did 'import zarr', a
test-env-only dep in pixi.toml. In production (default pixi env) the
import failed and the auto-build silently fell back to in-worker
per-LOD downsampling — defeating the whole point of the feature.

Switched to tensorstore (a core dep) for zarr v2 writes. Same on-disk
layout (.zarray + flat chunk files), readable by the zarr library
when needed. Added a numpy-dtype -> tensorstore-dtype map for the
common label dtypes.

Verified: pyramid build now succeeds in the default pixi env with no
zarr installed (smoke test reads back .zarray + chunk files via plain
os.listdir).

Audited src/ for other test-env-only imports; only stranded one was
zarr — mmh3 only appears as cloudvolume's vendored copy, available
wherever cloudvolume is.
---
 src/mesh_n_bone/util/pyramid_builder.py | 68 ++++++++++++++++++++-----
 1 file changed, 56 insertions(+), 12 deletions(-)

diff --git a/src/mesh_n_bone/util/pyramid_builder.py b/src/mesh_n_bone/util/pyramid_builder.py
index cbabdc0..a14f328 100644
--- a/src/mesh_n_bone/util/pyramid_builder.py
+++ b/src/mesh_n_bone/util/pyramid_builder.py
@@ -392,23 +392,67 @@ def _process_super_chunk(sc_origin):
 # ---------------------------------------------------------------------------
 
 
+_TS_DTYPE_MAP = {
+    np.dtype("uint8"): "|u1",
+    np.dtype("uint16"): "<u2",
+    np.dtype("uint32"): "<u4",
+    np.dtype("uint64"): "<u8",
+    np.dtype("int8"): "|i1",
+    np.dtype("int16"): "<i2",
+    np.dtype("int32"): "<i4",
+    np.dtype("int64"): "<i8",
+    np.dtype("float32"): "<f4",
+    np.dtype("float64"): "<f8",
+}
+
+
 def _create_zarr_v2_array(path, shape, chunks, dtype):
-    """Create a zarr v2 array on disk and return a handle."""
-    import zarr
-    arr = zarr.open_array(
-        store=path, mode="w", shape=shape, chunks=chunks, dtype=dtype,
-        fill_value=0,
-    )
-    return arr
+    """Create a zarr v2 array on disk via tensorstore and return a handle.
+
+    Uses tensorstore (already a core dep) instead of the optional ``zarr``
+    package so the pyramid builder works in any pixi env that has the
+    base mesh-n-bone install.
+    """
+    import tensorstore as ts
+    if os.path.exists(path):
+        shutil.rmtree(path)
+    dt = np.dtype(dtype)
+    ts_dtype = _TS_DTYPE_MAP.get(dt)
+    if ts_dtype is None:
+        raise ValueError(
+            f"Unsupported dtype for pyramid build: {dt}. "
+            f"Add it to _TS_DTYPE_MAP."
+        )
+    spec = {
+        "driver": "zarr",  # zarr v2
+        "kvstore": {"driver": "file", "path": path},
+        "metadata": {
+            "shape": list(shape),
+            "chunks": list(chunks),
+            "dtype": ts_dtype,
+            "compressor": None,
+            "fill_value": 0,
+            "order": "C",
+        },
+        "create": True,
+        "delete_existing": True,
+    }
+    return ts.open(spec).result()
 
 
 def _write_zarr_v2_region(arr, origin_voxels, data):
-    """Write ``data`` into ``arr`` at the given origin (zyx voxel coords)."""
+    """Write ``data`` into ``arr`` at the given origin (zyx voxel coords).
+
+    ``arr`` is a tensorstore handle from ``_create_zarr_v2_array``.
+    """
     z, y, x = origin_voxels.tolist()
-    zE, yE, xE = z + data.shape[0], y + data.shape[1], x + data.shape[2]
-    # Clip in case the block extends past array bounds (boundary chunks)
-    zE = min(zE, arr.shape[0]); yE = min(yE, arr.shape[1]); xE = min(xE, arr.shape[2])
-    arr[z:zE, y:yE, x:xE] = data[: zE - z, : yE - y, : xE - x]
+    shape = arr.shape
+    zE = min(z + data.shape[0], shape[0])
+    yE = min(y + data.shape[1], shape[1])
+    xE = min(x + data.shape[2], shape[2])
+    # Clip the data block to the array bounds (boundary chunks at the edge)
+    clipped = data[: zE - z, : yE - y, : xE - x]
+    arr[z:zE, y:yE, x:xE].write(clipped).result()
 
 
 def _try_symlink_s0(pyramid_path, s0_source_path):

From 92af6b2e283f0619397d08f8bbc04992c8bce135 Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Fri, 12 Jun 2026 16:00:22 -0400
Subject: [PATCH 07/19] Parallelize super-chunk pyramid build + flush logs
 eagerly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two fixes from the c-elegans cell run staring at one log line for
minutes:

1. Pyramid build was sequential. build_missing_pyramid_levels iterated
   the super-chunk grid one at a time, reading ~134 MB of s0 per
   super-chunk from Lustre. For 27 super-chunks on the c-elegans cell
   volume, that's serialised Lustre I/O — minutes per chunk. Switched
   to a ThreadPoolExecutor sized by self.num_workers (threaded
   through from Meshify, which gets it from -n on the CLI).
   tensorstore reads release the GIL so threads scale on Lustre
   metadata + bandwidth bottlenecks. Adds periodic progress logs
   ('K/N super-chunks done') every 5% so you can watch it move.

2. Logs were block-buffered when stderr was redirected to a file
   (LSF case). util/logging.py now:
   - reconfigures sys.stdout/stderr to line_buffering=True
   - installs a custom _FlushingStreamHandler that calls self.flush()
     + self.stream.flush() after each emit
   - removes any prior default handlers first so our handler wins

   tail -f on the LSF output.log now updates in real time instead of
   waiting for the OS buffer to fill.

Same num_workers value drives the pyramid super-chunk parallelism AND
the pyramid cleanup parallelism, so -n N gives you N-way parallelism
across both ends of the pyramid lifecycle.
---
 src/mesh_n_bone/meshify/meshify.py      |  1 +
 src/mesh_n_bone/util/logging.py         | 42 +++++++++++++++++++++++-
 src/mesh_n_bone/util/pyramid_builder.py | 43 +++++++++++++++++++++++--
 3 files changed, 82 insertions(+), 4 deletions(-)

diff --git a/src/mesh_n_bone/meshify/meshify.py b/src/mesh_n_bone/meshify/meshify.py
index b651f89..d9fdd54 100644
--- a/src/mesh_n_bone/meshify/meshify.py
+++ b/src/mesh_n_bone/meshify/meshify.py
@@ -2246,6 +2246,7 @@ def _s0_reader(origin_voxels, shape_voxels):
             out_chunk_shape_voxels=tuple(out_chunk_shape),
             alignment_mode=self.pyramid_alignment_mode,
             s0_source_path=self._dataset_path,
+            num_workers=max(1, int(self.num_workers)),
         )
 
         # Return mapping of newly-built factors -> path within the pyramid
diff --git a/src/mesh_n_bone/util/logging.py b/src/mesh_n_bone/util/logging.py
index 5defcb7..d9d625d 100644
--- a/src/mesh_n_bone/util/logging.py
+++ b/src/mesh_n_bone/util/logging.py
@@ -11,7 +11,47 @@
 from subprocess import Popen, PIPE, TimeoutExpired, run as subprocess_run
 from datetime import datetime
 
-logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)
+# Force stdout/stderr to be line-buffered so each log line is flushed
+# immediately. When the pipeline runs under LSF/dask with output
+# redirected to a file, the default block-buffering on stderr swallows
+# progress logs until the buffer fills (hundreds of KiB), which can
+# mean no visible progress for a long time even when the worker is
+# making steady headway.
+for _stream in (sys.stdout, sys.stderr):
+    try:
+        _stream.reconfigure(line_buffering=True)
+    except (AttributeError, ValueError):
+        pass
+
+
+class _FlushingStreamHandler(logging.StreamHandler):
+    """StreamHandler that flushes after every emit.
+
+    The base StreamHandler's emit() already calls self.flush(), but only
+    on the Python-level stream; when output is piped to a file via the
+    shell, the OS-level buffer for the file descriptor may still hold
+    the data. We force flush on every record to make sure tail -f works.
+    """
+
+    def emit(self, record):
+        super().emit(record)
+        try:
+            self.flush()
+            self.stream.flush()
+        except (AttributeError, ValueError, OSError):
+            pass
+
+
+_root = logging.getLogger()
+if not any(isinstance(h, _FlushingStreamHandler) for h in _root.handlers):
+    # Remove any default handlers installed earlier so our flushing one wins
+    for _h in list(_root.handlers):
+        _root.removeHandler(_h)
+    _h = _FlushingStreamHandler()
+    _h.setFormatter(logging.Formatter("%(levelname)s:%(message)s"))
+    _root.addHandler(_h)
+    _root.setLevel(logging.INFO)
+
 logger = logging.getLogger(__name__)
 
 
diff --git a/src/mesh_n_bone/util/pyramid_builder.py b/src/mesh_n_bone/util/pyramid_builder.py
index a14f328..42eb12e 100644
--- a/src/mesh_n_bone/util/pyramid_builder.py
+++ b/src/mesh_n_bone/util/pyramid_builder.py
@@ -258,6 +258,7 @@ def build_missing_pyramid_levels(
     zarr_format: int = 2,
     s0_source_path: str | None = None,
     dispatch=None,
+    num_workers: int = 1,
 ) -> str:
     """Build missing OME-NGFF pyramid levels and return the group path.
 
@@ -373,11 +374,47 @@ def _process_super_chunk(sc_origin):
             local_origin = ds_origin - (out_origin // f)
             _write_zarr_v2_region(arr, local_origin, ds_block)
 
-    if dispatch is None:
-        for sc in sc_grid:
+    n_chunks = len(sc_grid)
+    if dispatch is not None:
+        dispatch(_process_super_chunk, sc_grid)
+    elif num_workers > 1 and n_chunks > 1:
+        from concurrent.futures import ThreadPoolExecutor
+        import threading
+        workers = min(num_workers, n_chunks)
+        logger.info(
+            "pyramid_builder: dispatching %d super-chunks across %d threads "
+            "(super_chunk_shape=%s s0 voxels)",
+            n_chunks, workers, super_chunk_shape.tolist(),
+        )
+        done = [0]
+        report_every = max(1, n_chunks // 20)
+        lock = threading.Lock()
+
+        def _worker(sc):
             _process_super_chunk(sc)
+            with lock:
+                done[0] += 1
+                if done[0] % report_every == 0 or done[0] == n_chunks:
+                    logger.info(
+                        "pyramid_builder: %d/%d super-chunks done", done[0], n_chunks,
+                    )
+
+        with ThreadPoolExecutor(max_workers=workers) as ex:
+            list(ex.map(_worker, sc_grid))
     else:
-        dispatch(_process_super_chunk, sc_grid)
+        logger.info(
+            "pyramid_builder: processing %d super-chunks sequentially "
+            "(super_chunk_shape=%s s0 voxels)",
+            n_chunks, super_chunk_shape.tolist(),
+        )
+        report_every = max(1, n_chunks // 20)
+        for i, sc in enumerate(sc_grid):
+            _process_super_chunk(sc)
+            done = i + 1
+            if done % report_every == 0 or done == n_chunks:
+                logger.info(
+                    "pyramid_builder: %d/%d super-chunks done", done, n_chunks,
+                )
 
     logger.info(
         "pyramid_builder: built %d new scales at %s (factors=%s)",

From afad2721922d6ef3a8249fad10a853385b255c78 Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Fri, 12 Jun 2026 16:08:59 -0400
Subject: [PATCH 08/19] Memory-aware sizing for pyramid super-chunks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously the super-chunk size was hardcoded — out_chunk capped at
256 voxels per axis × max_factor (typically 8) = 2048-voxel super-chunk
edges. At uint64 labels that's a 2048^3 * 8 byte = 68 GB read per
worker per task. Easy OOM.

Now sizes super_chunk from per-worker memory budget the same way
assembly does:
  per_worker_budget_MB = _estimate_block_target_mb_from_dask_config(
                            'dask-config.yaml')
  per_thread_budget_bytes = per_worker_budget_MB * 1e6 / num_workers
                            / amplification_factor

amplification = 1 + sum(1/f^3 for factors > 1) * 2 ≈ 1.3 (cumulative
LOD output buffers + scratch). Then super-chunk side =
floor(cuberoot(per_thread_budget / itemsize)), rounded down to a
multiple of max_factor per axis.

For a worker with 15 GB / 24 threads = 625 MB per-thread, uint64
labels, max_factor=8:
  super_chunk_side ≈ floor((625e6/8)^(1/3)) = ~427 voxels → 424 (mult
  of 8), out_chunk = 53 voxels.

Logs the chosen sizes + per-task read size so it's visible up-front
whether the values look sensible for the run.
---
 src/mesh_n_bone/meshify/meshify.py | 51 +++++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 11 deletions(-)

diff --git a/src/mesh_n_bone/meshify/meshify.py b/src/mesh_n_bone/meshify/meshify.py
index d9fdd54..bcceef2 100644
--- a/src/mesh_n_bone/meshify/meshify.py
+++ b/src/mesh_n_bone/meshify/meshify.py
@@ -2205,22 +2205,51 @@ def _s0_reader(origin_voxels, shape_voxels):
             )
         downsample_func = downsample_dispatch[self.downsample_method]
 
-        # Choose a reasonable output chunk shape: largest multiple of source
-        # chunk shape that's also a multiple of max factor, capped at 256.
+        # Choose out_chunk_shape memory-aware. Each thread holds one super-chunk
+        # of s0 voxels in memory simultaneously, plus the per-LOD downsamples
+        # (cumulative ~1.14x of the s0 read for factors 2/4/8). Per-thread
+        # budget = per-worker dask budget / num_workers; cap super-chunk
+        # voxel count so the read + downsample fit comfortably (50% headroom).
         src_chunk = np.asarray(seg_arr.chunk_shape, dtype=np.int64)
         max_factor = np.max(np.array([f for _, f in needed_uniform]), axis=0)
-        # Round source chunk to be a multiple of max_factor; cap at 256
+        itemsize = int(np.dtype(seg_arr.data.dtype).itemsize)
+        # Amplification: downsample at every level holds an output buffer +
+        # an intermediate working buffer roughly equal to its own size.
+        # Sum over LODs: 1 (read) + sum(1/f^3) for k>=1, plus ~2x scratch.
+        amp_per_lod = sum(1.0 / (mf ** 3) for mf in (2, 4, 8))  # 0.142
+        amplification = 1.0 + amp_per_lod * 2.0  # ~1.3x
+
+        worker_mb = _estimate_block_target_mb_from_dask_config(
+            "dask-config.yaml",  # resolved relative to cwd (the config dir)
+            fallback_mb=2048,    # 2 GB sane default if dask config missing
+        )
+        per_thread_bytes = (
+            worker_mb * 1e6 / max(1, int(self.num_workers)) / amplification
+        )
+        max_super_chunk_voxels = int(per_thread_bytes / itemsize)
+
+        # Largest cube of voxels that fits and is a multiple of max_factor
+        # along each axis. cube_side = floor(cuberoot(max_voxels))
+        target_super_side = max(1, int(max_super_chunk_voxels ** (1.0 / 3.0)))
         out_chunk_shape = []
         for ax in range(3):
-            ch = int(src_chunk[ax])
             mf = int(max_factor[ax])
-            # round up to multiple of mf, then cap
-            ch = max(mf, (ch + mf - 1) // mf * mf)
-            ch = min(ch, 256)
-            # ensure still a multiple of mf after the cap
-            ch = (ch // mf) * mf
-            ch = max(ch, mf)
-            out_chunk_shape.append(ch)
+            super_side = (target_super_side // mf) * mf
+            super_side = max(mf, super_side)  # at least one max_factor cube
+            out_side = max(1, super_side // mf)
+            out_chunk_shape.append(out_side)
+
+        # Diagnostic so the chosen sizes are visible in the log
+        super_voxels = int(np.prod(np.array(out_chunk_shape) * max_factor))
+        super_bytes = super_voxels * itemsize
+        logger.info(
+            "pyramid_builder: per-worker budget %d MB, num_workers %d, dtype "
+            "itemsize %d → out_chunk_shape=%s, super_chunk_shape=%s "
+            "(%.1f MB s0 read per task)",
+            worker_mb, int(self.num_workers), itemsize,
+            out_chunk_shape, (np.array(out_chunk_shape) * max_factor).tolist(),
+            super_bytes / 1e6,
+        )
 
         # Build
         logger.info(

From 272c3efe8b6b44fed39ea9ad6619cc1926ede57b Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Fri, 12 Jun 2026 16:27:32 -0400
Subject: [PATCH 09/19] Memory-aware pyramid build with OOM retry
 (assembly-style)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Refactored the pyramid build to mirror the assembly pipeline's
memory-aware retry pattern:

- _size_super_chunk(thread_count) helper computes out_chunk_shape so
  each thread's super-chunk read + scratch fits in per-thread budget.
- Build wrapped in retry loop: on MemoryError, halves attempt_workers
  and recomputes a bigger super-chunk size (more memory per thread).
  Up to memory_retry_max retries (default 3).
- Partial pyramid from a failed attempt is torn down before retry so
  the next attempt starts clean.

Same effect as assembly's 'halve processes/job, restart wave' pattern,
but adapted for the driver-threaded pyramid build (one process, N
threads instead of N processes per LSF job). For the c-elegans cell
volume on a typical Lustre worker (15 GB job memory, -n 24), first
attempt uses ~625 MB/thread; if that OOMs we'd retry with 12 threads
and ~1.25 GB/thread, etc.

For genuine production scale-out across the LSF cluster (as opposed
to within a single driver process), still a follow-up — but driver-
threaded with adaptive sizing already covers up to ~100s of GB of
input volume on a reasonably-spec'd workstation/login-node.
---
 src/mesh_n_bone/meshify/meshify.py | 152 +++++++++++++++++------------
 1 file changed, 87 insertions(+), 65 deletions(-)

diff --git a/src/mesh_n_bone/meshify/meshify.py b/src/mesh_n_bone/meshify/meshify.py
index bcceef2..de9dcdb 100644
--- a/src/mesh_n_bone/meshify/meshify.py
+++ b/src/mesh_n_bone/meshify/meshify.py
@@ -2205,78 +2205,100 @@ def _s0_reader(origin_voxels, shape_voxels):
             )
         downsample_func = downsample_dispatch[self.downsample_method]
 
-        # Choose out_chunk_shape memory-aware. Each thread holds one super-chunk
-        # of s0 voxels in memory simultaneously, plus the per-LOD downsamples
-        # (cumulative ~1.14x of the s0 read for factors 2/4/8). Per-thread
-        # budget = per-worker dask budget / num_workers; cap super-chunk
-        # voxel count so the read + downsample fit comfortably (50% headroom).
-        src_chunk = np.asarray(seg_arr.chunk_shape, dtype=np.int64)
+        # Memory-aware sizing. Each thread holds one super-chunk of s0 voxels
+        # in memory simultaneously, plus the per-LOD downsamples (cumulative
+        # ~14% of the s0 read for factors 2/4/8) and scratch (~2x of that).
+        # Per-thread budget = per-worker dask memory budget / threads.
         max_factor = np.max(np.array([f for _, f in needed_uniform]), axis=0)
         itemsize = int(np.dtype(seg_arr.data.dtype).itemsize)
-        # Amplification: downsample at every level holds an output buffer +
-        # an intermediate working buffer roughly equal to its own size.
-        # Sum over LODs: 1 (read) + sum(1/f^3) for k>=1, plus ~2x scratch.
         amp_per_lod = sum(1.0 / (mf ** 3) for mf in (2, 4, 8))  # 0.142
-        amplification = 1.0 + amp_per_lod * 2.0  # ~1.3x
+        amplification = 1.0 + amp_per_lod * 2.0                # ~1.3x
 
         worker_mb = _estimate_block_target_mb_from_dask_config(
-            "dask-config.yaml",  # resolved relative to cwd (the config dir)
-            fallback_mb=2048,    # 2 GB sane default if dask config missing
-        )
-        per_thread_bytes = (
-            worker_mb * 1e6 / max(1, int(self.num_workers)) / amplification
-        )
-        max_super_chunk_voxels = int(per_thread_bytes / itemsize)
-
-        # Largest cube of voxels that fits and is a multiple of max_factor
-        # along each axis. cube_side = floor(cuberoot(max_voxels))
-        target_super_side = max(1, int(max_super_chunk_voxels ** (1.0 / 3.0)))
-        out_chunk_shape = []
-        for ax in range(3):
-            mf = int(max_factor[ax])
-            super_side = (target_super_side // mf) * mf
-            super_side = max(mf, super_side)  # at least one max_factor cube
-            out_side = max(1, super_side // mf)
-            out_chunk_shape.append(out_side)
-
-        # Diagnostic so the chosen sizes are visible in the log
-        super_voxels = int(np.prod(np.array(out_chunk_shape) * max_factor))
-        super_bytes = super_voxels * itemsize
-        logger.info(
-            "pyramid_builder: per-worker budget %d MB, num_workers %d, dtype "
-            "itemsize %d → out_chunk_shape=%s, super_chunk_shape=%s "
-            "(%.1f MB s0 read per task)",
-            worker_mb, int(self.num_workers), itemsize,
-            out_chunk_shape, (np.array(out_chunk_shape) * max_factor).tolist(),
-            super_bytes / 1e6,
+            "dask-config.yaml",
+            fallback_mb=2048,
         )
 
-        # Build
-        logger.info(
-            "pyramid_builder: %d missing scale(s) detected for downsample multires "
-            "(factors=%s); building under %s with alignment_mode=%s",
-            len(needed_uniform),
-            [factor for _, factor in needed_uniform],
-            pyramid_path,
-            self.pyramid_alignment_mode,
-        )
-        build_missing_pyramid_levels(
-            s0_reader=_s0_reader,
-            s0_dataset_shape_voxels=dataset_shape,
-            s0_voxel_size_zyx=self.true_voxel_size.tolist(),
-            s0_translation_zyx=self.true_offset.tolist(),
-            dtype=seg_arr.data.dtype,
-            num_lods=num_lods,
-            existing_factors=set(),  # build all uniform-factor levels into pyramid
-            output_zarr_path=pyramid_path,
-            downsample_func=downsample_func,
-            roi_origin_voxels=roi_voxel_origin,
-            roi_shape_voxels=roi_voxel_shape,
-            out_chunk_shape_voxels=tuple(out_chunk_shape),
-            alignment_mode=self.pyramid_alignment_mode,
-            s0_source_path=self._dataset_path,
-            num_workers=max(1, int(self.num_workers)),
-        )
+        def _size_super_chunk(thread_count):
+            """Choose out_chunk_shape so each of ``thread_count`` threads
+            can hold one super-chunk plus scratch within the per-worker
+            memory budget."""
+            per_thread_bytes = (
+                worker_mb * 1e6 / max(1, int(thread_count)) / amplification
+            )
+            max_super_voxels = int(per_thread_bytes / itemsize)
+            target_super_side = max(1, int(max_super_voxels ** (1.0 / 3.0)))
+            shape = []
+            for ax in range(3):
+                mf = int(max_factor[ax])
+                super_side = (target_super_side // mf) * mf
+                super_side = max(mf, super_side)
+                shape.append(max(1, super_side // mf))
+            return shape
+
+        # Build, with OOM retry that halves thread count + re-sizes super-chunks
+        # if the worker runs out of memory. Mirrors run_with_oom_retry on the
+        # mesh side, scoped to the pyramid build.
+        max_retries = int(getattr(self, "memory_retry_max", 3))
+        attempt_workers = max(1, int(self.num_workers))
+        last_exc = None
+        for attempt in range(max_retries + 1):
+            out_chunk_shape = _size_super_chunk(attempt_workers)
+            super_voxels = int(np.prod(np.array(out_chunk_shape) * max_factor))
+            super_bytes = super_voxels * itemsize
+            logger.info(
+                "pyramid_builder: %sattempt %d: per-worker budget %d MB, "
+                "threads %d, dtype itemsize %d → out_chunk_shape=%s, "
+                "super_chunk_shape=%s (%.1f MB s0 read per task)",
+                "" if attempt == 0 else "retry ",
+                attempt + 1, worker_mb, attempt_workers, itemsize,
+                out_chunk_shape,
+                (np.array(out_chunk_shape) * max_factor).tolist(),
+                super_bytes / 1e6,
+            )
+            if attempt == 0:
+                logger.info(
+                    "pyramid_builder: %d missing scale(s) detected for "
+                    "downsample multires (factors=%s); building under %s "
+                    "with alignment_mode=%s",
+                    len(needed_uniform),
+                    [factor for _, factor in needed_uniform],
+                    pyramid_path, self.pyramid_alignment_mode,
+                )
+            try:
+                build_missing_pyramid_levels(
+                    s0_reader=_s0_reader,
+                    s0_dataset_shape_voxels=dataset_shape,
+                    s0_voxel_size_zyx=self.true_voxel_size.tolist(),
+                    s0_translation_zyx=self.true_offset.tolist(),
+                    dtype=seg_arr.data.dtype,
+                    num_lods=num_lods,
+                    existing_factors=set(),
+                    output_zarr_path=pyramid_path,
+                    downsample_func=downsample_func,
+                    roi_origin_voxels=roi_voxel_origin,
+                    roi_shape_voxels=roi_voxel_shape,
+                    out_chunk_shape_voxels=tuple(out_chunk_shape),
+                    alignment_mode=self.pyramid_alignment_mode,
+                    s0_source_path=self._dataset_path,
+                    num_workers=attempt_workers,
+                )
+                break
+            except MemoryError as e:
+                last_exc = e
+                if attempt_workers == 1 or attempt >= max_retries:
+                    raise
+                new_workers = max(1, attempt_workers // 2)
+                logger.warning(
+                    "pyramid_builder: MemoryError on attempt %d with %d threads. "
+                    "Halving to %d threads and retrying with larger per-thread "
+                    "memory budget (which means bigger super-chunks).",
+                    attempt + 1, attempt_workers, new_workers,
+                )
+                attempt_workers = new_workers
+                # Tear down whatever partial pyramid landed before retrying
+                if os.path.isdir(pyramid_path):
+                    _safely_remove_pyramid(pyramid_path, num_workers=attempt_workers)
 
         # Return mapping of newly-built factors -> path within the pyramid
         return {

From 4a30a5f259bbcc9113aaf2a78c560cb7f510a535 Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Fri, 12 Jun 2026 16:34:34 -0400
Subject: [PATCH 10/19] Stream LOD writes in pyramid build to reduce peak
 memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

downsample_super_chunk now accepts a write_chunk callback. When
provided, each LOD's output buffer is written to disk and dropped
immediately, instead of being collected into a dict alongside every
other LOD's output. Peak in-memory per thread drops from
'super-chunk + sum(all LOD outputs)' (~1.14x super-chunk for
isotropic 2/4/8 factors) to 'super-chunk + one LOD output' (~1.125x
super-chunk).

Kept the direct s0 → s_k downsample (mode-of-8x8x8 of s0 for LOD 3,
etc.) rather than switching to cascade. Cascade is the more common
connectomics convention but the user prefers the current behavior
for now.

Memory amplification factor updated to reflect the lower peak.
---
 src/mesh_n_bone/meshify/meshify.py      | 12 ++---
 src/mesh_n_bone/util/pyramid_builder.py | 59 ++++++++++++++++---------
 2 files changed, 45 insertions(+), 26 deletions(-)

diff --git a/src/mesh_n_bone/meshify/meshify.py b/src/mesh_n_bone/meshify/meshify.py
index de9dcdb..10d0c76 100644
--- a/src/mesh_n_bone/meshify/meshify.py
+++ b/src/mesh_n_bone/meshify/meshify.py
@@ -2205,14 +2205,14 @@ def _s0_reader(origin_voxels, shape_voxels):
             )
         downsample_func = downsample_dispatch[self.downsample_method]
 
-        # Memory-aware sizing. Each thread holds one super-chunk of s0 voxels
-        # in memory simultaneously, plus the per-LOD downsamples (cumulative
-        # ~14% of the s0 read for factors 2/4/8) and scratch (~2x of that).
-        # Per-thread budget = per-worker dask memory budget / threads.
+        # Memory-aware sizing. With streaming writes (downsample_super_chunk's
+        # write_chunk callback), each thread holds at most: the s0 super-chunk
+        # + ONE LOD's output buffer at a time. Largest LOD output is LOD 1
+        # (1/8 of s0). Plus scratch (~2x of the LOD output).
         max_factor = np.max(np.array([f for _, f in needed_uniform]), axis=0)
         itemsize = int(np.dtype(seg_arr.data.dtype).itemsize)
-        amp_per_lod = sum(1.0 / (mf ** 3) for mf in (2, 4, 8))  # 0.142
-        amplification = 1.0 + amp_per_lod * 2.0                # ~1.3x
+        # 1.0 (s0 read) + 1/8 (LOD 1 output) + ~2*1/8 (downsample scratch)
+        amplification = 1.0 + 1.0 / 8.0 + 2.0 / 8.0            # ~1.375x
 
         worker_mb = _estimate_block_target_mb_from_dask_config(
             "dask-config.yaml",
diff --git a/src/mesh_n_bone/util/pyramid_builder.py b/src/mesh_n_bone/util/pyramid_builder.py
index 42eb12e..03d1df7 100644
--- a/src/mesh_n_bone/util/pyramid_builder.py
+++ b/src/mesh_n_bone/util/pyramid_builder.py
@@ -210,28 +210,45 @@ def downsample_super_chunk(
     per_lod_factors: list[tuple[int, int, int]],
     downsample_func,
     out_chunk_shape: np.ndarray,
-) -> dict[int, tuple[np.ndarray, np.ndarray]]:
-    """Compute all-LOD downsamples for a single super-chunk of s0.
-
-    ``s0_block`` is the s0 voxel data covering the super-chunk extent
-    (caller is responsible for reading + clipping/zero-padding).
-    Returns ``{lod: (output_block, output_origin_voxels)}`` for every
-    LOD ≥ 1. LOD 0 isn't emitted — the caller writes/links s0 separately.
+    write_chunk=None,
+) -> dict[int, tuple[np.ndarray, np.ndarray]] | None:
+    """Direct-downsample a single super-chunk of s0 into all LODs.
+
+    Each output LOD voxel = ``downsample_func(s0 cube)`` where the cube
+    extent is the LOD's per-axis cumulative factor. We downsample s0
+    directly at each LOD (not s_{k-1} → s_k cascade).
+
+    Memory: with the ``write_chunk`` callback, each LOD's output is
+    written immediately and dropped — peak in-memory at any moment is
+    ``s0_block + one_lod_output``. Without the callback, all LOD blocks
+    are collected and returned (legacy dict API kept for tests).
+
+    Parameters
+    ----------
+    write_chunk : callable, optional
+        ``write_chunk(lod_k, ds_block, out_origin)``. Called per LOD as
+        soon as the block is computed. Function returns ``None``.
     """
-    out = {}
+    collected = None if write_chunk is not None else {}
     for k, factor in enumerate(per_lod_factors):
         if k == 0:
             continue
         f = np.asarray(factor, dtype=int)
-        # Trim s0_block to the largest extent that's a multiple of f.
+        # Trim s0_block to the largest extent that's a multiple of f
         trim = (np.array(s0_block.shape) // f) * f
         block = s0_block[: trim[0], : trim[1], : trim[2]]
         if block.size == 0:
             continue
         ds_block, _ = downsample_func(block, tuple(f.tolist()))
         out_origin = super_chunk_origin_voxels // f
-        out[k] = (ds_block, out_origin)
-    return out
+        if write_chunk is not None:
+            # Drop the reference inside this iteration so the buffer
+            # can be GC'd before the next downsample allocates
+            write_chunk(k, ds_block, out_origin)
+            del ds_block
+        else:
+            collected[k] = (ds_block, out_origin)
+    return collected
 
 
 # ---------------------------------------------------------------------------
@@ -359,21 +376,23 @@ def _process_super_chunk(sc_origin):
         read_end = np.minimum(sc_origin + super_chunk_shape, dataset_shape)
         read_size = read_end - sc_origin
         s0_block = s0_reader(sc_origin, read_size)
-        # Downsample to all missing LODs
-        downsamples = downsample_super_chunk(
-            s0_block, sc_origin, factors_per_lod,
-            downsample_func, out_chunk,
-        )
-        # Write each LOD's chunk
-        for k, (ds_block, ds_origin) in downsamples.items():
+
+        def _write_one(k, ds_block, ds_origin):
             if k not in out_arrays:
-                continue
+                return
             arr = out_arrays[k]
-            # Output position relative to out_origin/factor
             f = np.asarray(factors_per_lod[k], dtype=np.int64)
             local_origin = ds_origin - (out_origin // f)
             _write_zarr_v2_region(arr, local_origin, ds_block)
 
+        # Stream each LOD's output: compute → write → drop the buffer.
+        # Peak memory per task ≈ s0_block + one_lod_output.
+        downsample_super_chunk(
+            s0_block, sc_origin, factors_per_lod,
+            downsample_func, out_chunk,
+            write_chunk=_write_one,
+        )
+
     n_chunks = len(sc_grid)
     if dispatch is not None:
         dispatch(_process_super_chunk, sc_grid)

From e0809f21cb9788c186a68dd91d6b971742be22c9 Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Fri, 12 Jun 2026 17:05:26 -0400
Subject: [PATCH 11/19] Move pyramid build to dask cluster + smart
 processes-per-job sizing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Refactored _build_missing_pyramid_scales from driver-side ThreadPoolExecutor
to genuine cluster dispatch via dask bag + run_with_oom_retry, mirroring
how get_chunked_meshes and assemble_meshes work.

- New module-level worker process_super_chunk_for_dask(sc_origin, config)
  in pyramid_builder.py. Picklable, opens its own tensorstore handles
  (cached per worker process), reads s0 super-chunk, downsamples to each
  missing LOD, writes each chunk via tensorstore, drops the buffer.
- New driver-side helper prepare_pyramid_metadata_and_arrays() writes
  the OME-NGFF multiscales metadata and creates the empty output zarr
  arrays before workers fan out.
- Proactive processes-per-job sizing using the assembly pipeline's
  helpers (_recommended_assembly_processes, _assembly_config_for_processes).
  If a super-chunk doesn't fit in (job_memory / base_processes), the
  dask config is adjusted upfront to halve processes-per-job (= double
  memory per worker), then run_with_oom_retry can still halve further
  on actual OOM.
- Tolerant of missing dask-config.yaml: tests / local runs without LSF
  just use LocalCluster.
- Diagnostic log of the driver process's RLIMIT_NOFILE so future OS-FD
  issues are visible up-front; also raises soft → hard if possible.

Removed the driver-side num_workers cap (was a 32-thread cap meant for
the in-process ThreadPoolExecutor; cluster dispatch doesn't need it).

All 355 tests pass.
---
 src/mesh_n_bone/meshify/meshify.py      | 235 ++++++++++++++++++------
 src/mesh_n_bone/util/pyramid_builder.py | 160 ++++++++++++++++
 2 files changed, 343 insertions(+), 52 deletions(-)

diff --git a/src/mesh_n_bone/meshify/meshify.py b/src/mesh_n_bone/meshify/meshify.py
index 10d0c76..091b194 100644
--- a/src/mesh_n_bone/meshify/meshify.py
+++ b/src/mesh_n_bone/meshify/meshify.py
@@ -2165,24 +2165,11 @@ def _build_missing_pyramid_scales(self, num_lods):
         # Dataset shape in voxels
         dataset_shape = np.asarray(seg_arr.data.shape[-3:], dtype=np.int64)
 
-        # Open a tensorstore handle for s0 (Meshify's segmentation_array.data is
-        # ArrayMetadata, not a tensorstore — to_ndarray_tensorstore needs the
-        # real handle).
-        from mesh_n_bone.util.image_data_interface import open_ds_tensorstore
+        # No driver-side tensorstore handle needed — workers re-open per-process.
         dataset_path = self._dataset_path
         swap_axes = self._swap_axes
         ds_offset_coord = Coordinate(seg_arr.roi.offset)
         vs_coord = Coordinate(seg_arr.voxel_size)
-        ts_dataset = open_ds_tensorstore(dataset_path)
-
-        def _s0_reader(origin_voxels, shape_voxels):
-            world_origin = ds_offset_coord + Coordinate(origin_voxels.tolist()) * vs_coord
-            world_shape = Coordinate(shape_voxels.tolist()) * vs_coord
-            roi = Roi(world_origin, world_shape)
-            return to_ndarray_tensorstore(
-                ts_dataset, roi, vs_coord, ds_offset_coord,
-                swap_axes=swap_axes, fill_value=0, source_path=dataset_path,
-            )
 
         # Downsample function dispatch
         from mesh_n_bone.meshify.downsample import (
@@ -2236,67 +2223,211 @@ def _size_super_chunk(thread_count):
                 shape.append(max(1, super_side // mf))
             return shape
 
-        # Build, with OOM retry that halves thread count + re-sizes super-chunks
-        # if the worker runs out of memory. Mirrors run_with_oom_retry on the
-        # mesh side, scoped to the pyramid build.
+        # Resolve ROI alignment up-front (we need the aligned out_shape to
+        # build the output zarr arrays)
+        from mesh_n_bone.util.pyramid_builder import (
+            per_lod_factors_for_anisotropy,
+            align_roi_voxels,
+            prepare_pyramid_metadata_and_arrays,
+            process_super_chunk_for_dask,
+        )
+        max_factor_per_axis = max_factor  # alias
+        factors_per_lod_full = per_lod_factors_for_anisotropy(
+            self.true_voxel_size, num_lods,
+        )
+        aligned_out_origin, aligned_out_shape, _, _ = align_roi_voxels(
+            roi_voxel_origin, roi_voxel_shape,
+            max_factor_per_axis, self.pyramid_alignment_mode,
+        )
+        if not np.array_equal(aligned_out_origin, roi_voxel_origin) or \
+           not np.array_equal(aligned_out_shape, roi_voxel_shape):
+            logger.info(
+                "pyramid_builder: %s aligned ROI from origin=%s shape=%s to "
+                "origin=%s shape=%s (max per-axis factor=%s)",
+                self.pyramid_alignment_mode,
+                roi_voxel_origin.tolist(), roi_voxel_shape.tolist(),
+                aligned_out_origin.tolist(), aligned_out_shape.tolist(),
+                max_factor_per_axis.tolist(),
+            )
+
+        logger.info(
+            "pyramid_builder: %d missing scale(s) detected for downsample "
+            "multires (factors=%s); building under %s with alignment_mode=%s",
+            len(needed_uniform),
+            [factor for _, factor in needed_uniform],
+            pyramid_path, self.pyramid_alignment_mode,
+        )
+
+        # OOM retry loop. On MemoryError from any worker, we halve the worker
+        # count (giving each worker 2x memory) and recompute a bigger
+        # super-chunk size.
         max_retries = int(getattr(self, "memory_retry_max", 3))
         attempt_workers = max(1, int(self.num_workers))
-        last_exc = None
         for attempt in range(max_retries + 1):
             out_chunk_shape = _size_super_chunk(attempt_workers)
             super_voxels = int(np.prod(np.array(out_chunk_shape) * max_factor))
             super_bytes = super_voxels * itemsize
             logger.info(
-                "pyramid_builder: %sattempt %d: per-worker budget %d MB, "
-                "threads %d, dtype itemsize %d → out_chunk_shape=%s, "
-                "super_chunk_shape=%s (%.1f MB s0 read per task)",
+                "pyramid_builder: %sattempt %d: dask cluster workers=%d, "
+                "per-worker budget %d MB, dtype itemsize %d → "
+                "out_chunk_shape=%s, super_chunk_shape=%s "
+                "(%.1f MB s0 read per task)",
                 "" if attempt == 0 else "retry ",
-                attempt + 1, worker_mb, attempt_workers, itemsize,
+                attempt + 1, attempt_workers, worker_mb, itemsize,
                 out_chunk_shape,
                 (np.array(out_chunk_shape) * max_factor).tolist(),
                 super_bytes / 1e6,
             )
-            if attempt == 0:
-                logger.info(
-                    "pyramid_builder: %d missing scale(s) detected for "
-                    "downsample multires (factors=%s); building under %s "
-                    "with alignment_mode=%s",
-                    len(needed_uniform),
-                    [factor for _, factor in needed_uniform],
-                    pyramid_path, self.pyramid_alignment_mode,
+
+            # Driver-side: write OME metadata + create empty output arrays
+            super_chunk_shape, _ = prepare_pyramid_metadata_and_arrays(
+                output_zarr_path=pyramid_path,
+                factors_per_lod=factors_per_lod_full,
+                missing_lods=[k for k, _ in needed_uniform],
+                out_shape=aligned_out_shape,
+                out_chunk_shape_voxels=tuple(out_chunk_shape),
+                dtype=seg_arr.data.dtype,
+                s0_voxel_size_zyx=self.true_voxel_size.tolist(),
+                s0_translation_zyx=self.true_offset.tolist(),
+                s0_source_path=self._dataset_path,
+            )
+
+            # Build the super-chunk grid (in s0 voxels)
+            sc_grid = []
+            for z0 in range(int(aligned_out_origin[0]),
+                            int(aligned_out_origin[0] + aligned_out_shape[0]),
+                            int(super_chunk_shape[0])):
+                for y0 in range(int(aligned_out_origin[1]),
+                                int(aligned_out_origin[1] + aligned_out_shape[1]),
+                                int(super_chunk_shape[1])):
+                    for x0 in range(int(aligned_out_origin[2]),
+                                    int(aligned_out_origin[2] + aligned_out_shape[2]),
+                                    int(super_chunk_shape[2])):
+                        sc_grid.append((z0, y0, x0))
+            n_chunks = len(sc_grid)
+
+            # Worker config: must be picklable (no closures, no opened handles)
+            worker_config = {
+                "s0_dataset_path": dataset_path,
+                "dataset_offset": tuple(int(v) for v in seg_arr.roi.offset),
+                "voxel_size": tuple(int(v) for v in seg_arr.voxel_size),
+                "swap_axes": bool(swap_axes),
+                "dataset_shape": tuple(int(v) for v in dataset_shape.tolist()),
+                "factors_per_lod": [list(f) for f in factors_per_lod_full],
+                "missing_lods": [k for k, _ in needed_uniform],
+                "downsample_method": self.downsample_method,
+                "pyramid_path": pyramid_path,
+                "super_chunk_shape": tuple(int(v) for v in super_chunk_shape.tolist()),
+                "out_origin": tuple(int(v) for v in aligned_out_origin.tolist()),
+            }
+
+            # Smart dask sizing — match the assembly pipeline's pattern:
+            # estimate per-task peak memory, recommend processes-per-LSF-job
+            # so each worker has enough memory for one super-chunk, build the
+            # adjusted dask config up-front, then let run_with_oom_retry
+            # halve further on actual OOM.
+            #
+            # Tolerant of missing dask-config.yaml: tests (and runs that
+            # don't use LSF) can dispatch via a LocalCluster without a
+            # jobqueue config — in that case run_with_oom_retry gets
+            # config=None and starts a LocalCluster.
+            adjusted_dask_config = None
+            workers_with_adjusted = attempt_workers
+            try:
+                base_dask_config = dask_util._load_dask_config()
+            except (FileNotFoundError, OSError):
+                base_dask_config = None
+            if base_dask_config:
+                cluster_type, settings = _jobqueue_settings(base_dask_config)
+                base_processes = int((settings or {}).get("processes", 1) or 1)
+                job_memory = _job_memory_bytes(settings)
+                per_task_peak_bytes = int(super_bytes * amplification)
+                recommended_processes = _recommended_assembly_processes(
+                    per_task_peak_bytes, job_memory, base_processes,
+                    memory_fraction=0.60,
+                )
+                adjusted_dask_config = _assembly_config_for_processes(
+                    base_dask_config, cluster_type, recommended_processes,
                 )
+                if recommended_processes < base_processes:
+                    # Each LSF job now hosts fewer processes (= workers), so
+                    # we need MORE jobs to keep the requested total worker
+                    # count. dask-jobqueue spins one LSF job per "worker" in
+                    # the user's request, with `processes` workers in each
+                    # job. Increase the requested-job count proportionally.
+                    workers_with_adjusted = max(
+                        1,
+                        attempt_workers * base_processes // recommended_processes,
+                    )
+                if recommended_processes != base_processes:
+                    logger.info(
+                        "pyramid_builder: super-chunk peak %.1f MB per task vs "
+                        "job memory %.1f MB / %d base processes = %.1f MB each → "
+                        "reducing processes/job to %d (2x memory per worker)",
+                        per_task_peak_bytes / 1e6,
+                        (job_memory or 0) / 1e6, base_processes,
+                        (job_memory or 0) / max(1, base_processes) / 1e6,
+                        recommended_processes,
+                    )
+
+            # Diagnostic: show the driver process's actual file-descriptor
+            # limit. The earlier crash at ~576 workers with EMFILE was
+            # surprising given the shell's ulimit -n = 1M; if we see a
+            # different number here, that's the root cause.
+            try:
+                import resource as _resource
+                soft, hard = _resource.getrlimit(_resource.RLIMIT_NOFILE)
+                if hard > soft:
+                    _resource.setrlimit(_resource.RLIMIT_NOFILE, (hard, hard))
+                    logger.info(
+                        "pyramid_builder: FD soft limit raised %d → %d "
+                        "(hard cap)", soft, hard,
+                    )
+                else:
+                    logger.info(
+                        "pyramid_builder: FD soft limit %d, hard %d (no raise needed)",
+                        soft, hard,
+                    )
+            except (ImportError, ValueError, OSError) as _e:
+                logger.warning("pyramid_builder: could not adjust FD limit: %s", _e)
+
+            effective_workers = dask_util.effective_num_workers(
+                workers_with_adjusted, n_chunks, logger, "pyramid build",
+            )
+
+            def _run(workers, dask_config):
+                import dask.bag as db
+                npartitions = dask_util.guesstimate_npartitions(n_chunks, workers)
+                bag = db.from_sequence(sc_grid, npartitions=npartitions).map(
+                    process_super_chunk_for_dask, worker_config=worker_config,
+                )
+                with dask_util.start_dask(
+                    workers, "pyramid build", logger, config=dask_config,
+                ):
+                    with Timing_Messager(
+                        f"Building pyramid ({n_chunks} super-chunks)", logger,
+                    ):
+                        bag.compute()
+
             try:
-                build_missing_pyramid_levels(
-                    s0_reader=_s0_reader,
-                    s0_dataset_shape_voxels=dataset_shape,
-                    s0_voxel_size_zyx=self.true_voxel_size.tolist(),
-                    s0_translation_zyx=self.true_offset.tolist(),
-                    dtype=seg_arr.data.dtype,
-                    num_lods=num_lods,
-                    existing_factors=set(),
-                    output_zarr_path=pyramid_path,
-                    downsample_func=downsample_func,
-                    roi_origin_voxels=roi_voxel_origin,
-                    roi_shape_voxels=roi_voxel_shape,
-                    out_chunk_shape_voxels=tuple(out_chunk_shape),
-                    alignment_mode=self.pyramid_alignment_mode,
-                    s0_source_path=self._dataset_path,
-                    num_workers=attempt_workers,
+                dask_util.run_with_oom_retry(
+                    _run, effective_workers, "pyramid build", logger,
+                    max_retries=self.memory_retry_max,
+                    retry_on_oom=self.retry_on_oom,
+                    config=adjusted_dask_config,
                 )
                 break
-            except MemoryError as e:
-                last_exc = e
+            except MemoryError:
                 if attempt_workers == 1 or attempt >= max_retries:
                     raise
                 new_workers = max(1, attempt_workers // 2)
                 logger.warning(
-                    "pyramid_builder: MemoryError on attempt %d with %d threads. "
-                    "Halving to %d threads and retrying with larger per-thread "
-                    "memory budget (which means bigger super-chunks).",
+                    "pyramid_builder: MemoryError on attempt %d with %d workers. "
+                    "Halving to %d workers and retrying with a larger per-worker "
+                    "memory budget.",
                     attempt + 1, attempt_workers, new_workers,
                 )
                 attempt_workers = new_workers
-                # Tear down whatever partial pyramid landed before retrying
                 if os.path.isdir(pyramid_path):
                     _safely_remove_pyramid(pyramid_path, num_workers=attempt_workers)
 
diff --git a/src/mesh_n_bone/util/pyramid_builder.py b/src/mesh_n_bone/util/pyramid_builder.py
index 03d1df7..c3e412b 100644
--- a/src/mesh_n_bone/util/pyramid_builder.py
+++ b/src/mesh_n_bone/util/pyramid_builder.py
@@ -251,6 +251,166 @@ def downsample_super_chunk(
     return collected
 
 
+# ---------------------------------------------------------------------------
+# Dask-cluster worker: runs ONE super-chunk task in a dask worker process.
+# Module-level so the function reference is picklable for dask.
+# ---------------------------------------------------------------------------
+
+
+# Per-worker-process cache: tensorstore handles, keyed by path.
+# Open is non-trivial (info-file fetch) — caching saves time across the
+# many tasks a single dask worker processes.
+_PYRAMID_WORKER_TS_CACHE: dict[str, object] = {}
+
+
+def _ts_handle_for_input(dataset_path):
+    if dataset_path in _PYRAMID_WORKER_TS_CACHE:
+        return _PYRAMID_WORKER_TS_CACHE[dataset_path]
+    from mesh_n_bone.util.image_data_interface import open_ds_tensorstore
+    handle = open_ds_tensorstore(dataset_path)
+    _PYRAMID_WORKER_TS_CACHE[dataset_path] = handle
+    return handle
+
+
+def _ts_handle_for_output(path):
+    if path in _PYRAMID_WORKER_TS_CACHE:
+        return _PYRAMID_WORKER_TS_CACHE[path]
+    import tensorstore as ts
+    handle = ts.open({
+        "driver": "zarr",
+        "kvstore": {"driver": "file", "path": path},
+        "open": True,
+    }).result()
+    _PYRAMID_WORKER_TS_CACHE[path] = handle
+    return handle
+
+
+def process_super_chunk_for_dask(sc_origin_tuple, worker_config):
+    """Process ONE pyramid super-chunk inside a dask worker process.
+
+    Module-level + serialization-safe so dask can ship this to LSF workers.
+    ``worker_config`` is a plain dict of strings/tuples/numbers (no numpy
+    arrays, no closures, no opened handles).
+
+    Reads its s0 super-chunk, downsamples to each missing LOD via the
+    configured ``downsample_method``, writes each LOD's chunk to the
+    pre-created output zarr arrays under ``worker_config["pyramid_path"]``.
+
+    Returns ``None`` (dask bag collects None values; we don't aggregate).
+    """
+    from funlib.geometry import Coordinate, Roi
+    from mesh_n_bone.util.image_data_interface import to_ndarray_tensorstore
+    from mesh_n_bone.meshify.downsample import (
+        downsample_labels_3d,
+        downsample_binary_3d,
+        downsample_labels_3d_suppress_zero,
+    )
+
+    dispatch = {
+        "mode": downsample_labels_3d,
+        "mode_suppress_zero": downsample_labels_3d_suppress_zero,
+        "binary": downsample_binary_3d,
+    }
+    downsample_func = dispatch[worker_config["downsample_method"]]
+
+    sc_origin = np.array(sc_origin_tuple, dtype=np.int64)
+    out_origin = np.array(worker_config["out_origin"], dtype=np.int64)
+    super_chunk_shape = np.array(worker_config["super_chunk_shape"], dtype=np.int64)
+    dataset_shape = np.array(worker_config["dataset_shape"], dtype=np.int64)
+    factors_per_lod = [tuple(f) for f in worker_config["factors_per_lod"]]
+    missing_lods = list(worker_config["missing_lods"])
+
+    # Read s0 super-chunk in world coords (same pattern as the meshify worker)
+    ds_offset = Coordinate(worker_config["dataset_offset"])
+    voxel_size = Coordinate(worker_config["voxel_size"])
+    read_end_voxels = np.minimum(sc_origin + super_chunk_shape, dataset_shape)
+    read_size_voxels = read_end_voxels - sc_origin
+    if np.any(read_size_voxels <= 0):
+        return None
+    world_origin = ds_offset + Coordinate(sc_origin.tolist()) * voxel_size
+    world_shape = Coordinate(read_size_voxels.tolist()) * voxel_size
+    roi = Roi(world_origin, world_shape)
+
+    ts_s0 = _ts_handle_for_input(worker_config["s0_dataset_path"])
+    s0_block = to_ndarray_tensorstore(
+        ts_s0, roi, voxel_size, ds_offset,
+        swap_axes=worker_config["swap_axes"], fill_value=0,
+        source_path=worker_config["s0_dataset_path"],
+    )
+
+    # Downsample + write each LOD's chunk (streaming: drop the buffer after write)
+    for k in missing_lods:
+        f = np.asarray(factors_per_lod[k], dtype=int)
+        trim = (np.array(s0_block.shape) // f) * f
+        block = s0_block[: trim[0], : trim[1], : trim[2]]
+        if block.size == 0:
+            continue
+        ds_block, _ = downsample_func(block, tuple(f.tolist()))
+        ds_origin = sc_origin // f
+        local_origin = ds_origin - (out_origin // f)
+        out_path = os.path.join(worker_config["pyramid_path"], f"s{k}")
+        out_arr = _ts_handle_for_output(out_path)
+        z, y, x = local_origin.tolist()
+        arr_shape = out_arr.shape
+        zE = min(z + ds_block.shape[0], arr_shape[0])
+        yE = min(y + ds_block.shape[1], arr_shape[1])
+        xE = min(x + ds_block.shape[2], arr_shape[2])
+        out_arr[z:zE, y:yE, x:xE].write(
+            ds_block[: zE - z, : yE - y, : xE - x]
+        ).result()
+        del ds_block
+
+    return None
+
+
+def prepare_pyramid_metadata_and_arrays(
+    *,
+    output_zarr_path,
+    factors_per_lod,
+    missing_lods,
+    out_shape,
+    out_chunk_shape_voxels,
+    dtype,
+    s0_voxel_size_zyx,
+    s0_translation_zyx,
+    zarr_format=2,
+    s0_source_path=None,
+):
+    """Driver-side setup before dispatching pyramid super-chunks to dask.
+
+    Writes the OME-NGFF multiscales metadata, creates empty output zarr
+    arrays for each missing LOD, and optionally symlinks s0. Returns
+    ``(super_chunk_shape, max_factor)``.
+    """
+    os.makedirs(output_zarr_path, exist_ok=True)
+    metadata = build_multiscales_metadata(
+        s0_voxel_size_zyx=list(s0_voxel_size_zyx),
+        s0_translation_zyx=list(s0_translation_zyx),
+        per_lod_factors=factors_per_lod,
+        version="0.4",
+    )
+    write_multiscales_metadata(output_zarr_path, metadata, zarr_format=zarr_format)
+
+    out_chunk = np.asarray(out_chunk_shape_voxels, dtype=np.int64)
+    out_shape = np.asarray(out_shape, dtype=np.int64)
+    max_factor = np.max(np.array(factors_per_lod), axis=0)
+    for k in missing_lods:
+        f = np.asarray(factors_per_lod[k], dtype=np.int64)
+        lod_shape = ((out_shape + f - 1) // f).tolist()
+        ds_path = os.path.join(output_zarr_path, f"s{k}")
+        if os.path.exists(ds_path):
+            shutil.rmtree(ds_path)
+        _create_zarr_v2_array(
+            ds_path, shape=lod_shape, chunks=out_chunk.tolist(), dtype=dtype,
+        )
+
+    if s0_source_path is not None:
+        _try_symlink_s0(output_zarr_path, s0_source_path)
+
+    super_chunk_shape = out_chunk * max_factor
+    return super_chunk_shape, max_factor
+
+
 # ---------------------------------------------------------------------------
 # Main entry point
 # ---------------------------------------------------------------------------

From a880d7e629879f5ba4712f3a6e7bd46d7e39b7a6 Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Fri, 12 Jun 2026 17:25:21 -0400
Subject: [PATCH 12/19] Decouple super-chunk shape from on-disk chunk shape +
 proactive memory sizing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two correctness fixes from the c-elegans cell run:

1. On-disk output chunk shape was sized from per-thread memory budget,
   producing 18^3 chunks → 33k files per s_k array → subsequent
   meshing read of s1 hit tensorstore timeouts on Lustre metadata.

   Now: out_chunk_shape inherits the input's chunk shape clamped to
   [32, 256] and rounded to a multiple of max_factor per axis. SAME
   chunk shape used for s1, s2, s3 (uniform across scales). For the
   c-elegans cell with src chunks 132x134x166, out_chunk lands at
   128x128x160 — s1 has ~72 chunks instead of 33k.

   super_chunk_shape (the s0 read region per task) is independent: a
   multiple of (out_chunk * max_factor) sized to fit per-worker memory.

2. Per-worker memory was computed as job_memory / num_workers — wrong
   for cluster dispatch where each dask worker is its own process in
   an LSF job. Correct: per-worker = job_memory / processes_per_job
   (each LSF job hosts `processes` workers sharing the job's memory).

   For c-elegans cell with 180 GB / 12 processes, that's 15 GB per
   worker — vs the previous (incorrect) 2.5 GB / 576 = 4 MB. The
   correct accounting lets super-chunks be sized properly.

Proactive process-count adjustment via _recommended_assembly_processes
mirrors how assembly's _plan_assembly_waves chooses processes/job per
wave: if the minimum possible super-chunk wouldn't fit at
base_processes, halve processes/job upfront (not just on OOM retry).

Diagnostic FD-limit log is now silent unless the raise actually fails;
explicit log of effective worker count after task-count capping
('dispatching N super-chunks across M dask workers'). All 355 tests pass.
---
 src/mesh_n_bone/meshify/meshify.py | 243 ++++++++++++++++-------------
 1 file changed, 135 insertions(+), 108 deletions(-)

diff --git a/src/mesh_n_bone/meshify/meshify.py b/src/mesh_n_bone/meshify/meshify.py
index 091b194..3269c3f 100644
--- a/src/mesh_n_bone/meshify/meshify.py
+++ b/src/mesh_n_bone/meshify/meshify.py
@@ -2192,36 +2192,61 @@ def _build_missing_pyramid_scales(self, num_lods):
             )
         downsample_func = downsample_dispatch[self.downsample_method]
 
-        # Memory-aware sizing. With streaming writes (downsample_super_chunk's
-        # write_chunk callback), each thread holds at most: the s0 super-chunk
-        # + ONE LOD's output buffer at a time. Largest LOD output is LOD 1
-        # (1/8 of s0). Plus scratch (~2x of the LOD output).
+        # Two decoupled sizes:
+        #   out_chunk_shape:  on-disk chunk shape of the s_k zarr arrays.
+        #                     Should be a SANE chunk size (~64-128) so
+        #                     subsequent reads from the s_k arrays
+        #                     produce a manageable number of files.
+        #                     Independent of memory budget.
+        #   super_chunk_shape:  region of s0 each worker task reads. Must
+        #                     be a multiple of out_chunk_shape * max_factor
+        #                     (so each task fills an integer number of
+        #                     output chunks at every LOD). Sized to fit
+        #                     per-worker memory budget.
         max_factor = np.max(np.array([f for _, f in needed_uniform]), axis=0)
         itemsize = int(np.dtype(seg_arr.data.dtype).itemsize)
         # 1.0 (s0 read) + 1/8 (LOD 1 output) + ~2*1/8 (downsample scratch)
         amplification = 1.0 + 1.0 / 8.0 + 2.0 / 8.0            # ~1.375x
 
+        # Sane output chunk shape: inherit s0's chunk shape, clamped to
+        # the [32, 256] range and rounded to a multiple of max_factor per
+        # axis. 32 is the lower bound for "not silly small"; 256 caps
+        # the file size to ~16 MiB at uint8.
+        SANE_CHUNK_MIN = 32
+        SANE_CHUNK_MAX = 256
+        src_chunk = np.asarray(seg_arr.chunk_shape, dtype=np.int64)
+        out_chunk_shape = []
+        for ax in range(3):
+            mf = int(max_factor[ax])
+            ch = int(src_chunk[ax])
+            ch = max(SANE_CHUNK_MIN, min(SANE_CHUNK_MAX, ch))
+            # round to multiple of mf (so super-chunk * mf = out chunk)
+            ch = max(mf, (ch // mf) * mf)
+            out_chunk_shape.append(ch)
+        out_chunk_arr = np.asarray(out_chunk_shape, dtype=np.int64)
+
         worker_mb = _estimate_block_target_mb_from_dask_config(
             "dask-config.yaml",
             fallback_mb=2048,
         )
 
-        def _size_super_chunk(thread_count):
-            """Choose out_chunk_shape so each of ``thread_count`` threads
-            can hold one super-chunk plus scratch within the per-worker
-            memory budget."""
-            per_thread_bytes = (
-                worker_mb * 1e6 / max(1, int(thread_count)) / amplification
-            )
-            max_super_voxels = int(per_thread_bytes / itemsize)
-            target_super_side = max(1, int(max_super_voxels ** (1.0 / 3.0)))
-            shape = []
-            for ax in range(3):
-                mf = int(max_factor[ax])
-                super_side = (target_super_side // mf) * mf
-                super_side = max(mf, super_side)
-                shape.append(max(1, super_side // mf))
-            return shape
+        def _size_super_chunk(per_worker_mb):
+            """Choose super_chunk_shape (in s0 voxels). Each worker
+            process holds at most one super-chunk + one LOD output + scratch
+            in memory at any time. ``per_worker_mb`` is per-process
+            budget (NOT per-thread / not divided by num_workers — each
+            dask worker is its own LSF process).
+
+            Returns the super_chunk_shape as a multiple of
+            ``out_chunk_shape * max_factor`` (so each task fills integer
+            output chunks at every LOD).
+            """
+            budget_bytes = max(1e6, per_worker_mb * 1e6 / amplification)
+            base_chunk_voxels = float(np.prod(out_chunk_arr * max_factor))
+            base_chunk_bytes = base_chunk_voxels * itemsize
+            multiplier = max(1, int(budget_bytes // max(1, base_chunk_bytes)))
+            multi_side = max(1, int(multiplier ** (1.0 / 3.0)))
+            return (out_chunk_arr * max_factor * multi_side).tolist()
 
         # Resolve ROI alignment up-front (we need the aligned out_shape to
         # build the output zarr arrays)
@@ -2258,29 +2283,72 @@ def _size_super_chunk(thread_count):
             pyramid_path, self.pyramid_alignment_mode,
         )
 
-        # OOM retry loop. On MemoryError from any worker, we halve the worker
-        # count (giving each worker 2x memory) and recompute a bigger
-        # super-chunk size.
+        # Load dask config once. We use it to compute per-worker memory
+        # budget (= job_memory / processes_per_job) and to feed into
+        # run_with_oom_retry.
+        try:
+            base_dask_config = dask_util._load_dask_config()
+        except (FileNotFoundError, OSError):
+            base_dask_config = None
+        cluster_type, settings = _jobqueue_settings(base_dask_config) if base_dask_config else (None, None)
+        base_processes = int((settings or {}).get("processes", 1) or 1)
+        job_memory = _job_memory_bytes(settings)
+        fallback_per_worker_mb = 2048
+
+        # Proactive sizing — mirror assembly's _plan_assembly_waves pattern.
+        # For pyramid we have one "estimate" (per-super-chunk peak memory)
+        # since all tasks are identical, so it reduces to: size the
+        # super-chunk at the base per-worker budget, then if the minimum
+        # possible super-chunk wouldn't fit, halve processes-per-job to
+        # give each worker more memory.
+        starting_processes = base_processes
+        if job_memory is not None:
+            tentative_super = _size_super_chunk(
+                (job_memory / base_processes) / 1e6
+            )
+            tentative_peak_bytes = int(
+                np.prod(tentative_super) * itemsize * amplification
+            )
+            starting_processes = _recommended_assembly_processes(
+                tentative_peak_bytes, job_memory, base_processes,
+                memory_fraction=0.60,
+            )
+            if starting_processes != base_processes:
+                logger.info(
+                    "pyramid_builder: minimum super-chunk peak %.1f MB > "
+                    "job_memory/base_processes; reducing processes/job %d → %d "
+                    "(2x memory per worker)",
+                    tentative_peak_bytes / 1e6,
+                    base_processes, starting_processes,
+                )
+
+        # OOM retry loop. On MemoryError, halve processes-per-job further
+        # (gives each worker another 2× memory) and rebuild the super-chunk.
         max_retries = int(getattr(self, "memory_retry_max", 3))
-        attempt_workers = max(1, int(self.num_workers))
+        attempt_processes = starting_processes
         for attempt in range(max_retries + 1):
-            out_chunk_shape = _size_super_chunk(attempt_workers)
-            super_voxels = int(np.prod(np.array(out_chunk_shape) * max_factor))
-            super_bytes = super_voxels * itemsize
+            # Per-worker memory: each LSF job hosts `processes` worker
+            # processes sharing the job's memory. Each worker holds ONE
+            # super-chunk in flight.
+            if job_memory is not None and attempt_processes > 0:
+                per_worker_mb = (job_memory / attempt_processes) / 1e6
+            else:
+                per_worker_mb = fallback_per_worker_mb
+
+            super_chunk_shape = _size_super_chunk(per_worker_mb)
+            super_chunk_arr = np.asarray(super_chunk_shape, dtype=np.int64)
+            super_bytes = int(np.prod(super_chunk_arr)) * itemsize
             logger.info(
-                "pyramid_builder: %sattempt %d: dask cluster workers=%d, "
-                "per-worker budget %d MB, dtype itemsize %d → "
-                "out_chunk_shape=%s, super_chunk_shape=%s "
+                "pyramid_builder: %sattempt %d: per-worker budget %.0f MB, "
+                "processes/job=%d → out_chunk_shape=%s, super_chunk_shape=%s "
                 "(%.1f MB s0 read per task)",
                 "" if attempt == 0 else "retry ",
-                attempt + 1, attempt_workers, worker_mb, itemsize,
-                out_chunk_shape,
-                (np.array(out_chunk_shape) * max_factor).tolist(),
-                super_bytes / 1e6,
+                attempt + 1, per_worker_mb, attempt_processes,
+                out_chunk_shape, super_chunk_shape, super_bytes / 1e6,
             )
 
             # Driver-side: write OME metadata + create empty output arrays
-            super_chunk_shape, _ = prepare_pyramid_metadata_and_arrays(
+            prepare_pyramid_metadata_and_arrays(
                 output_zarr_path=pyramid_path,
                 factors_per_lod=factors_per_lod_full,
                 missing_lods=[k for k, _ in needed_uniform],
@@ -2292,21 +2360,20 @@ def _size_super_chunk(thread_count):
                 s0_source_path=self._dataset_path,
             )
 
-            # Build the super-chunk grid (in s0 voxels)
+            # Super-chunk grid (in s0 voxels)
             sc_grid = []
             for z0 in range(int(aligned_out_origin[0]),
                             int(aligned_out_origin[0] + aligned_out_shape[0]),
-                            int(super_chunk_shape[0])):
+                            int(super_chunk_arr[0])):
                 for y0 in range(int(aligned_out_origin[1]),
                                 int(aligned_out_origin[1] + aligned_out_shape[1]),
-                                int(super_chunk_shape[1])):
+                                int(super_chunk_arr[1])):
                     for x0 in range(int(aligned_out_origin[2]),
                                     int(aligned_out_origin[2] + aligned_out_shape[2]),
-                                    int(super_chunk_shape[2])):
+                                    int(super_chunk_arr[2])):
                         sc_grid.append((z0, y0, x0))
             n_chunks = len(sc_grid)
 
-            # Worker config: must be picklable (no closures, no opened handles)
             worker_config = {
                 "s0_dataset_path": dataset_path,
                 "dataset_offset": tuple(int(v) for v in seg_arr.roi.offset),
@@ -2317,83 +2384,43 @@ def _size_super_chunk(thread_count):
                 "missing_lods": [k for k, _ in needed_uniform],
                 "downsample_method": self.downsample_method,
                 "pyramid_path": pyramid_path,
-                "super_chunk_shape": tuple(int(v) for v in super_chunk_shape.tolist()),
+                "super_chunk_shape": tuple(int(v) for v in super_chunk_arr.tolist()),
                 "out_origin": tuple(int(v) for v in aligned_out_origin.tolist()),
             }
 
-            # Smart dask sizing — match the assembly pipeline's pattern:
-            # estimate per-task peak memory, recommend processes-per-LSF-job
-            # so each worker has enough memory for one super-chunk, build the
-            # adjusted dask config up-front, then let run_with_oom_retry
-            # halve further on actual OOM.
-            #
-            # Tolerant of missing dask-config.yaml: tests (and runs that
-            # don't use LSF) can dispatch via a LocalCluster without a
-            # jobqueue config — in that case run_with_oom_retry gets
-            # config=None and starts a LocalCluster.
-            adjusted_dask_config = None
-            workers_with_adjusted = attempt_workers
-            try:
-                base_dask_config = dask_util._load_dask_config()
-            except (FileNotFoundError, OSError):
-                base_dask_config = None
-            if base_dask_config:
-                cluster_type, settings = _jobqueue_settings(base_dask_config)
-                base_processes = int((settings or {}).get("processes", 1) or 1)
-                job_memory = _job_memory_bytes(settings)
-                per_task_peak_bytes = int(super_bytes * amplification)
-                recommended_processes = _recommended_assembly_processes(
-                    per_task_peak_bytes, job_memory, base_processes,
-                    memory_fraction=0.60,
-                )
+            # Build adjusted dask config when processes/job has changed
+            if base_dask_config and attempt_processes != base_processes:
                 adjusted_dask_config = _assembly_config_for_processes(
-                    base_dask_config, cluster_type, recommended_processes,
+                    base_dask_config, cluster_type, attempt_processes,
                 )
-                if recommended_processes < base_processes:
-                    # Each LSF job now hosts fewer processes (= workers), so
-                    # we need MORE jobs to keep the requested total worker
-                    # count. dask-jobqueue spins one LSF job per "worker" in
-                    # the user's request, with `processes` workers in each
-                    # job. Increase the requested-job count proportionally.
-                    workers_with_adjusted = max(
-                        1,
-                        attempt_workers * base_processes // recommended_processes,
-                    )
-                if recommended_processes != base_processes:
-                    logger.info(
-                        "pyramid_builder: super-chunk peak %.1f MB per task vs "
-                        "job memory %.1f MB / %d base processes = %.1f MB each → "
-                        "reducing processes/job to %d (2x memory per worker)",
-                        per_task_peak_bytes / 1e6,
-                        (job_memory or 0) / 1e6, base_processes,
-                        (job_memory or 0) / max(1, base_processes) / 1e6,
-                        recommended_processes,
-                    )
+                # When we shrink processes-per-job, we need MORE jobs to
+                # keep the requested total worker count.
+                workers_with_adjusted = max(
+                    1,
+                    int(self.num_workers) * base_processes // attempt_processes,
+                )
+            else:
+                adjusted_dask_config = base_dask_config
+                workers_with_adjusted = max(1, int(self.num_workers))
 
-            # Diagnostic: show the driver process's actual file-descriptor
-            # limit. The earlier crash at ~576 workers with EMFILE was
-            # surprising given the shell's ulimit -n = 1M; if we see a
-            # different number here, that's the root cause.
+            # Raise the FD soft limit to the hard cap so dask's
+            # scheduler↔worker sockets don't trip EMFILE at high worker
+            # counts. Silent: log only on actual failure.
             try:
                 import resource as _resource
                 soft, hard = _resource.getrlimit(_resource.RLIMIT_NOFILE)
                 if hard > soft:
                     _resource.setrlimit(_resource.RLIMIT_NOFILE, (hard, hard))
-                    logger.info(
-                        "pyramid_builder: FD soft limit raised %d → %d "
-                        "(hard cap)", soft, hard,
-                    )
-                else:
-                    logger.info(
-                        "pyramid_builder: FD soft limit %d, hard %d (no raise needed)",
-                        soft, hard,
-                    )
             except (ImportError, ValueError, OSError) as _e:
                 logger.warning("pyramid_builder: could not adjust FD limit: %s", _e)
 
             effective_workers = dask_util.effective_num_workers(
                 workers_with_adjusted, n_chunks, logger, "pyramid build",
             )
+            logger.info(
+                "pyramid_builder: dispatching %d super-chunks across %d "
+                "dask workers", n_chunks, effective_workers,
+            )
 
             def _run(workers, dask_config):
                 import dask.bag as db
@@ -2418,18 +2445,18 @@ def _run(workers, dask_config):
                 )
                 break
             except MemoryError:
-                if attempt_workers == 1 or attempt >= max_retries:
+                if attempt_processes == 1 or attempt >= max_retries:
                     raise
-                new_workers = max(1, attempt_workers // 2)
+                new_processes = max(1, attempt_processes // 2)
                 logger.warning(
-                    "pyramid_builder: MemoryError on attempt %d with %d workers. "
-                    "Halving to %d workers and retrying with a larger per-worker "
-                    "memory budget.",
-                    attempt + 1, attempt_workers, new_workers,
+                    "pyramid_builder: MemoryError on attempt %d with "
+                    "processes/job=%d. Halving processes/job to %d (2x memory "
+                    "per worker) and retrying.",
+                    attempt + 1, attempt_processes, new_processes,
                 )
-                attempt_workers = new_workers
+                attempt_processes = new_processes
                 if os.path.isdir(pyramid_path):
-                    _safely_remove_pyramid(pyramid_path, num_workers=attempt_workers)
+                    _safely_remove_pyramid(pyramid_path, num_workers=max(1, int(self.num_workers)))
 
         # Return mapping of newly-built factors -> path within the pyramid
         return {

From adce6612404bb1a35514c1284eb0dbb5fed98c03 Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Fri, 12 Jun 2026 17:34:45 -0400
Subject: [PATCH 13/19] Fix super-chunk sizing: minimum size, not
 memory-filling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previous logic grew super-chunks to fill per-worker memory, which on
small volumes (c-elegans cell, 1.5 G voxels) collapsed the task count
to 1 — single worker for all -n 576, no parallelism.

Now super_chunk_shape is FIXED at out_chunk * max_factor (the minimum
that respects alignment). For c-elegans cell that's [1024,1024,1280] =
1.34 GB per task. With processes=12 / 180 GB job memory = 15 GB per
worker, peak per task is 1.85 GB ≪ 9 GB (60% memory_fraction). 8
super-chunks total → 8 effective workers. Volumes too small to use the
full -n stay parallelism-limited; that's expected and the rest of -n
just doesn't get scaled up (dask only requests what it needs).

The memory tuning is now ONLY via processes/job: if the minimum
super-chunk wouldn't fit in (job_memory / base_processes), halve
processes/job to give each worker more memory. Same as assembly's
_recommended_assembly_processes pattern, but for a single uniform
task estimate.

Removed the cuberoot-multiplier sizing that was growing super-chunks
into one giant single-task on small volumes.
---
 src/mesh_n_bone/meshify/meshify.py | 77 +++++++++++-------------------
 1 file changed, 29 insertions(+), 48 deletions(-)

diff --git a/src/mesh_n_bone/meshify/meshify.py b/src/mesh_n_bone/meshify/meshify.py
index 3269c3f..bad07cf 100644
--- a/src/mesh_n_bone/meshify/meshify.py
+++ b/src/mesh_n_bone/meshify/meshify.py
@@ -2230,23 +2230,21 @@ def _build_missing_pyramid_scales(self, num_lods):
             fallback_mb=2048,
         )
 
-        def _size_super_chunk(per_worker_mb):
-            """Choose super_chunk_shape (in s0 voxels). Each worker
-            process holds at most one super-chunk + one LOD output + scratch
-            in memory at any time. ``per_worker_mb`` is per-process
-            budget (NOT per-thread / not divided by num_workers — each
-            dask worker is its own LSF process).
-
-            Returns the super_chunk_shape as a multiple of
-            ``out_chunk_shape * max_factor`` (so each task fills integer
-            output chunks at every LOD).
-            """
-            budget_bytes = max(1e6, per_worker_mb * 1e6 / amplification)
-            base_chunk_voxels = float(np.prod(out_chunk_arr * max_factor))
-            base_chunk_bytes = base_chunk_voxels * itemsize
-            multiplier = max(1, int(budget_bytes // max(1, base_chunk_bytes)))
-            multi_side = max(1, int(multiplier ** (1.0 / 3.0)))
-            return (out_chunk_arr * max_factor * multi_side).tolist()
+        # super_chunk_shape is FIXED at out_chunk * max_factor — the smallest
+        # super-chunk that fills exactly one output chunk per LOD. Growing
+        # super-chunks past this point reduces task count (= parallelism)
+        # without any real benefit for label data (mode downsamples are
+        # CPU-bounded, not I/O-amortization-bounded). Memory tuning is done
+        # by adjusting processes-per-LSF-job (= memory per worker), not by
+        # growing super-chunks.
+        super_chunk_shape = (out_chunk_arr * max_factor).tolist()
+        super_chunk_arr = np.asarray(super_chunk_shape, dtype=np.int64)
+        super_bytes = int(np.prod(super_chunk_arr)) * itemsize
+
+        def _per_worker_mb_for(processes):
+            if job_memory is not None and processes > 0:
+                return (job_memory / processes) / 1e6
+            return fallback_per_worker_mb
 
         # Resolve ROI alignment up-front (we need the aligned out_shape to
         # build the output zarr arrays)
@@ -2284,8 +2282,7 @@ def _size_super_chunk(per_worker_mb):
         )
 
         # Load dask config once. We use it to compute per-worker memory
-        # budget (= job_memory / processes_per_job) and to feed into
-        # run_with_oom_retry.
+        # budget (= job_memory / processes_per_job).
         try:
             base_dask_config = dask_util._load_dask_config()
         except (FileNotFoundError, OSError):
@@ -2296,48 +2293,32 @@ def _size_super_chunk(per_worker_mb):
         fallback_per_worker_mb = 2048
 
         # Proactive sizing — mirror assembly's _plan_assembly_waves pattern.
-        # For pyramid we have one "estimate" (per-super-chunk peak memory)
-        # since all tasks are identical, so it reduces to: size the
-        # super-chunk at the base per-worker budget, then if the minimum
-        # possible super-chunk wouldn't fit, halve processes-per-job to
-        # give each worker more memory.
+        # With super_chunk_shape fixed at the minimum, this reduces to:
+        # if super_chunk + scratch wouldn't fit per-worker at base_processes,
+        # halve processes-per-job upfront.
+        peak_bytes = int(super_bytes * amplification)
         starting_processes = base_processes
         if job_memory is not None:
-            tentative_super = _size_super_chunk(
-                (job_memory / base_processes) / 1e6
-            )
-            tentative_peak_bytes = int(
-                np.prod(tentative_super) * itemsize * amplification
-            )
             starting_processes = _recommended_assembly_processes(
-                tentative_peak_bytes, job_memory, base_processes,
+                peak_bytes, job_memory, base_processes,
                 memory_fraction=0.60,
             )
             if starting_processes != base_processes:
                 logger.info(
-                    "pyramid_builder: minimum super-chunk peak %.1f MB > "
-                    "job_memory/base_processes; reducing processes/job %d → %d "
-                    "(2x memory per worker)",
-                    tentative_peak_bytes / 1e6,
+                    "pyramid_builder: super-chunk peak %.1f MB > 60%% of "
+                    "(job_memory %.1f GB / %d procs = %.1f GB per worker); "
+                    "reducing processes/job %d → %d (more memory per worker)",
+                    peak_bytes / 1e6, job_memory / 1e9, base_processes,
+                    job_memory / base_processes / 1e9,
                     base_processes, starting_processes,
                 )
 
-        # OOM retry loop. On MemoryError, halve processes-per-job further
-        # (gives each worker another 2× memory) and rebuild the super-chunk.
+        # OOM retry loop. On MemoryError, halve processes-per-job (gives
+        # each worker another 2× memory).
         max_retries = int(getattr(self, "memory_retry_max", 3))
         attempt_processes = starting_processes
         for attempt in range(max_retries + 1):
-            # Per-worker memory: each LSF job hosts `processes` worker
-            # processes sharing the job's memory. Each worker holds ONE
-            # super-chunk in flight.
-            if job_memory is not None and attempt_processes > 0:
-                per_worker_mb = (job_memory / attempt_processes) / 1e6
-            else:
-                per_worker_mb = fallback_per_worker_mb
-
-            super_chunk_shape = _size_super_chunk(per_worker_mb)
-            super_chunk_arr = np.asarray(super_chunk_shape, dtype=np.int64)
-            super_bytes = int(np.prod(super_chunk_arr)) * itemsize
+            per_worker_mb = _per_worker_mb_for(attempt_processes)
             logger.info(
                 "pyramid_builder: %sattempt %d: per-worker budget %.0f MB, "
                 "processes/job=%d → out_chunk_shape=%s, super_chunk_shape=%s "

From bd485c6867ae5a1b5f526488f4928e71eda0e201 Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Mon, 15 Jun 2026 15:00:06 -0400
Subject: [PATCH 14/19] Fix LOD-0 fragment drops when intermediate LODs are
 empty
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The rewrite_index_with_empty_fragments reachability pass was top-down
only — descending from REAL top-LOD positions through REAL parents.
When two LODs are independently meshed (downsample multires strategy),
mode-downsampling can drop a thin edge at LOD k+1 that still exists at
LOD k. The real LOD-k leaf would then be classified as an orphan (no
real parent chain) and dropped from the manifest entirely, producing a
chunk-shaped hole at the finest LOD in NG.

Add a bottom-up ancestor walk from every REAL position. Reachability
is now the union of both passes, so real leaves stay listed; empty
intermediates are written as 0-byte placeholders and NG's existing
fall-back-to-parent rule lets it traverse past them down to the real
leaf.

Concrete case: seg 5363 in jrc_P3_E5_D1_N2 mito had real LOD-0
fragments (3,3,4) and (3,4,4) whose LOD-1 parents (1,1,2) and (1,2,2)
were empty after mode-downsampling. Before this fix those LOD-0
fragments were dropped from the manifest; after, they're listed and
the rendered hole goes away.
---
 src/mesh_n_bone/util/mesh_io.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/src/mesh_n_bone/util/mesh_io.py b/src/mesh_n_bone/util/mesh_io.py
index 7da3600..91e0a97 100644
--- a/src/mesh_n_bone/util/mesh_io.py
+++ b/src/mesh_n_bone/util/mesh_io.py
@@ -283,8 +283,20 @@ def rewrite_index_with_empty_fragments(path, current_lod_fragments):
         set(map(tuple, p[s > 0].tolist())) for (p, s) in existing_per_lod
     ]
 
-    # Descent: reachable positions are sub-octants of REAL reachable
-    # positions one level up. The root is reachable by definition.
+    # Reachability is the union of two passes:
+    #   1. Top-down descent through REAL parents, which adds empty
+    #      placeholders under each real parent so NG's octree carve-out
+    #      stays complete (this was the only pass before).
+    #   2. Bottom-up ancestor walk from every REAL position, which
+    #      keeps real leaves listed even when an intermediate LOD is
+    #      empty in their cell. Independently-meshed LODs disagree:
+    #      mode-downsampling can drop a thin edge at LOD k+1 that still
+    #      exists at LOD k. Without the bottom-up pass, the real LOD-k
+    #      leaf is dropped as an "orphan" (no real parent chain) and
+    #      NG can't reach it — the user sees a chunk-shaped hole at
+    #      the finest LOD. With it, the empty intermediates are listed
+    #      as 0-byte placeholders and NG's fall-back-to-parent rule
+    #      lets it traverse down to the real leaf.
     reachable = [set() for _ in range(num_lods)]
     if real_pos_per_lod[top]:
         reachable[top] = real_pos_per_lod[top].copy()
@@ -294,6 +306,13 @@ def rewrite_index_with_empty_fragments(path, current_lod_fragments):
                     for b in (0, 1):
                         for c in (0, 1):
                             reachable[k].add((2 * X + a, 2 * Y + b, 2 * Z + c))
+    for k in range(num_lods):
+        for (X, Y, Z) in real_pos_per_lod[k]:
+            reachable[k].add((X, Y, Z))
+            x, y, z = X, Y, Z
+            for kk in range(k + 1, num_lods):
+                x, y, z = x // 2, y // 2, z // 2
+                reachable[kk].add((x, y, z))
 
     # Walk the existing data blob LOD-by-LOD in its current listed
     # order, extracting bytes for reachable real fragments. Drop bytes

From 6bee6b164d23ad2edc8e15278058768132d3cd3d Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Mon, 15 Jun 2026 15:00:15 -0400
Subject: [PATCH 15/19] Add keep_chunked_meshes flag + reuse existing pyramid

keep_chunked_meshes (default False) preserves the per-block PLY tree
(tmp_chunked_s{lod}/{mesh_id}/block_*.ply) plus the per-segment
assembled meshes after a run, so debugging a specific mesh can compare
raw MC output against the post-decimation per-LOD versions. Gates all
the cleanup rmtrees in _assemble_mesh, assemble_meshes, and
_generate_meshes_at_scale.

_existing_pyramid_scales reads OME-NGFF metadata at the auto-build
output path and returns the per-LOD downsample factor for each scale
whose .zarray exists on disk. _build_missing_pyramid_scales now checks
this first and reuses the existing pyramid when it covers all the
needed factors, instead of unconditionally rebuilding from s0. Cuts
4-5 minutes off debug re-runs in the same output dir.
---
 src/mesh_n_bone/meshify/meshify.py | 108 ++++++++++++++++++++++++++---
 1 file changed, 100 insertions(+), 8 deletions(-)

diff --git a/src/mesh_n_bone/meshify/meshify.py b/src/mesh_n_bone/meshify/meshify.py
index bad07cf..bac1eb6 100644
--- a/src/mesh_n_bone/meshify/meshify.py
+++ b/src/mesh_n_bone/meshify/meshify.py
@@ -461,6 +461,65 @@ def _write_assembly_memory_plan(output_directory, estimates, waves):
         json.dump(plan, f, indent=2)
 
 
+def _existing_pyramid_scales(pyramid_path):
+    """Return ``{lod_index: uniform_factor}`` of usable scales on disk, or
+    ``None`` if the pyramid isn't readable.
+
+    Parses the OME-NGFF v0.4 multiscales metadata at
+    ``{pyramid_path}/.zattrs`` and returns the per-LOD uniform downsample
+    factor (relative to s0) for each dataset whose corresponding
+    ``s_k/.zarray`` actually exists on disk. Used to skip a fresh build
+    when a prior run already wrote the pyramid (e.g. debug re-runs).
+
+    Returns ``None`` when the file is missing, malformed, or doesn't
+    expose multiscales in the expected shape.
+    """
+    zattrs_path = os.path.join(pyramid_path, ".zattrs")
+    if not os.path.isfile(zattrs_path):
+        return None
+    try:
+        with open(zattrs_path) as f:
+            attrs = json.load(f)
+        datasets = attrs["multiscales"][0]["datasets"]
+        # s0's scale per-axis defines the base; s_k's factor = s_k/s_0
+        s0_scale = None
+        for ds in datasets:
+            if ds.get("path") == "s0":
+                for tr in ds.get("coordinateTransformations", []):
+                    if tr.get("type") == "scale":
+                        s0_scale = tr["scale"]
+                        break
+                break
+        if s0_scale is None:
+            return None
+        result = {}
+        for ds in datasets:
+            name = ds.get("path", "")
+            if not name.startswith("s") or not name[1:].isdigit():
+                continue
+            k = int(name[1:])
+            if k == 0:
+                continue
+            scale = None
+            for tr in ds.get("coordinateTransformations", []):
+                if tr.get("type") == "scale":
+                    scale = tr["scale"]
+                    break
+            if scale is None:
+                continue
+            factors = [scale[i] / s0_scale[i] for i in range(3)]
+            if not (factors[0] == factors[1] == factors[2]):
+                continue
+            factor = int(round(factors[0]))
+            # Verify the underlying zarr array actually exists
+            if not os.path.isfile(os.path.join(pyramid_path, name, ".zarray")):
+                continue
+            result[k] = factor
+        return result
+    except (KeyError, IndexError, ValueError, json.JSONDecodeError, OSError):
+        return None
+
+
 def _safely_remove_pyramid(pyramid_path, num_workers=8):
     """Delete the auto-built pyramid zarr with chunk-level parallelism.
 
@@ -989,6 +1048,7 @@ def __init__(
         delete_unsharded_files: bool = True,
         smooth_before_simplify: bool = True,
         target_ids: int | list[int] | None = None,
+        keep_chunked_meshes: bool = False,
     ):
         filename, dataset_name = split_dataset_path(input_path)
         self.segmentation_array = open_dataset(filename, dataset_name)
@@ -1160,6 +1220,7 @@ def __init__(
         self.delete_unsharded_files = delete_unsharded_files
         self.smooth_before_simplify = smooth_before_simplify
         self.target_ids = _normalize_target_ids(target_ids)
+        self.keep_chunked_meshes = keep_chunked_meshes
         self.coordinate_units = coordinate_units
         self.retry_on_oom = retry_on_oom
         self.memory_retry_max = memory_retry_max
@@ -1661,7 +1722,8 @@ def _assemble_mesh(self, mesh_id):
                     f"{len(mesh_files)}>{self.max_num_blocks}. Skipping.\n"
                 )
                 f.write(", ".join(mesh_files))
-            shutil.rmtree(f"{self.dirname}/{mesh_id}")
+            if not self.keep_chunked_meshes:
+                shutil.rmtree(f"{self.dirname}/{mesh_id}")
             return
 
         block_meshes = []
@@ -1687,7 +1749,8 @@ def _assemble_mesh(self, mesh_id):
                     "fixed-edge simplification).",
                     mesh_id, len(mesh_files),
                 )
-                shutil.rmtree(f"{self.dirname}/{mesh_id}", ignore_errors=True)
+                if not self.keep_chunked_meshes:
+                    shutil.rmtree(f"{self.dirname}/{mesh_id}", ignore_errors=True)
                 return
             chunk_size = (
                 self.read_write_block_shape_pixels * self.base_voxel_size_funlib
@@ -1726,7 +1789,8 @@ def _assemble_mesh(self, mesh_id):
                 "zero vertices.",
                 mesh_id,
             )
-            shutil.rmtree(f"{self.dirname}/{mesh_id}", ignore_errors=True)
+            if not self.keep_chunked_meshes:
+                shutil.rmtree(f"{self.dirname}/{mesh_id}", ignore_errors=True)
             return
 
         if check_validity or self.has_custom_roi:
@@ -1828,7 +1892,8 @@ def _assemble_mesh(self, mesh_id):
             )
         else:
             _ = mesh.export(f"{self.output_directory}/meshes/{mesh_id}.ply")
-        shutil.rmtree(f"{self.dirname}/{mesh_id}")
+        if not self.keep_chunked_meshes:
+            shutil.rmtree(f"{self.dirname}/{mesh_id}")
 
     def _assemble_mesh_batch(self, mesh_ids):
         """Assemble a memory-balanced batch of segment ids sequentially."""
@@ -1930,7 +1995,8 @@ def _run(workers, config, wave=wave, phase_name=phase_name):
             write_ngmesh_metadata(f"{self.output_directory}/meshes")
         elif self.do_singleres_multires_neuroglancer:
             write_singleres_multires_metadata(f"{self.output_directory}/meshes")
-        shutil.rmtree(dirname)
+        if not self.keep_chunked_meshes:
+            shutil.rmtree(dirname)
 
     def _generate_meshes_at_scale(self, output_mesh_dir, downsample_factor=None,
                                    target_reduction_override=None,
@@ -2016,11 +2082,21 @@ def _generate_meshes_at_scale(self, output_mesh_dir, downsample_factor=None,
                 self.target_reduction = target_reduction_override
 
             os.makedirs(self.output_directory, exist_ok=True)
-            tmp_chunked_dir = self.output_directory + "/tmp_chunked"
+            scale_tag = (
+                f"s{int(np.log2(self.downsample_factor))}"
+                if self.downsample_factor
+                else "s0"
+            )
+            tmp_chunked_dir = (
+                self.output_directory
+                + ("/tmp_chunked_" + scale_tag if self.keep_chunked_meshes
+                   else "/tmp_chunked")
+            )
             os.makedirs(tmp_chunked_dir, exist_ok=True)
             self.get_chunked_meshes(tmp_chunked_dir)
             self.assemble_meshes(tmp_chunked_dir)
-            shutil.rmtree(tmp_chunked_dir, ignore_errors=True)
+            if not self.keep_chunked_meshes:
+                shutil.rmtree(tmp_chunked_dir, ignore_errors=True)
         finally:
             self.output_directory = orig_output
             self.downsample_factor = orig_downsample
@@ -2153,6 +2229,21 @@ def _build_missing_pyramid_scales(self, num_lods):
             self.output_directory, "_intermediate_scales.zarr",
         )
 
+        # If the pyramid already exists on disk with all the needed scales,
+        # skip the rebuild and reuse what's there. Useful when re-running a
+        # job in the same output dir (e.g. for debugging) so we don't redo
+        # the (potentially slow) downsample pass each time.
+        existing_pyramid = _existing_pyramid_scales(pyramid_path)
+        if existing_pyramid is not None:
+            available = {f: f"{pyramid_path}/s{k}" for k, f in existing_pyramid.items()}
+            need_factors = {int(factor[0]) for _, factor in needed_uniform}
+            if need_factors.issubset(set(available.keys())):
+                logger.info(
+                    "pyramid_builder: reusing existing %s (scales %s)",
+                    pyramid_path, sorted(need_factors),
+                )
+                return {f: available[f] for f in need_factors}
+
         # ROI in voxel coordinates relative to the dataset's own (0, 0, 0)
         seg_arr = self.segmentation_array
         ds_offset = np.asarray(seg_arr.roi.offset, dtype=np.int64)
@@ -2801,7 +2892,8 @@ def get_meshes(self):
         os.makedirs(tmp_chunked_dir, exist_ok=True)
         self.get_chunked_meshes(tmp_chunked_dir)
         self.assemble_meshes(tmp_chunked_dir)
-        shutil.rmtree(tmp_chunked_dir, ignore_errors=True)
+        if not self.keep_chunked_meshes:
+            shutil.rmtree(tmp_chunked_dir, ignore_errors=True)
 
         if self.do_analysis:
             from mesh_n_bone.analyze.analyze import AnalyzeMeshes

From ef6aee78161be627870fd2485050d3334c750b5d Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Tue, 16 Jun 2026 14:54:30 -0400
Subject: [PATCH 16/19] Fix pyramid build OOM on large source chunks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Workers were OOM-killed even at processes/job=1 (180 GB/worker) on
bw-1 mito. The actual per-task peak grew with each of the 8 super-chunks
a worker processes before exit. Four root causes, four fixes:

1. SANE_CHUNK_MAX 256 → 128. The src zarr chunk shape inherits into
   out_chunk_shape, which multiplies by max_factor=8 for super_chunk.
   bw-1's 224³ source chunks produced a 1792³ super-chunk (11.5 GB at
   uint16). Capping at 128 keeps super_chunk_shape ≤ 1024³ (~2 GB at
   uint16) regardless of input chunking, and triples parallelism for
   large volumes (27 tasks vs 8).

2. amplification 1.375x → 2.5x. The original estimate accounted only
   for s0 + s1/s2/s3 output buffers, not for tensorstore decode/encode
   caches and glibc fragmentation across sequential super-chunks. The
   underestimate kept the proactive process-per-job reduction from
   triggering until OOMs forced the post-hoc retry loop.

3. Disable tensorstore cache_pool (total_bytes_limit=0) in the capped
   context spec. Decoded-chunk cache is unbounded by default and lives
   for the worker's lifetime, since _PYRAMID_WORKER_TS_CACHE holds the
   input + 3 output handles across all tasks. We don't benefit from
   caching here (each super-chunk reads/writes a non-overlapping
   region). Plumb the capped context into _ts_handle_for_output too.

4. Explicit `del s0_block` + gc.collect() + libc malloc_trim() at the
   end of process_super_chunk_for_dask. Free-but-mapped pages don't
   return to OS without an explicit trim — they compound across the
   8+ sequential tasks each worker handles.

Together these address the cross-task accumulation that no single
estimate-based retry can catch.
---
 src/mesh_n_bone/meshify/meshify.py           | 21 ++++++++++++++------
 src/mesh_n_bone/util/image_data_interface.py |  7 +++++++
 src/mesh_n_bone/util/pyramid_builder.py      | 21 ++++++++++++++++++++
 3 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/src/mesh_n_bone/meshify/meshify.py b/src/mesh_n_bone/meshify/meshify.py
index bac1eb6..dc2a2da 100644
--- a/src/mesh_n_bone/meshify/meshify.py
+++ b/src/mesh_n_bone/meshify/meshify.py
@@ -2296,15 +2296,24 @@ def _build_missing_pyramid_scales(self, num_lods):
         #                     per-worker memory budget.
         max_factor = np.max(np.array([f for _, f in needed_uniform]), axis=0)
         itemsize = int(np.dtype(seg_arr.data.dtype).itemsize)
-        # 1.0 (s0 read) + 1/8 (LOD 1 output) + ~2*1/8 (downsample scratch)
-        amplification = 1.0 + 1.0 / 8.0 + 2.0 / 8.0            # ~1.375x
+        # 1.0 (s0 read) + 1/8 (LOD 1 output) + ~1/8 (downsample scratch)
+        # plus ~1x slack for tensorstore decode/encode buffers, output-handle
+        # cache_pool retention across cascaded LODs (s1, s2, s3), and
+        # glibc malloc fragmentation across the 8+ sequential super-chunks
+        # a worker processes before exit. Underestimating this was the root
+        # cause of OOMs on bw-1 mito even at processes=1 / 180 GB/worker.
+        amplification = 2.5
 
         # Sane output chunk shape: inherit s0's chunk shape, clamped to
-        # the [32, 256] range and rounded to a multiple of max_factor per
-        # axis. 32 is the lower bound for "not silly small"; 256 caps
-        # the file size to ~16 MiB at uint8.
+        # the [32, 128] range and rounded to a multiple of max_factor per
+        # axis. The upper bound caps super_chunk_shape (= out_chunk *
+        # max_factor) at 128*8 = 1024 voxels per axis, i.e. ~1-2 GB per
+        # task. Higher than 128 here can produce a 1792³ super-chunk
+        # (11.5 GB at uint16) that OOMs workers, since the realistic
+        # amplification (above) plus tensorstore caches puts the per-task
+        # peak well over the 30 GB/worker budget at default processes=6.
         SANE_CHUNK_MIN = 32
-        SANE_CHUNK_MAX = 256
+        SANE_CHUNK_MAX = 128
         src_chunk = np.asarray(seg_arr.chunk_shape, dtype=np.int64)
         out_chunk_shape = []
         for ax in range(3):
diff --git a/src/mesh_n_bone/util/image_data_interface.py b/src/mesh_n_bone/util/image_data_interface.py
index d4fc31c..a4d310e 100644
--- a/src/mesh_n_bone/util/image_data_interface.py
+++ b/src/mesh_n_bone/util/image_data_interface.py
@@ -83,6 +83,13 @@ def _capped_tensorstore_context_spec():
         "gcs_request_concurrency": {"limit": 1},
         "s3_request_concurrency": {"limit": 1},
         "http_request_concurrency": {"limit": 1},
+        # Disable tensorstore's decoded-chunk cache. Default is unbounded,
+        # so for long-lived worker processes (e.g. dask LSF workers that
+        # cache opened handles in _PYRAMID_WORKER_TS_CACHE across multiple
+        # super-chunks), the cache accumulates decoded chunks until the
+        # worker is OOM-killed. We don't benefit from caching anyway —
+        # each super-chunk reads/writes a non-overlapping region.
+        "cache_pool": {"total_bytes_limit": 0},
     }
 
 
diff --git a/src/mesh_n_bone/util/pyramid_builder.py b/src/mesh_n_bone/util/pyramid_builder.py
index c3e412b..3af5bd8 100644
--- a/src/mesh_n_bone/util/pyramid_builder.py
+++ b/src/mesh_n_bone/util/pyramid_builder.py
@@ -276,10 +276,17 @@ def _ts_handle_for_output(path):
     if path in _PYRAMID_WORKER_TS_CACHE:
         return _PYRAMID_WORKER_TS_CACHE[path]
     import tensorstore as ts
+    from mesh_n_bone.util.image_data_interface import (
+        _capped_tensorstore_context_spec,
+    )
     handle = ts.open({
         "driver": "zarr",
         "kvstore": {"driver": "file", "path": path},
         "open": True,
+        # Cap thread pools + disable decoded-chunk cache. Without this
+        # the output handle's cache_pool accumulates encoded chunks for
+        # the worker's lifetime, OOMing on large super-chunk runs.
+        "context": _capped_tensorstore_context_spec(),
     }).result()
     _PYRAMID_WORKER_TS_CACHE[path] = handle
     return handle
@@ -360,6 +367,20 @@ def process_super_chunk_for_dask(sc_origin_tuple, worker_config):
         ).result()
         del ds_block
 
+    # Release the big s0 read buffer + force allocator release. Without this
+    # the worker accumulates s0 read residue across the 8+ super-chunks it
+    # processes before exit (cached tensorstore handles in
+    # _PYRAMID_WORKER_TS_CACHE don't release on their own, and glibc malloc
+    # doesn't munmap freed pages without an explicit trim).
+    del s0_block
+    import gc
+    gc.collect()
+    try:
+        import ctypes
+        ctypes.CDLL("libc.so.6").malloc_trim(0)
+    except (OSError, AttributeError):
+        pass
+
     return None
 
 

From 1356760cec30f6dd42b7ecc5e7fb2711c1daaa2b Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Tue, 16 Jun 2026 15:04:02 -0400
Subject: [PATCH 17/19] Use getattr for keep_chunked_meshes gates

Tests instantiate Meshify via object.__new__ + manual attribute
assignment, bypassing __init__. Direct self.keep_chunked_meshes access
in the rmtree gates raised AttributeError there. Switch to
getattr(self, "keep_chunked_meshes", False) so tests don't need to set
the attribute explicitly.
---
 src/mesh_n_bone/meshify/meshify.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/mesh_n_bone/meshify/meshify.py b/src/mesh_n_bone/meshify/meshify.py
index dc2a2da..e4040c8 100644
--- a/src/mesh_n_bone/meshify/meshify.py
+++ b/src/mesh_n_bone/meshify/meshify.py
@@ -1722,7 +1722,7 @@ def _assemble_mesh(self, mesh_id):
                     f"{len(mesh_files)}>{self.max_num_blocks}. Skipping.\n"
                 )
                 f.write(", ".join(mesh_files))
-            if not self.keep_chunked_meshes:
+            if not getattr(self, "keep_chunked_meshes", False):
                 shutil.rmtree(f"{self.dirname}/{mesh_id}")
             return
 
@@ -1749,7 +1749,7 @@ def _assemble_mesh(self, mesh_id):
                     "fixed-edge simplification).",
                     mesh_id, len(mesh_files),
                 )
-                if not self.keep_chunked_meshes:
+                if not getattr(self, "keep_chunked_meshes", False):
                     shutil.rmtree(f"{self.dirname}/{mesh_id}", ignore_errors=True)
                 return
             chunk_size = (
@@ -1789,7 +1789,7 @@ def _assemble_mesh(self, mesh_id):
                 "zero vertices.",
                 mesh_id,
             )
-            if not self.keep_chunked_meshes:
+            if not getattr(self, "keep_chunked_meshes", False):
                 shutil.rmtree(f"{self.dirname}/{mesh_id}", ignore_errors=True)
             return
 
@@ -1892,7 +1892,7 @@ def _assemble_mesh(self, mesh_id):
             )
         else:
             _ = mesh.export(f"{self.output_directory}/meshes/{mesh_id}.ply")
-        if not self.keep_chunked_meshes:
+        if not getattr(self, "keep_chunked_meshes", False):
             shutil.rmtree(f"{self.dirname}/{mesh_id}")
 
     def _assemble_mesh_batch(self, mesh_ids):
@@ -1995,7 +1995,7 @@ def _run(workers, config, wave=wave, phase_name=phase_name):
             write_ngmesh_metadata(f"{self.output_directory}/meshes")
         elif self.do_singleres_multires_neuroglancer:
             write_singleres_multires_metadata(f"{self.output_directory}/meshes")
-        if not self.keep_chunked_meshes:
+        if not getattr(self, "keep_chunked_meshes", False):
             shutil.rmtree(dirname)
 
     def _generate_meshes_at_scale(self, output_mesh_dir, downsample_factor=None,
@@ -2089,13 +2089,14 @@ def _generate_meshes_at_scale(self, output_mesh_dir, downsample_factor=None,
             )
             tmp_chunked_dir = (
                 self.output_directory
-                + ("/tmp_chunked_" + scale_tag if self.keep_chunked_meshes
+                + ("/tmp_chunked_" + scale_tag
+                   if getattr(self, "keep_chunked_meshes", False)
                    else "/tmp_chunked")
             )
             os.makedirs(tmp_chunked_dir, exist_ok=True)
             self.get_chunked_meshes(tmp_chunked_dir)
             self.assemble_meshes(tmp_chunked_dir)
-            if not self.keep_chunked_meshes:
+            if not getattr(self, "keep_chunked_meshes", False):
                 shutil.rmtree(tmp_chunked_dir, ignore_errors=True)
         finally:
             self.output_directory = orig_output
@@ -2901,7 +2902,7 @@ def get_meshes(self):
         os.makedirs(tmp_chunked_dir, exist_ok=True)
         self.get_chunked_meshes(tmp_chunked_dir)
         self.assemble_meshes(tmp_chunked_dir)
-        if not self.keep_chunked_meshes:
+        if not getattr(self, "keep_chunked_meshes", False):
             shutil.rmtree(tmp_chunked_dir, ignore_errors=True)
 
         if self.do_analysis:

From 1066b60b1e904ce8cf05d3b009bd84b2a313b056 Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Thu, 18 Jun 2026 16:56:59 -0400
Subject: [PATCH 18/19] Set lod_scale_multiplier to voxel_size in mesh info
 files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Without this, every mesh's manifest stores lod_scales=[1,2,4,8] and
lod_scale_multiplier=1, so NG reads the per-LOD spatial resolutions as
literally 1-8 nm. At any reasonable viewing zoom (pixel_size > 8 nm in
world units) every LOD satisfies "scale <= pixel_size", so NG picks
the coarsest LOD per its selection rule. Result: meshes always render
at LOD-N-1, even when zoomed in close — the "low res at moderate zoom"
bug.

By setting lod_scale_multiplier to the source voxel size (e.g. 16 for
16 nm c-elegans data), NG sees effective lod_scales of
[voxel_size, 2*voxel_size, 4*voxel_size, ...]. NG now picks LOD-0 at
pixel_size ~ voxel_size, transitioning to coarser LODs as zoom-out
makes the chunk size shrink in screen space. Matches hemibrain's setup
(which uses transform=[16,16,16] for the same effect).

Plumbed through:
- write_sharded_info_file and write_info_file gain a
  lod_scale_multiplier parameter (default 1.0 = backward-compatible).
- meshify.py passes self.base_voxel_size_funlib[0] from the source
  segmentation's voxel size to both call sites.
- multires.py CLI reads a new voxel_size_nm config knob (defaults to
  1.0) and passes it through. Users running the standalone multires
  CLI should set this in their run-config.yaml's
  optional_decimation_settings.

Users wanting "load less" at zoom-out (e.g. many simultaneous meshes)
can bump NG's "Resolution (mesh)" slider above 1.0 px to widen the
LOD transition window.
---
 src/mesh_n_bone/config.py                 |  7 +++++++
 src/mesh_n_bone/meshify/meshify.py        |  6 +++++-
 src/mesh_n_bone/multires/multires.py      |  3 +++
 src/mesh_n_bone/util/neuroglancer.py      | 12 ++++++++++--
 src/mesh_n_bone/util/sharded_mesh_util.py | 14 ++++++++++++--
 5 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/src/mesh_n_bone/config.py b/src/mesh_n_bone/config.py
index fb0dc93..78c9c46 100644
--- a/src/mesh_n_bone/config.py
+++ b/src/mesh_n_bone/config.py
@@ -68,6 +68,13 @@ def read_multires_config(config_path):
             optional_decimation_settings["target_faces_per_lod0_chunk"] = (
                 DEFAULT_TARGET_FACES_PER_LOD0_CHUNK
             )
+        # voxel_size_nm controls the info file's `lod_scale_multiplier` so
+        # NG sees per-LOD spatial resolutions in physical units (voxel_size,
+        # 2*voxel_size, 4*voxel_size, ...). Set to the dataset's source
+        # voxel size in nm. Default 1.0 = NG sees scales as raw [1,2,4,8]
+        # (the historical buggy behavior — NG aggressively picks coarse LODs).
+        if "voxel_size_nm" not in optional_decimation_settings:
+            optional_decimation_settings["voxel_size_nm"] = 1.0
 
         optional_properties_settings = config.get("optional_properties_settings", {})
         if "segment_properties_csv" not in optional_properties_settings:
diff --git a/src/mesh_n_bone/meshify/meshify.py b/src/mesh_n_bone/meshify/meshify.py
index e4040c8..ad88cd9 100644
--- a/src/mesh_n_bone/meshify/meshify.py
+++ b/src/mesh_n_bone/meshify/meshify.py
@@ -2708,6 +2708,7 @@ def _pack(workers, config):
             with Timing_Messager("Writing sharded info file", logger):
                 sharded_mesh_util.write_sharded_info_file(
                     multires_path, spec, vertex_quantization_bits=16,
+                    lod_scale_multiplier=float(self.base_voxel_size_funlib[0]),
                 )
 
             if self.delete_unsharded_files:
@@ -2719,7 +2720,10 @@ def _pack(workers, config):
                     )
         else:
             with Timing_Messager("Writing info file", logger):
-                neuroglancer.write_info_file(multires_path)
+                neuroglancer.write_info_file(
+                    multires_path,
+                    lod_scale_multiplier=float(self.base_voxel_size_funlib[0]),
+                )
 
         if self.delete_decimated_meshes:
             with Timing_Messager("Cleaning up intermediate mesh files", logger):
diff --git a/src/mesh_n_bone/multires/multires.py b/src/mesh_n_bone/multires/multires.py
index 48199eb..e513c8e 100644
--- a/src/mesh_n_bone/multires/multires.py
+++ b/src/mesh_n_bone/multires/multires.py
@@ -382,6 +382,7 @@ def run_multires(config_path, num_workers, roi=None):
     aggressiveness = optional_decimation_settings["aggressiveness"]
     delete_decimated_meshes_flag = optional_decimation_settings["delete_decimated_meshes"]
     target_faces_per_lod0_chunk = optional_decimation_settings["target_faces_per_lod0_chunk"]
+    voxel_size_nm = float(optional_decimation_settings.get("voxel_size_nm", 1.0))
     retry_on_oom = optional_decimation_settings.get("retry_on_oom", True)
     memory_retry_max = optional_decimation_settings.get("memory_retry_max", 3)
 
@@ -526,6 +527,7 @@ def _pack(workers, config):
                 with Timing_Messager("Writing sharded info file", logger):
                     sharded_mesh_util.write_sharded_info_file(
                         multires_output_path, spec, vertex_quantization_bits=16,
+                        lod_scale_multiplier=voxel_size_nm,
                     )
 
                 if sharding_settings["delete_unsharded_files"]:
@@ -539,6 +541,7 @@ def _pack(workers, config):
                 with Timing_Messager("Writing info file", logger):
                     neuroglancer.write_info_file(
                         multires_output_path, vertex_quantization_bits=16,
+                        lod_scale_multiplier=voxel_size_nm,
                     )
 
             if not skip_decimation and delete_decimated_meshes_flag:
diff --git a/src/mesh_n_bone/util/neuroglancer.py b/src/mesh_n_bone/util/neuroglancer.py
index 911be82..df54933 100644
--- a/src/mesh_n_bone/util/neuroglancer.py
+++ b/src/mesh_n_bone/util/neuroglancer.py
@@ -6,7 +6,7 @@
 import numpy as np
 
 
-def write_info_file(path, vertex_quantization_bits=16):
+def write_info_file(path, vertex_quantization_bits=16, lod_scale_multiplier=1.0):
     """Write the ``info`` JSON file for a Neuroglancer multi-LOD Draco mesh layer.
 
     The generated file uses the specified vertex quantization, an identity
@@ -19,13 +19,21 @@ def write_info_file(path, vertex_quantization_bits=16):
     vertex_quantization_bits : int, optional
         Number of quantization bits per vertex coordinate.  Default is
         ``16``.
+    lod_scale_multiplier : float, optional
+        Multiplied into each segment manifest's `lod_scales` to give NG
+        the true per-LOD spatial resolution in physical units. Should be
+        set to the dataset's LOD-0 voxel size (e.g. 16 for 16 nm voxels)
+        so the per-LOD scales `[1, 2, 4, 8]` stored in manifests become
+        `[16, 32, 64, 128]` nm in NG's eyes. With the default 1.0, NG
+        would treat LOD-0 as 1-unit-resolution and aggressively pick
+        coarse LODs for any view (the historical buggy default).
     """
     with open(f"{path}/info", "w") as f:
         info = {
             "@type": "neuroglancer_multilod_draco",
             "vertex_quantization_bits": vertex_quantization_bits,
             "transform": [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0],
-            "lod_scale_multiplier": 1,
+            "lod_scale_multiplier": float(lod_scale_multiplier),
             "segment_properties": "segment_properties",
         }
         json.dump(info, f)
diff --git a/src/mesh_n_bone/util/sharded_mesh_util.py b/src/mesh_n_bone/util/sharded_mesh_util.py
index 05bbe4a..d3e81ee 100644
--- a/src/mesh_n_bone/util/sharded_mesh_util.py
+++ b/src/mesh_n_bone/util/sharded_mesh_util.py
@@ -163,7 +163,10 @@ def pack_one_shard(
 
 
 def write_sharded_info_file(
-    path: str, spec: ShardingSpecification, vertex_quantization_bits: int = 16
+    path: str,
+    spec: ShardingSpecification,
+    vertex_quantization_bits: int = 16,
+    lod_scale_multiplier: float = 1.0,
 ) -> None:
     """Write the top-level `info` for sharded multi-resolution Draco meshes.
 
@@ -172,6 +175,13 @@ def write_sharded_info_file(
     the unsharded `write_info_file`. The value MUST equal whatever the
     encoder used, or Neuroglancer will decode vertices at the wrong scale
     and the mesh geometry will appear stretched or shrunk by a power of 2.
+
+    ``lod_scale_multiplier`` is multiplied into the per-LOD `lod_scales`
+    values stored in each segment's manifest. The manifest stores
+    `[1, 2, 4, ...]`, and the multiplier should equal the dataset's
+    voxel size in physical units so NG sees true per-LOD spatial
+    resolutions. With the default 1.0, NG would treat LOD-0 as
+    1-unit-resolution and aggressively pick coarse LODs for any view.
     """
 
     # spec.to_dict() may contain numpy uint64 — coerce to plain ints for JSON.
@@ -183,7 +193,7 @@ def write_sharded_info_file(
         "@type": "neuroglancer_multilod_draco",
         "vertex_quantization_bits": int(vertex_quantization_bits),
         "transform": [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0],
-        "lod_scale_multiplier": 1,
+        "lod_scale_multiplier": float(lod_scale_multiplier),
         "segment_properties": "segment_properties",
         "sharding": sharding,
     }

From 0a0b7a3474ff8450b65a3da69d36878ea553d21c Mon Sep 17 00:00:00 2001
From: David Ackerman <ackermand@janelia.hhmi.org>
Date: Tue, 23 Jun 2026 14:17:55 -0400
Subject: [PATCH 19/19] Default to fixed lod_0_box_size + MIN_USEFUL_FACES cap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two changes that work together:

1. Default lod_0_box_size = 1024 × voxel_size_nm (isotropic). Matches
   hemibrain's setup — every mesh in a dataset uses the same physical
   chunk size, giving consistent LOD-transition zoom thresholds across
   mesh sizes. Small organelles fit in 1 chunk (no internal seams);
   large structures subdivide as needed.

   Previous behavior (target_faces_per_lod0_chunk adaptive scaling) is
   preserved when the user explicitly overrides target_faces_per_lod0_chunk
   to a non-default value — useful for users who relied on it.

2. MIN_USEFUL_FACES_PER_LOD = 50 cap on per-segment num_lods. After
   each LOD is loaded, if its face count is below 50, drop it and all
   coarser LODs. Avoids near-degenerate LOD meshes (a 5-face mesh adds
   nothing useful; the parent LOD covers that zoom range fine via NG's
   fall-back-to-parent rule).

Plumbed voxel_size_nm + min_useful_faces through:
- generate_neuroglancer_multires_mesh
- generate_all_neuroglancer_multires_meshes
- meshify.py caller (passes self.base_voxel_size_funlib[0])
- standalone multires.py CLI (passes voxel_size_nm config knob)
---
 src/mesh_n_bone/meshify/meshify.py   |  1 +
 src/mesh_n_bone/multires/multires.py | 56 ++++++++++++++++++++++------
 2 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/src/mesh_n_bone/meshify/meshify.py b/src/mesh_n_bone/meshify/meshify.py
index ad88cd9..2f93113 100644
--- a/src/mesh_n_bone/meshify/meshify.py
+++ b/src/mesh_n_bone/meshify/meshify.py
@@ -2658,6 +2658,7 @@ def _run(workers, config):
                         np.array(file_sizes, dtype=float),
                         self.lod_0_box_size,
                         target_faces_per_lod0_chunk=self.target_faces_per_lod0_chunk,
+                        voxel_size_nm=float(self.base_voxel_size_funlib[0]),
                     )
 
         dask_util.run_with_oom_retry(
diff --git a/src/mesh_n_bone/multires/multires.py b/src/mesh_n_bone/multires/multires.py
index e513c8e..70057e5 100644
--- a/src/mesh_n_bone/multires/multires.py
+++ b/src/mesh_n_bone/multires/multires.py
@@ -20,12 +20,22 @@
 
 
 DEFAULT_TARGET_FACES_PER_LOD0_CHUNK = 25_000
+# Hemibrain uses 1024 voxels per LOD-0 chunk (= 16 μm at their 16-nm
+# meshing scale). Same number used here as a sensible dataset-level
+# default: 1024 × voxel_size for a single isotropic chunk size that
+# behaves consistently across mesh sizes.
+DEFAULT_BOX_SIZE_VOXELS = 1024
+# A LOD whose decimated face count would fall below this is dropped from
+# per-segment output — further decimation produces degenerate meshes.
+MIN_USEFUL_FACES_PER_LOD = 50
 
 
 def generate_neuroglancer_multires_mesh(
     id, num_subtask_workers, output_path, lods, original_ext, lod_0_box_size=None,
     vertex_quantization_bits=16,
     target_faces_per_lod0_chunk=DEFAULT_TARGET_FACES_PER_LOD0_CHUNK,
+    voxel_size_nm=1.0,
+    min_useful_faces=MIN_USEFUL_FACES_PER_LOD,
 ):
     """Create a complete multiresolution mesh for a single segment.
 
@@ -98,6 +108,15 @@ def generate_neuroglancer_multires_mesh(
                     f"{previous_lod} ({previous_faces} faces)."
                 )
                 break
+            if current_lod > 0 and num_faces < min_useful_faces:
+                lod_truncation_reason = (
+                    f"Segment {id} using {idx}/{requested_lod_count} requested "
+                    f"LODs; dropping LOD {current_lod} and later because it "
+                    f"has {num_faces} faces, below the useful-detail "
+                    f"threshold ({min_useful_faces}) — further decimation "
+                    f"produces degenerate meshes."
+                )
+                break
             lod_face_counts.append((current_lod, num_faces))
             # LOD-0 bounds drive the chunk grid.
             if current_lod == 0 and vertices is not None:
@@ -132,18 +151,27 @@ def generate_neuroglancer_multires_mesh(
             return
 
         if lod_0_box_size is None:
-            # Surface-area scaling: triangles distribute across the
-            # mesh's 2-D surface, so total chunks ∝ N²-on-axis. Use
-            # sqrt for the per-axis count.
-            heuristic_num_chunks = np.ceil(
-                lod0_num_faces / target_faces_per_lod0_chunk
-            )
-            num_chunks_per_axis = max(
-                1, int(np.ceil(np.sqrt(heuristic_num_chunks)))
-            )
-            lod_0_box_size = (
-                np.ceil(lod0_distances_per_axis / num_chunks_per_axis) + 1
-            )
+            if target_faces_per_lod0_chunk != DEFAULT_TARGET_FACES_PER_LOD0_CHUNK:
+                # User explicitly tuned target_faces_per_lod0_chunk →
+                # use the adaptive surface-area heuristic.
+                heuristic_num_chunks = np.ceil(
+                    lod0_num_faces / target_faces_per_lod0_chunk
+                )
+                num_chunks_per_axis = max(
+                    1, int(np.ceil(np.sqrt(heuristic_num_chunks)))
+                )
+                lod_0_box_size = (
+                    np.ceil(lod0_distances_per_axis / num_chunks_per_axis) + 1
+                )
+            else:
+                # Dataset-level default: fixed N voxels per chunk side
+                # (matches hemibrain's setup at 1024 voxels). Gives
+                # consistent LOD-transition zoom thresholds across mesh
+                # sizes — a 1 μm mito and a 100 μm neuron get the same
+                # physical chunk size, just different chunk counts.
+                lod_0_box_size = np.full(
+                    3, DEFAULT_BOX_SIZE_VOXELS * voxel_size_nm, dtype=float,
+                )
 
         # Compute the LOD 0 chunk grid from the s0 mesh extent.
         mesh_extent = vertex_max - vertex_min
@@ -298,6 +326,8 @@ def generate_all_neuroglancer_multires_meshes(
     output_path, num_workers, ids, lods, original_ext, file_sizes,
     lod_0_box_size=None, vertex_quantization_bits=16,
     target_faces_per_lod0_chunk=DEFAULT_TARGET_FACES_PER_LOD0_CHUNK,
+    voxel_size_nm=1.0,
+    min_useful_faces=MIN_USEFUL_FACES_PER_LOD,
 ):
     """Generate Neuroglancer multiresolution meshes for all segments.
 
@@ -335,6 +365,7 @@ def get_number_of_subtask_workers(file_sizes, num_workers):
     fixed_args_list = [
         output_path, lods, original_ext, lod_0_box_size,
         vertex_quantization_bits, target_faces_per_lod0_chunk,
+        voxel_size_nm, min_useful_faces,
     ]
     for idx, id in enumerate(ids):
         variable_args_list.append((id, num_subtask_workers[idx]))
@@ -480,6 +511,7 @@ def _run_multires(workers, config):
                             np.array(file_sizes), lod_0_box_size,
                             vertex_quantization_bits=16,
                             target_faces_per_lod0_chunk=target_faces_per_lod0_chunk,
+                            voxel_size_nm=voxel_size_nm,
                         )
             dask_util.run_with_oom_retry(
                 _run_multires, effective_workers, "multires creation", logger,