# data_handler

`orchard.data_handler`

Data Handler Package.

This package manages the end-to-end data pipeline, from downloading raw NPZ files via the Dataset Registry to providing fully configured PyTorch DataLoaders.
## `VisionDataset(images, labels, *, transform=None)`

Bases: `Dataset[tuple[Tensor, Tensor]]`

PyTorch Dataset for NPZ-based vision data.

The constructor accepts raw NumPy arrays directly (no I/O). Use the classmethod factories to load from disk:

- `VisionDataset.from_npz(...)` — eager, loads the full split into RAM.
- `VisionDataset.lazy(...)` — memory-mapped, pages loaded on demand.
Initializes the dataset from pre-loaded arrays.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `images` | `NDArray[Any]` | Image array; the first dimension indexes samples. | required |
| `labels` | `NDArray[Any]` | Label array; any shape that flattens to `(N,)`. | required |
| `transform` | `Compose \| None` | Pipeline of Torchvision transforms. | `None` |
Source code in orchard/data_handler/dataset.py
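As a rough sketch of the eager loading path, the snippet below builds a tiny NPZ archive and reads one split with NumPy. The `train_images`/`train_labels` key layout is an assumption for illustration, not necessarily the archive format `orchard` uses.

```python
import os
import tempfile

import numpy as np

# Build a tiny archive in an assumed MedMNIST-style key layout.
tmpdir = tempfile.mkdtemp()
path = os.path.join(tmpdir, "toy.npz")
np.savez(
    path,
    train_images=np.zeros((10, 28, 28, 3), dtype=np.uint8),
    train_labels=np.arange(10, dtype=np.int64),
)

# Eager path (from_npz-style): materialize the whole split in RAM.
with np.load(path) as archive:
    images = archive["train_images"]       # read only on key access
    labels = archive["train_labels"].reshape(-1)

print(images.shape, labels.shape)  # (10, 28, 28, 3) (10,)
```

Note that `NpzFile` already reads member arrays only when a key is accessed, which is what makes a lazy, health-check-friendly variant possible.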
### `from_npz(path, split='train', *, transform=None, max_samples=None, seed=42)` (classmethod)
Eagerly load a split from an NPZ archive into RAM.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `path` | `Path` | Path to the dataset `.npz` file. | required |
| `split` | `str` | Dataset split to load. | `'train'` |
| `transform` | `Compose \| None` | Pipeline of Torchvision transforms. | `None` |
| `max_samples` | `int \| None` | If set, limits the number of samples (subsampling). | `None` |
| `seed` | `int` | Random seed for deterministic subsampling. | `42` |
Source code in orchard/data_handler/dataset.py
### `lazy(path, split='train', *, transform=None, max_samples=None, seed=42)` (classmethod)
Memory-mapped load from an NPZ archive (no full RAM copy).
Images are loaded page-by-page on demand. Suitable for large datasets that do not fit in RAM and for lightweight health checks.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `path` | `Path` | Path to the dataset `.npz` file. | required |
| `split` | `str` | Dataset split to load. | `'train'` |
| `transform` | `Compose \| None` | Pipeline of Torchvision transforms. | `None` |
| `max_samples` | `int \| None` | If set, limits the number of samples (subsampling). | `None` |
| `seed` | `int` | Random seed for deterministic subsampling. | `42` |
Source code in orchard/data_handler/dataset.py
### `__len__()`

Returns the number of samples in the dataset.

### `__getitem__(idx)`
Retrieves a standardized sample-label pair.
The image is converted to a PIL object to ensure compatibility with Torchvision V2 transforms before being returned as a PyTorch Tensor.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `idx` | `int` | Sample index. | required |

Returns:

| Type | Description |
|---|---|
| `tuple[Tensor, Tensor]` | A pair of `(image, label)`, where `image` is a tensor and `label` is a scalar long tensor. |
Source code in orchard/data_handler/dataset.py
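The numpy-to-PIL step described above can be sketched as follows; the grayscale squeeze is an assumption about how single-channel samples are handled before handing them to Torchvision v2 transforms.

```python
import numpy as np
from PIL import Image

def to_pil(sample: np.ndarray) -> Image.Image:
    # PIL expects HxW (no trailing channel axis) for grayscale images.
    if sample.ndim == 3 and sample.shape[-1] == 1:
        sample = sample.squeeze(-1)
    return Image.fromarray(sample.astype(np.uint8))

img = to_pil(np.zeros((28, 28, 1), dtype=np.uint8))
print(img.mode, img.size)  # L (28, 28)
```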
## `DatasetData(path, name, is_rgb, num_classes)` (dataclass)
Metadata container for a loaded dataset.
Stores path and format info instead of raw arrays to save RAM.
## `DataLoaderFactory(dataset_cfg, training_cfg, aug_cfg, num_workers, metadata)`
Orchestrates the creation of optimized PyTorch DataLoaders.
This factory centralizes the configuration of training, validation, and testing pipelines. It ensures that data transformations, class balancing, and hardware settings are synchronized across all splits.
Attributes:

| Name | Type | Description |
|---|---|---|
| `dataset_cfg` | `DatasetConfig` | Dataset sub-config. |
| `training_cfg` | `TrainingConfig` | Training sub-config. |
| `aug_cfg` | `AugmentationConfig` | Augmentation sub-config. |
| `num_workers` | `int` | Resolved worker count from the hardware config. |
| `metadata` | `DatasetData` | Data path and raw format information. |
| `ds_meta` | `DatasetMetadata` | Official dataset registry specifications. |
| `logger` | `Logger` | Module-specific logger. |
Initializes the factory with environment and dataset metadata.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `dataset_cfg` | `DatasetConfig` | Dataset sub-config (splits, classes, resolution). | required |
| `training_cfg` | `TrainingConfig` | Training sub-config (batch size, seed). | required |
| `aug_cfg` | `AugmentationConfig` | Augmentation sub-config (transforms pipeline). | required |
| `num_workers` | `int` | Resolved worker count from the hardware config. | required |
| `metadata` | `DatasetData` | Metadata from the data fetcher/downloader. | required |
Source code in orchard/data_handler/loader.py
### `build(is_optuna=False)`
Constructs and returns the full suite of DataLoaders.
Assembles train/val/test splits with transforms, optional class balancing, and hardware-aware infrastructure settings.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `is_optuna` | `bool` | If True, use memory-conservative settings for hyperparameter tuning (fewer workers, no persistent workers). | `False` |

Returns:

| Type | Description |
|---|---|
| `tuple[DataLoader[Any], DataLoader[Any], DataLoader[Any]]` | A tuple of `(train_loader, val_loader, test_loader)`. |
Source code in orchard/data_handler/loader.py
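The memory-conservative switch can be pictured as a small kwargs selector of the following shape. The specific values (worker cap of 2, `pin_memory=True`) are illustrative assumptions, not the factory's real settings.

```python
def loader_kwargs(num_workers: int, *, is_optuna: bool = False) -> dict:
    # Hypothetical sketch: Optuna trials run many loaders concurrently,
    # so cap workers and skip persistent workers to bound RAM usage.
    workers = min(num_workers, 2) if is_optuna else num_workers
    return {
        "num_workers": workers,
        "persistent_workers": workers > 0 and not is_optuna,
        "pin_memory": True,
    }

print(loader_kwargs(8, is_optuna=True))
# {'num_workers': 2, 'persistent_workers': False, 'pin_memory': True}
```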
## `show_sample_images(loader, save_path, *, mean=None, std=None, arch_name='Model', fig_dpi=_DEFAULT_DPI, num_samples=16, title_prefix=None)`
Extract a batch from the DataLoader and save a grid of sample images.
Saves images with their corresponding labels to verify data integrity and augmentations.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `loader` | `DataLoader[Any]` | The PyTorch DataLoader to sample from. | required |
| `save_path` | `Path` | Full path (including filename) for the saved image. | required |
| `mean` | `tuple[float, ...] \| None` | Per-channel mean for denormalization. | `None` |
| `std` | `tuple[float, ...] \| None` | Per-channel std for denormalization. | `None` |
| `arch_name` | `str` | Architecture name for the figure title. | `'Model'` |
| `fig_dpi` | `int` | DPI for the saved figure. | `_DEFAULT_DPI` |
| `num_samples` | `int` | Number of images to display in the grid. | `16` |
| `title_prefix` | `str \| None` | Optional string to prepend to the figure title. | `None` |
Source code in orchard/data_handler/data_explorer.py
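The `mean`/`std` denormalization mentioned above inverts Torchvision's `Normalize` before plotting. A minimal sketch, assuming CHW layout and values clipped into `[0, 1]` for display:

```python
import numpy as np

def denormalize(img: np.ndarray, mean, std) -> np.ndarray:
    # Invert Normalize for display: x * std + mean, applied per channel.
    mean = np.asarray(mean).reshape(-1, 1, 1)
    std = np.asarray(std).reshape(-1, 1, 1)
    return np.clip(img * std + mean, 0.0, 1.0)

normalized = np.full((3, 2, 2), -1.0)
restored = denormalize(normalized, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
print(restored[0, 0, 0])  # 0.0
```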
## `show_samples_for_dataset(loader, dataset_name, run_paths, *, mean=None, std=None, arch_name='Model', fig_dpi=_DEFAULT_DPI, num_samples=16, resolution=None)`
Generate a grid of sample images from a dataset and save to the figures directory.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `loader` | `DataLoader[Any]` | PyTorch DataLoader to sample images from. | required |
| `dataset_name` | `str` | Name of the dataset, used in the filename and title. | required |
| `run_paths` | `RunPaths` | RunPaths instance used to resolve the figure saving path. | required |
| `mean` | `tuple[float, ...] \| None` | Per-channel mean for denormalization. | `None` |
| `std` | `tuple[float, ...] \| None` | Per-channel std for denormalization. | `None` |
| `arch_name` | `str` | Architecture name for the figure title. | `'Model'` |
| `fig_dpi` | `int` | DPI for the saved figure. | `_DEFAULT_DPI` |
| `num_samples` | `int` | Number of images to include in the grid. | `16` |
| `resolution` | `int \| None` | Resolution, included in the filename to avoid overwriting. | `None` |
Source code in orchard/data_handler/data_explorer.py
## `create_synthetic_dataset(num_classes=8, samples=100, resolution=28, channels=3, name='syntheticmnist')`
Create a synthetic NPZ-compatible dataset for testing.
This function generates random image data and labels, saves them to a temporary .npz file, and returns a DatasetData object that can be used with the existing data pipeline.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `num_classes` | `int` | Number of classification categories. | `8` |
| `samples` | `int` | Number of training samples. | `100` |
| `resolution` | `int` | Image resolution (HxW). | `28` |
| `channels` | `int` | Number of color channels (3 for RGB). | `3` |
| `name` | `str` | Dataset name for identification. | `'syntheticmnist'` |
Returns:

| Name | Type | Description |
|---|---|---|
| `DatasetData` | `DatasetData` | A data object compatible with the existing pipeline. |
Example:

```python
>>> data = create_synthetic_dataset(num_classes=8, samples=100)
>>> train_loader, val_loader, test_loader = get_dataloaders(
...     data, cfg.dataset, cfg.training, cfg.augmentation, cfg.num_workers
... )
```
Source code in orchard/data_handler/diagnostic/synthetic.py
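A minimal stand-in for the generator described above can be written with NumPy alone. The key names (`train_images`, `val_labels`, ...) and the tiny val/test splits are assumptions about the NPZ layout, made only for illustration.

```python
import os
import tempfile

import numpy as np

def make_synthetic_npz(num_classes=8, samples=100, resolution=28, channels=3):
    # Random images and labels, saved in an assumed split-keyed NPZ layout.
    rng = np.random.default_rng(0)
    images = rng.integers(
        0, 256, size=(samples, resolution, resolution, channels), dtype=np.uint8
    )
    labels = rng.integers(0, num_classes, size=(samples,), dtype=np.int64)
    path = os.path.join(tempfile.mkdtemp(), "synthetic.npz")
    np.savez(
        path,
        train_images=images, train_labels=labels,
        val_images=images[:10], val_labels=labels[:10],
        test_images=images[:10], test_labels=labels[:10],
    )
    return path

path = make_synthetic_npz()
with np.load(path) as archive:
    print(sorted(archive.files))
```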
## `create_synthetic_grayscale_dataset(num_classes=8, samples=100, resolution=28)`
Create a synthetic grayscale NPZ dataset for testing.
Convenience function for creating single-channel (grayscale) synthetic data.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `num_classes` | `int` | Number of classification categories. | `8` |
| `samples` | `int` | Number of training samples. | `100` |
| `resolution` | `int` | Image resolution (HxW). | `28` |
Returns:

| Name | Type | Description |
|---|---|---|
| `DatasetData` | `DatasetData` | A grayscale data object compatible with the pipeline. |
Source code in orchard/data_handler/diagnostic/synthetic.py
## `create_temp_loader(dataset_path, batch_size=_DEFAULT_HEALTHCHECK_BATCH_SIZE)`
Load an NPZ dataset lazily and return a DataLoader for health checks.
This avoids loading the entire dataset into RAM at once, which is critical for large datasets (e.g., 224x224 images).
Source code in orchard/data_handler/diagnostic/temp_loader.py
## `ensure_dataset_npz(metadata, retries=5, delay=5.0)`
Dispatcher that routes each dataset to its dedicated fetch pipeline.
Automatically detects the dataset type from `metadata.name` and delegates to the appropriate download/conversion module. Adding a new domain (e.g. a new resolution or source) only requires a new branch here and a corresponding fetch module.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `metadata` | `DatasetMetadata` | Metadata containing URL, MD5, name, and target path. | required |
| `retries` | `int` | Max number of download attempts (NPZ fetcher only). | `5` |
| `delay` | `float` | Delay in seconds between retries (NPZ fetcher only). | `5.0` |
Returns:

| Name | Type | Description |
|---|---|---|
| `Path` | `Path` | Path to the successfully validated `.npz` file. |
Source code in orchard/data_handler/fetcher.py
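The retry behavior described above can be sketched as a generic retry loop. This is an illustration of the pattern only; the real download and MD5-validation logic lives in orchard's fetch modules, and `fetch_with_retries` is a hypothetical helper name.

```python
import time

def fetch_with_retries(fetch, retries=5, delay=5.0):
    # Call fetch() up to `retries` times, sleeping `delay` seconds
    # between failed attempts; re-raise after the final failure.
    last_err = None
    for attempt in range(1, retries + 1):
        try:
            return fetch()
        except OSError as err:
            last_err = err
            if attempt < retries:
                time.sleep(delay)
    raise RuntimeError(f"download failed after {retries} attempts") from last_err
```

Catching only `OSError` keeps genuine bugs (e.g. a `TypeError`) from being silently retried.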
## `load_dataset(metadata)`
Ensures the dataset is present and returns its metadata container.
## `get_dataloaders(metadata, dataset_cfg, training_cfg, aug_cfg, num_workers, is_optuna=False)`
Convenience function for creating train/val/test DataLoaders.
Wraps DataLoaderFactory for streamlined loader construction with automatic class balancing, hardware optimization, and Optuna support.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `metadata` | `DatasetData` | Dataset metadata from `load_dataset` (paths, splits). | required |
| `dataset_cfg` | `DatasetConfig` | Dataset sub-config (splits, classes, resolution). | required |
| `training_cfg` | `TrainingConfig` | Training sub-config (batch size, seed). | required |
| `aug_cfg` | `AugmentationConfig` | Augmentation sub-config (transforms pipeline). | required |
| `num_workers` | `int` | Resolved worker count from the hardware config. | required |
| `is_optuna` | `bool` | If True, use memory-conservative settings for hyperparameter tuning. | `False` |
Returns:

| Type | Description |
|---|---|
| `tuple[DataLoader[Any], DataLoader[Any], DataLoader[Any]]` | A 3-tuple of `(train_loader, val_loader, test_loader)`. |
Example:

```python
>>> data = load_dataset(ds_meta)
>>> loaders = get_dataloaders(
...     data, cfg.dataset, cfg.training, cfg.augmentation, cfg.num_workers
... )
```
Source code in orchard/data_handler/loader.py
## `get_augmentations_description(aug_cfg, img_size, mixup_alpha, ds_meta=None)`

Generates a descriptive string of the configured augmentations for logging.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `aug_cfg` | `AugmentationConfig` | Augmentation sub-configuration. | required |
| `img_size` | `int` | Target image size for the resized crop. | required |
| `mixup_alpha` | `float` | MixUp alpha (0.0 to disable). | required |
| `ds_meta` | `DatasetMetadata \| None` | Dataset metadata (if provided, domain flags are respected). | `None` |
Returns:

| Type | Description |
|---|---|
| `str` | Human-readable augmentation summary. |
Source code in orchard/data_handler/transforms.py
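A summary builder of this kind can be sketched as follows. The parameter names (`hflip`, `jitter`) are assumptions for illustration; the real function reads them from `AugmentationConfig` and the dataset metadata.

```python
def describe_augmentations(hflip: bool, jitter: float, mixup_alpha: float) -> str:
    # Collect one short token per active augmentation, then join for logging.
    parts = []
    if hflip:
        parts.append("hflip")
    if jitter > 0:
        parts.append(f"jitter={jitter}")
    parts.append(
        "mixup=off" if mixup_alpha == 0.0 else f"mixup(alpha={mixup_alpha})"
    )
    return ", ".join(parts)

print(describe_augmentations(True, 0.4, 0.2))
# hflip, jitter=0.4, mixup(alpha=0.2)
```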
## `get_pipeline_transforms(aug_cfg, img_size, ds_meta, *, force_rgb=True, norm_mean=None, norm_std=None)`
Constructs training and validation transformation pipelines.
Dynamically adapts to dataset characteristics (RGB vs Grayscale) and optionally promotes grayscale to 3-channel for pretrained-weight compatibility. Uses torchvision v2 transforms for improved CPU/GPU performance.
Pipeline Logic

- Convert to tensor format (`ToImage` + `ToDtype`)
- Promote 1-channel to 3-channel when `force_rgb` is True and the dataset is native grayscale
- Apply domain-aware augmentations (training only): geometric transforms disabled for anatomical datasets, color jitter reduced for texture-based datasets
- Normalize with dataset-specific statistics
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `aug_cfg` | `AugmentationConfig` | Augmentation sub-configuration. | required |
| `img_size` | `int` | Target image size. | required |
| `ds_meta` | `DatasetMetadata` | Dataset metadata (channels, domain flags). | required |
| `force_rgb` | `bool` | Promote grayscale datasets to 3-channel RGB. | `True` |
| `norm_mean` | `tuple[float, ...] \| None` | Pre-computed normalization mean (from DatasetConfig); when None, computed from `ds_meta` + `force_rgb`. | `None` |
| `norm_std` | `tuple[float, ...] \| None` | Pre-computed normalization std (from DatasetConfig); when None, computed from `ds_meta` + `force_rgb`. | `None` |
Returns:

| Type | Description |
|---|---|
| `tuple[Compose, Compose]` | `(train_transform, val_transform)` as torchvision v2 `Compose` pipelines. |
Source code in orchard/data_handler/transforms.py
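The `norm_mean`/`norm_std` fallback described above can be pictured as a small resolver: pre-computed stats win, otherwise the channel count follows from the dataset and `force_rgb`. The 0.5 fallback values here are illustrative assumptions, not the library's actual defaults.

```python
def resolve_norm_stats(is_rgb: bool, force_rgb: bool, mean=None, std=None):
    # Pre-computed stats from the config take precedence.
    if mean is not None and std is not None:
        return mean, std
    # Otherwise derive the channel count: grayscale promoted to RGB
    # when force_rgb is set.
    channels = 3 if (is_rgb or force_rgb) else 1
    return (0.5,) * channels, (0.5,) * channels

print(resolve_norm_stats(is_rgb=False, force_rgb=True))
# ((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
```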