Skip to content

data_handler

orchard.data_handler

Data Handler Package.

This package manages the end-to-end data pipeline, from downloading raw NPZ files using the Dataset Registry to providing fully configured PyTorch DataLoaders.

VisionDataset(images, labels, *, transform=None)

Bases: Dataset[tuple[Tensor, Tensor]]

PyTorch Dataset for NPZ-based vision data.

The constructor accepts raw NumPy arrays directly (no I/O). Use the classmethod factories to load from disk:

  • VisionDataset.from_npz(...) — eager, full split into RAM.
  • VisionDataset.lazy(...) — memory-mapped, pages loaded on demand.

Initializes the dataset from pre-loaded arrays.

Parameters:

Name Type Description Default
images NDArray[Any]

Image array with shape (N, H, W) or (N, H, W, C).

required
labels NDArray[Any]

Label array, any shape that flattens to (N,).

required
transform Compose | None

Pipeline of Torchvision transforms.

None
Source code in orchard/data_handler/dataset.py
def __init__(
    self,
    images: npt.NDArray[Any],
    labels: npt.NDArray[Any],
    *,
    transform: transforms.Compose | None = None,
) -> None:
    """
    Builds the dataset directly from in-memory arrays (no disk access).

    Args:
        images: Pixel data shaped ``(N, H, W)`` or ``(N, H, W, C)``.
        labels: Any array whose flattened form has length ``N``.
        transform: Optional Torchvision transform pipeline.
    """
    # Grayscale stacks arrive as (N, H, W); append a channel axis so every
    # image is (H, W, C) when handed to PIL later.
    self.images = images if images.ndim != 3 else np.expand_dims(images, axis=-1)
    # Flatten and force int64 so labels are valid class indices for torch.
    self.labels: npt.NDArray[Any] = labels.ravel().astype(np.int64)
    self.transform = transform

    # Populated by .lazy(): keeps the NpzFile alive so its arrays stay valid.
    self._npz_handle: np.lib.npyio.NpzFile | None = None
    # Populated by .lazy() when subsampling: maps dataset idx -> raw image idx.
    self._indices: npt.NDArray[Any] | None = None

from_npz(path, split='train', *, transform=None, max_samples=None, seed=42) classmethod

Eagerly load a split from an NPZ archive into RAM.

Parameters:

Name Type Description Default
path Path

Path to the dataset .npz archive.

required
split str

Dataset split to load (train, val, or test).

'train'
transform Compose | None

Pipeline of Torchvision transforms.

None
max_samples int | None

If set, limits the number of samples (subsampling).

None
seed int

Random seed for deterministic subsampling.

42
Source code in orchard/data_handler/dataset.py
@classmethod
def from_npz(
    cls,
    path: Path,
    split: str = "train",
    *,
    transform: transforms.Compose | None = None,
    max_samples: int | None = None,
    seed: int = 42,
) -> VisionDataset:
    """
    Load one split of an NPZ archive fully into RAM.

    Args:
        path: Location of the dataset ``.npz`` archive.
        split: Which split to read (``train``, ``val``, or ``test``).
        transform: Optional Torchvision transform pipeline.
        max_samples: Optional cap on the number of samples kept.
        seed: RNG seed that makes any subsampling reproducible.
    """
    if not path.exists():
        raise OrchardDatasetError(f"Dataset file not found at: {path}")

    with np.load(path) as data:
        raw_images = data[f"{split}_images"]
        raw_labels = data[f"{split}_labels"]
        total_available = len(raw_labels)

        if max_samples and max_samples < total_available:
            # Deterministic random subset so repeated runs see the same data.
            chosen = np.random.default_rng(seed).choice(
                total_available, size=max_samples, replace=False
            )
            images, labels = raw_images[chosen], raw_labels[chosen]
        else:
            # Copy the image array out of the archive before the handle closes.
            images, labels = np.array(raw_images), raw_labels

    return cls(images, labels, transform=transform)

lazy(path, split='train', *, transform=None, max_samples=None, seed=42) classmethod

Memory-mapped load from an NPZ archive (no full RAM copy).

Images are loaded page-by-page on demand. Suitable for large datasets that do not fit in RAM and for lightweight health checks.

Parameters:

Name Type Description Default
path Path

Path to the .npz file.

required
split str

Dataset split to load (default train).

'train'
transform Compose | None

Pipeline of Torchvision transforms.

None
max_samples int | None

If set, limits the number of samples (subsampling).

None
seed int

Random seed for deterministic subsampling.

42
Source code in orchard/data_handler/dataset.py
@classmethod
def lazy(
    cls,
    path: Path,
    split: str = "train",
    *,
    transform: transforms.Compose | None = None,
    max_samples: int | None = None,
    seed: int = 42,
) -> VisionDataset:
    """
    Open an NPZ archive without an eager full copy of the split.

    Samples are materialized as they are requested, keeping the footprint
    small for large archives and lightweight health checks.

    Args:
        path: Path to the ``.npz`` file.
        split: Dataset split to load (default ``train``).
        transform: Optional Torchvision transform pipeline.
        max_samples: Optional cap on the number of samples kept.
        seed: RNG seed that makes any subsampling reproducible.
    """
    archive = np.load(path, mmap_mode="r")
    instance = cls(archive[f"{split}_images"], archive[f"{split}_labels"], transform=transform)
    # Keep the archive handle alive so its arrays are not garbage collected.
    instance._npz_handle = archive

    total = len(instance.labels)
    if max_samples and max_samples < total:
        # Labels are small, so subsample them eagerly; images keep the full
        # backing array and are remapped per item through _indices.
        picked = np.random.default_rng(seed).choice(total, size=max_samples, replace=False)
        instance._indices = picked
        instance.labels = instance.labels[picked]

    return instance

__len__()

Returns the total number of samples currently in the dataset.

Source code in orchard/data_handler/dataset.py
def __len__(self) -> int:
    """Number of samples currently exposed by this dataset instance."""
    return self.labels.shape[0]

__getitem__(idx)

Retrieves a standardized sample-label pair.

The image is converted to a PIL object to ensure compatibility with Torchvision V2 transforms before being returned as a PyTorch Tensor.

Parameters:

Name Type Description Default
idx int

Sample index.

required

Returns:

Type Description
tuple[Tensor, Tensor]

A pair of (image, label), where image is a (C, H, W) float tensor and label is a scalar long tensor.

Source code in orchard/data_handler/dataset.py
def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Return one standardized ``(image, label)`` pair.

    The raw array is routed through PIL so that Torchvision V2 transforms
    receive the input format they expect, then returned as tensors.

    Args:
        idx: Position within the (possibly subsampled) dataset.

    Returns:
        ``(image, label)``: a ``(C, H, W)`` float tensor and a scalar
        long tensor.
    """
    # Lazy subsampling keeps the full image array; translate the index.
    if self._indices is not None:
        raw = self.images[self._indices[idx]]
    else:
        raw = self.images[idx]

    # Single-channel arrays must be 2-D for PIL grayscale conversion.
    as_pil = Image.fromarray(raw.squeeze() if raw.shape[-1] == 1 else raw)

    if self.transform:
        img_t = self.transform(as_pil)
    else:
        img_t = transforms.functional.to_tensor(as_pil)

    # Labels were eagerly subsampled, so idx (not the remapped index) applies.
    return img_t, torch.tensor(int(self.labels[idx]), dtype=torch.long)

DatasetData(path, name, is_rgb, num_classes) dataclass

Metadata container for a loaded dataset.

Stores path and format info instead of raw arrays to save RAM.

DataLoaderFactory(dataset_cfg, training_cfg, aug_cfg, num_workers, metadata)

Orchestrates the creation of optimized PyTorch DataLoaders.

This factory centralizes the configuration of training, validation, and testing pipelines. It ensures that data transformations, class balancing, and hardware settings are synchronized across all splits.

Attributes:

Name Type Description
dataset_cfg DatasetConfig

Dataset sub-config.

training_cfg TrainingConfig

Training sub-config.

aug_cfg AugmentationConfig

Augmentation sub-config.

num_workers int

Resolved worker count from hardware config.

metadata DatasetData

Data path and raw format information.

ds_meta DatasetMetadata

Official dataset registry specifications.

logger Logger

Module-specific logger.

Initializes the factory with environment and dataset metadata.

Parameters:

Name Type Description Default
dataset_cfg DatasetConfig

Dataset sub-config (splits, classes, resolution).

required
training_cfg TrainingConfig

Training sub-config (batch size, seed).

required
aug_cfg AugmentationConfig

Augmentation sub-config (transforms pipeline).

required
num_workers int

Resolved worker count from hardware config.

required
metadata DatasetData

Metadata from the data fetcher/downloader.

required
Source code in orchard/data_handler/loader.py
def __init__(
    self,
    dataset_cfg: DatasetConfig,
    training_cfg: TrainingConfig,
    aug_cfg: AugmentationConfig,
    num_workers: int,
    metadata: DatasetData,
) -> None:
    """
    Stores configuration and resolves registry metadata for the dataset.

    Args:
        dataset_cfg: Dataset sub-config (splits, classes, resolution).
        training_cfg: Training sub-config (batch size, seed).
        aug_cfg: Augmentation sub-config (transforms pipeline).
        num_workers: Worker count already resolved from hardware config.
        metadata: Path/format info produced by the data fetcher/downloader.
    """
    self.dataset_cfg = dataset_cfg
    self.training_cfg = training_cfg
    self.aug_cfg = aug_cfg
    self._num_workers = num_workers
    self.metadata = metadata
    self.logger = logging.getLogger(LOGGER_NAME)

    # Official registry specification for this dataset at this resolution.
    registry = DatasetRegistryWrapper(resolution=dataset_cfg.resolution)
    self.ds_meta = registry.get_dataset(dataset_cfg.dataset_name)

build(is_optuna=False)

Constructs and returns the full suite of DataLoaders.

Assembles train/val/test splits with transforms, optional class balancing, and hardware-aware infrastructure settings.

Parameters:

Name Type Description Default
is_optuna bool

If True, use memory-conservative settings for hyperparameter tuning (fewer workers, no persistent workers).

False

Returns:

Type Description
tuple[DataLoader[Any], DataLoader[Any], DataLoader[Any]]

A tuple of (train_loader, val_loader, test_loader).

Source code in orchard/data_handler/loader.py
def build(
    self, is_optuna: bool = False
) -> tuple[DataLoader[Any], DataLoader[Any], DataLoader[Any]]:
    """
    Builds the train/val/test DataLoader triple.

    Applies the transform pipelines, optional class balancing, and
    hardware-aware loader settings consistently across all splits.

    Args:
        is_optuna: If True, use memory-conservative settings for
            hyperparameter tuning (fewer workers, no persistent workers).

    Returns:
        ``(train_loader, val_loader, test_loader)``.
    """
    train_tf, eval_tf = self._get_transformation_pipelines()

    # lazy -> mmap-backed datasets; eager -> full RAM copies.
    make_ds = VisionDataset.lazy if self.dataset_cfg.lazy_loading else VisionDataset.from_npz
    common = {"path": self.metadata.path, "seed": self.training_cfg.seed}

    train_ds = make_ds(
        **common,
        split="train",
        transform=train_tf,
        max_samples=self.dataset_cfg.max_samples,
    )

    # When training is capped, shrink val/test proportionally (with a floor).
    eval_cap = None
    if self.dataset_cfg.max_samples:
        eval_cap = max(
            _MIN_SUBSAMPLED_SPLIT,
            int(self.dataset_cfg.max_samples * self.dataset_cfg.val_ratio),
        )

    val_ds = make_ds(**common, split="val", transform=eval_tf, max_samples=eval_cap)
    test_ds = make_ds(**common, split="test", transform=eval_tf, max_samples=eval_cap)

    sampler = self._get_balancing_sampler(train_ds)
    infra_kwargs = self._get_infrastructure_kwargs(is_optuna=is_optuna)

    # A balancing sampler replaces shuffling; DataLoader forbids using both.
    train_loader = DataLoader(
        train_ds,
        batch_size=self.training_cfg.batch_size,
        shuffle=(sampler is None),
        sampler=sampler,
        drop_last=True,
        **infra_kwargs,
    )
    val_loader = DataLoader(
        val_ds, batch_size=self.training_cfg.batch_size, shuffle=False, **infra_kwargs
    )
    test_loader = DataLoader(
        test_ds, batch_size=self.training_cfg.batch_size, shuffle=False, **infra_kwargs
    )

    self.logger.info(
        "%s%s %-18s: (%s)%s → Train:[%d] Val:[%d] Test:[%d]",
        LogStyle.INDENT,
        LogStyle.ARROW,
        "DataLoaders",
        self.dataset_cfg.processing_mode,
        " (Optuna)" if is_optuna else "",
        len(train_ds),
        len(val_ds),
        len(test_ds),
    )

    return train_loader, val_loader, test_loader

show_sample_images(loader, save_path, *, mean=None, std=None, arch_name='Model', fig_dpi=_DEFAULT_DPI, num_samples=16, title_prefix=None)

Extract a batch from the DataLoader and save a grid of sample images.

Saves images with their corresponding labels to verify data integrity and augmentations.

Parameters:

Name Type Description Default
loader DataLoader[Any]

The PyTorch DataLoader to sample from.

required
save_path Path

Full path (including filename) to save the resulting image.

required
mean tuple[float, ...] | None

Per-channel mean for denormalization.

None
std tuple[float, ...] | None

Per-channel std for denormalization.

None
arch_name str

Architecture name for the figure title.

'Model'
fig_dpi int

DPI for the saved figure.

_DEFAULT_DPI
num_samples int

Number of images to display in the grid.

16
title_prefix str | None

Optional string to prepend to the figure title.

None
Source code in orchard/data_handler/data_explorer.py
def show_sample_images(
    loader: DataLoader[Any],
    save_path: Path,
    *,
    mean: tuple[float, ...] | None = None,
    std: tuple[float, ...] | None = None,
    arch_name: str = "Model",
    fig_dpi: int = _DEFAULT_DPI,
    num_samples: int = 16,
    title_prefix: str | None = None,
) -> None:
    """
    Extract a batch from the DataLoader and save a grid of sample images.

    Saves images with their corresponding labels to verify data integrity and augmentations.

    Args:
        loader: The PyTorch DataLoader to sample from.
        save_path: Full path (including filename) to save the resulting image.
        mean: Per-channel mean for denormalization.
        std: Per-channel std for denormalization.
        arch_name: Architecture name for the figure title.
        fig_dpi: DPI for the saved figure.
        num_samples: Number of images to display in the grid.
        title_prefix: Optional string to prepend to the figure title.
    """
    # Pull a single batch; labels are discarded (the grid shows images only).
    try:
        batch_images, _ = next(iter(loader))
    except StopIteration:
        logger.error("DataLoader is empty. Cannot generate sample images.")
        return

    # The batch may be smaller than num_samples (tiny datasets / small batch).
    actual_samples = min(len(batch_images), num_samples)
    images = batch_images[:actual_samples]

    # Apply denormalization if mean/std are provided
    # (undoes Normalize so pixels are displayable; assumes the batch is
    # (N, C, H, W) so the (-1, 1, 1) view broadcasts per channel — TODO confirm).
    if mean is not None and std is not None:
        mean_t = torch.tensor(mean).view(-1, 1, 1)
        std_t = torch.tensor(std).view(-1, 1, 1)
        images = images * std_t + mean_t

    # Clamp to the displayable [0, 1] range after denormalization.
    images = torch.clamp(images, 0, 1)

    # Create a grid
    grid = make_grid(images, nrow=int(actual_samples**0.5), padding=2)  # pragma: no mutate

    # Convert to numpy HWC for matplotlib
    # NOTE(review): the grayscale branch assumes make_grid returns a (1, H, W)
    # grid for 1-channel input; some torchvision versions promote 1-channel
    # batches to 3 channels, making squeeze(0) a no-op — verify on the pinned
    # torchvision version.
    plt.imshow(  # pragma: no mutate
        (
            grid.squeeze(0).cpu().numpy()
            if images.shape[1] == 1
            else grid.permute(1, 2, 0).cpu().numpy()
        ),
        cmap="gray" if images.shape[1] == 1 else None,  # pragma: no mutate
    )

    # Figure title
    # NOTE(review): arch_name and the sample count are joined with no
    # separator, producing e.g. "Model16 Samples" — confirm this is intended.
    title_str = f"{arch_name}{actual_samples} Samples"  # pragma: no mutate
    if title_prefix:
        title_str = f"{title_prefix}{title_str}"  # pragma: no mutate
    plt.title(title_str, fontsize=14)  # pragma: no mutate

    plt.axis("off")  # pragma: no mutate
    plt.tight_layout()  # pragma: no mutate

    # Ensure target directory exists
    save_path.parent.mkdir(parents=True, exist_ok=True)

    plt.savefig(save_path, dpi=fig_dpi, bbox_inches="tight")  # pragma: no mutate
    plt.close()  # pragma: no mutate
    logger.info(
        "%s%s %-18s: %s",
        LogStyle.INDENT,
        LogStyle.ARROW,
        "Sample Grid",
        save_path.name,
    )

show_samples_for_dataset(loader, dataset_name, run_paths, *, mean=None, std=None, arch_name='Model', fig_dpi=_DEFAULT_DPI, num_samples=16, resolution=None)

Generate a grid of sample images from a dataset and save to the figures directory.

Parameters:

Name Type Description Default
loader DataLoader[Any]

PyTorch DataLoader to sample images from.

required
dataset_name str

Name of the dataset, used in the filename and title.

required
run_paths RunPaths

RunPaths instance to resolve figure saving path.

required
mean tuple[float, ...] | None

Per-channel mean for denormalization.

None
std tuple[float, ...] | None

Per-channel std for denormalization.

None
arch_name str

Architecture name for the figure title.

'Model'
fig_dpi int

DPI for the saved figure.

_DEFAULT_DPI
num_samples int

Number of images to include in the grid.

16
resolution int | None

Resolution to include in filename to avoid overwriting.

None
Source code in orchard/data_handler/data_explorer.py
def show_samples_for_dataset(
    loader: DataLoader[Any],
    dataset_name: str,
    run_paths: RunPaths,
    *,
    mean: tuple[float, ...] | None = None,
    std: tuple[float, ...] | None = None,
    arch_name: str = "Model",
    fig_dpi: int = _DEFAULT_DPI,
    num_samples: int = 16,
    resolution: int | None = None,
) -> None:
    """
    Save a sample-image grid for a dataset into the run's figures directory.

    Args:
        loader: PyTorch DataLoader to sample images from.
        dataset_name: Name of the dataset, used in the filename and title.
        run_paths: RunPaths instance to resolve figure saving path.
        mean: Per-channel mean for denormalization.
        std: Per-channel std for denormalization.
        arch_name: Architecture name for the figure title.
        fig_dpi: DPI for the saved figure.
        num_samples: Number of images to include in the grid.
        resolution: Resolution to include in filename to avoid overwriting.
    """
    # Encode the resolution into the filename so runs at different
    # resolutions do not overwrite each other's grids.
    suffix = f"_{resolution}x{resolution}" if resolution else ""
    target = run_paths.get_fig_path(f"{dataset_name}/sample_grid{suffix}.png")
    target.parent.mkdir(parents=True, exist_ok=True)

    show_sample_images(
        loader=loader,
        save_path=target,
        mean=mean,
        std=std,
        arch_name=arch_name,
        fig_dpi=fig_dpi,
        num_samples=num_samples,
        title_prefix=f"{dataset_name}{suffix}",
    )

create_synthetic_dataset(num_classes=8, samples=100, resolution=28, channels=3, name='syntheticmnist')

Create a synthetic NPZ-compatible dataset for testing.

This function generates random image data and labels, saves them to a temporary .npz file, and returns a DatasetData object that can be used with the existing data pipeline.

Parameters:

Name Type Description Default
num_classes int

Number of classification categories (default: 8)

8
samples int

Number of training samples (default: 100)

100
resolution int

Image resolution (HxW) (default: 28)

28
channels int

Number of color channels (default: 3 for RGB)

3
name str

Dataset name for identification (default: "syntheticmnist")

'syntheticmnist'

Returns:

Name Type Description
DatasetData DatasetData

A data object compatible with the existing pipeline

Example

data = create_synthetic_dataset(num_classes=8, samples=100)
train_loader, val_loader, test_loader = get_dataloaders(
    data, cfg.dataset, cfg.training, cfg.augmentation, cfg.num_workers
)

Source code in orchard/data_handler/diagnostic/synthetic.py
def create_synthetic_dataset(
    num_classes: int = 8,
    samples: int = 100,
    resolution: int = 28,
    channels: int = 3,
    name: str = "syntheticmnist",
) -> DatasetData:
    """
    Build a random NPZ dataset on disk and describe it with DatasetData.

    Random images and labels for train/val/test splits are written to a
    temporary ``.npz`` file using the standard key layout, so the result
    plugs straight into the existing data pipeline.

    Args:
        num_classes: Number of classification categories (default: 8)
        samples: Number of training samples (default: 100)
        resolution: Image resolution (HxW) (default: 28)
        channels: Number of color channels (default: 3 for RGB)
        name: Dataset name for identification (default: "syntheticmnist")

    Returns:
        DatasetData: A data object compatible with the existing pipeline

    Example:
        >>> data = create_synthetic_dataset(num_classes=8, samples=100)
        >>> train_loader, val_loader, test_loader = get_dataloaders(
        ...     data, cfg.dataset, cfg.training, cfg.augmentation, cfg.num_workers
        ... )
    """
    rng = np.random.default_rng(_SYNTHETIC_SEED)

    def random_split(count: int) -> tuple[np.ndarray, np.ndarray]:
        """One split: uint8 images in [0, pixel range) plus uint8 labels."""
        split_images = rng.integers(
            0, _SYNTHETIC_PIXEL_RANGE, (count, resolution, resolution, channels), dtype=np.uint8
        )
        split_labels = rng.integers(0, num_classes, (count, 1), dtype=np.uint8)
        return split_images, split_labels

    # Draw splits in train/val/test order to keep RNG consumption stable.
    train_images, train_labels = random_split(samples)

    # Validation and test sets are 10% of the training size, with a floor.
    eval_count = max(_MIN_SPLIT_SAMPLES, samples // 10)
    val_images, val_labels = random_split(eval_count)
    test_images, test_labels = random_split(eval_count)

    # Reserve a temporary .npz path; close immediately so savez can reopen it
    # (important on platforms with exclusive file locking).
    handle = tempfile.NamedTemporaryFile(
        suffix=".npz", delete=False, prefix="synthetic_dataset_"
    )
    handle.close()
    out_path = Path(handle.name)

    # Persist in the standard NPZ key layout expected by VisionDataset.
    np.savez(
        out_path,
        train_images=train_images,
        train_labels=train_labels,
        val_images=val_images,
        val_labels=val_labels,
        test_images=test_images,
        test_labels=test_labels,
    )

    return DatasetData(
        path=out_path,
        name=name,
        is_rgb=(channels == 3),
        num_classes=num_classes,
    )

create_synthetic_grayscale_dataset(num_classes=8, samples=100, resolution=28)

Create a synthetic grayscale NPZ dataset for testing.

Convenience function for creating single-channel (grayscale) synthetic data.

Parameters:

Name Type Description Default
num_classes int

Number of classification categories (default: 8)

8
samples int

Number of training samples (default: 100)

100
resolution int

Image resolution (HxW) (default: 28)

28

Returns:

Name Type Description
DatasetData DatasetData

A grayscale data object compatible with the pipeline

Source code in orchard/data_handler/diagnostic/synthetic.py
def create_synthetic_grayscale_dataset(
    num_classes: int = 8,
    samples: int = 100,
    resolution: int = 28,
) -> DatasetData:
    """
    Build a single-channel synthetic NPZ dataset for testing.

    Thin wrapper around create_synthetic_dataset that fixes the channel
    count to 1 and tags the dataset with a grayscale-specific name.

    Args:
        num_classes: Number of classification categories (default: 8)
        samples: Number of training samples (default: 100)
        resolution: Image resolution (HxW) (default: 28)

    Returns:
        DatasetData: A grayscale data object compatible with the pipeline
    """
    return create_synthetic_dataset(
        num_classes,
        samples,
        resolution,
        channels=1,
        name="syntheticmnist_gray",
    )

create_temp_loader(dataset_path, batch_size=_DEFAULT_HEALTHCHECK_BATCH_SIZE)

Load a NPZ dataset lazily and return a DataLoader for health checks.

This avoids loading the entire dataset into RAM at once, which is critical for large datasets (e.g., 224x224 images).

Source code in orchard/data_handler/diagnostic/temp_loader.py
def create_temp_loader(
    dataset_path: Path, batch_size: int = _DEFAULT_HEALTHCHECK_BATCH_SIZE
) -> DataLoader[Any]:
    """
    Build a shuffled DataLoader over a lazily-opened NPZ dataset.

    Loading lazily avoids pulling the whole dataset into RAM at once,
    which is critical for large datasets (e.g., 224x224 images).
    """
    return DataLoader(
        VisionDataset.lazy(dataset_path),
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
    )

ensure_dataset_npz(metadata, retries=5, delay=5.0)

Dispatcher that routes each dataset to its dedicated fetch pipeline.

Automatically detects dataset type from metadata.name and delegates to the appropriate download/conversion module. Adding a new domain (e.g. a new resolution or source) only requires a new branch here and a corresponding fetch module.

Parameters:

Name Type Description Default
metadata DatasetMetadata

Metadata containing URL, MD5, name and target path.

required
retries int

Max number of download attempts (NPZ fetcher only).

5
delay float

Delay (seconds) between retries (NPZ fetcher only).

5.0

Returns:

Name Type Description
Path Path

Path to the successfully validated .npz file.

Source code in orchard/data_handler/fetcher.py
def ensure_dataset_npz(
    metadata: DatasetMetadata,
    retries: int = 5,
    delay: float = 5.0,
) -> Path:
    """
    Dispatcher that routes each dataset to its dedicated fetch pipeline.

    Automatically detects dataset type from ``metadata.name`` and delegates
    to the appropriate download/conversion module. Adding a new domain
    (e.g. a new resolution or source) only requires a new branch here and
    a corresponding fetch module.

    Args:
        metadata (DatasetMetadata): Metadata containing URL, MD5, name and target path.
        retries (int): Max number of download attempts (NPZ fetcher only).
        delay (float): Delay (seconds) between retries (NPZ fetcher only).

    Returns:
        Path: Path to the successfully validated .npz file.
    """
    # Galaxy10 requires HDF5 download and conversion to NPZ
    if metadata.name == "galaxy10":
        from .fetchers import ensure_galaxy10_npz

        return ensure_galaxy10_npz(metadata)

    # CIFAR-10/100 via torchvision download and NPZ conversion
    if metadata.name in ("cifar10", "cifar100"):
        from .fetchers import ensure_cifar_npz

        return ensure_cifar_npz(metadata)

    # Default: standard NPZ download with retries and MD5 check
    from .fetchers import ensure_medmnist_npz

    return ensure_medmnist_npz(metadata, retries=retries, delay=delay)

load_dataset(metadata)

Ensures the dataset is present and returns its metadata container.

Source code in orchard/data_handler/fetcher.py
def load_dataset(metadata: DatasetMetadata) -> DatasetData:
    """Ensure the dataset exists on disk and return its metadata container."""
    return _load_and_inspect(metadata)

get_dataloaders(metadata, dataset_cfg, training_cfg, aug_cfg, num_workers, is_optuna=False)

Convenience function for creating train/val/test DataLoaders.

Wraps DataLoaderFactory for streamlined loader construction with automatic class balancing, hardware optimization, and Optuna support.

Parameters:

Name Type Description Default
metadata DatasetData

Dataset metadata from load_dataset (paths, splits).

required
dataset_cfg DatasetConfig

Dataset sub-config (splits, classes, resolution).

required
training_cfg TrainingConfig

Training sub-config (batch size, seed).

required
aug_cfg AugmentationConfig

Augmentation sub-config (transforms pipeline).

required
num_workers int

Resolved worker count from hardware config.

required
is_optuna bool

If True, use memory-conservative settings for hyperparameter tuning.

False

Returns:

Type Description
tuple[DataLoader[Any], DataLoader[Any], DataLoader[Any]]

A 3-tuple of (train_loader, val_loader, test_loader).

Example

data = load_dataset(ds_meta)
loaders = get_dataloaders(
    data, cfg.dataset, cfg.training, cfg.augmentation, cfg.num_workers
)

Source code in orchard/data_handler/loader.py
def get_dataloaders(
    metadata: DatasetData,
    dataset_cfg: DatasetConfig,
    training_cfg: TrainingConfig,
    aug_cfg: AugmentationConfig,
    num_workers: int,
    is_optuna: bool = False,
) -> tuple[DataLoader[Any], DataLoader[Any], DataLoader[Any]]:
    """
    One-call helper that builds the train/val/test DataLoaders.

    Thin convenience wrapper over DataLoaderFactory: class balancing,
    hardware optimization, and Optuna-aware settings are handled there.

    Args:
        metadata: Dataset metadata from load_dataset (paths, splits).
        dataset_cfg: Dataset sub-config (splits, classes, resolution).
        training_cfg: Training sub-config (batch size, seed).
        aug_cfg: Augmentation sub-config (transforms pipeline).
        num_workers: Resolved worker count from hardware config.
        is_optuna: If True, use memory-conservative settings for
            hyperparameter tuning.

    Returns:
        A 3-tuple of (train_loader, val_loader, test_loader).

    Example:
        >>> data = load_dataset(ds_meta)
        >>> loaders = get_dataloaders(
        ...     data, cfg.dataset, cfg.training, cfg.augmentation, cfg.num_workers
        ... )
    """
    factory = DataLoaderFactory(dataset_cfg, training_cfg, aug_cfg, num_workers, metadata)
    return factory.build(is_optuna=is_optuna)

get_augmentations_description(aug_cfg, img_size, mixup_alpha, ds_meta=None)

Generates descriptive string of augmentations for logging.

Parameters:

Name Type Description Default
aug_cfg AugmentationConfig

Augmentation sub-configuration

required
img_size int

Target image size for resized crop

required
mixup_alpha float

MixUp alpha (0.0 to disable)

required
ds_meta DatasetMetadata | None

Dataset metadata (if provided, respects domain flags)

None

Returns:

Type Description
str

Human-readable augmentation summary

Source code in orchard/data_handler/transforms.py
def get_augmentations_description(
    aug_cfg: AugmentationConfig,
    img_size: int,
    mixup_alpha: float,
    ds_meta: DatasetMetadata | None = None,
) -> str:
    """
    Builds a human-readable summary of the active augmentations for logging.

    Args:
        aug_cfg: Augmentation sub-configuration
        img_size: Target image size for resized crop
        mixup_alpha: MixUp alpha (0.0 to disable)
        ds_meta: Dataset metadata (if provided, respects domain flags)

    Returns:
        Comma-separated augmentation summary string
    """
    # Without metadata, assume the conservative defaults (both flags on),
    # which suppresses the geometric and photometric entries.
    anatomical = ds_meta.is_anatomical if ds_meta else True
    texture_based = ds_meta.is_texture_based if ds_meta else True

    parts: list[str] = []

    # Geometric augmentations are only listed for non-anatomical datasets.
    if not anatomical:
        parts.append(f"HFlip({aug_cfg.hflip})")
        parts.append(f"Rotation({aug_cfg.rotation_angle}°)")

    # Color jitter is only listed for non-texture-based datasets.
    if not texture_based:
        parts.append(f"Jitter({aug_cfg.jitter_val})")

    # Resized crop is always active.
    parts.append(f"ResizedCrop({img_size} ({aug_cfg.min_scale}, 1.0))")

    if mixup_alpha > 0:
        parts.append(f"MixUp(α={mixup_alpha})")

    return ", ".join(parts)

get_pipeline_transforms(aug_cfg, img_size, ds_meta, *, force_rgb=True, norm_mean=None, norm_std=None)

Constructs training and validation transformation pipelines.

Dynamically adapts to dataset characteristics (RGB vs Grayscale) and optionally promotes grayscale to 3-channel for pretrained-weight compatibility. Uses torchvision v2 transforms for improved CPU/GPU performance.

Pipeline Logic
  1. Convert to tensor format (ToImage + ToDtype)
  2. Promote 1-channel to 3-channel when force_rgb is True and the dataset is native grayscale
  3. Apply domain-aware augmentations (training only): geometric transforms disabled for anatomical datasets, color jitter reduced for texture-based datasets
  4. Normalize with dataset-specific statistics

Parameters:

Name Type Description Default
aug_cfg AugmentationConfig

Augmentation sub-configuration

required
img_size int

Target image size

required
ds_meta DatasetMetadata

Dataset metadata (channels, domain flags)

required
force_rgb bool

Promote grayscale datasets to 3-channel RGB

True
norm_mean tuple[float, ...] | None

Pre-computed normalization mean (from DatasetConfig). When None, computed from ds_meta + force_rgb.

None
norm_std tuple[float, ...] | None

Pre-computed normalization std (from DatasetConfig). When None, computed from ds_meta + force_rgb.

None

Returns:

Type Description
tuple[Compose, Compose]

tuple[v2.Compose, v2.Compose]: (train_transform, val_transform)

Source code in orchard/data_handler/transforms.py
def get_pipeline_transforms(
    aug_cfg: AugmentationConfig,
    img_size: int,
    ds_meta: DatasetMetadata,
    *,
    force_rgb: bool = True,
    norm_mean: tuple[float, ...] | None = None,
    norm_std: tuple[float, ...] | None = None,
) -> tuple[v2.Compose, v2.Compose]:
    """
    Constructs training and validation transformation pipelines.

    Dynamically adapts to dataset characteristics (RGB vs Grayscale) and
    optionally promotes grayscale to 3-channel for pretrained-weight
    compatibility.  Uses torchvision v2 transforms for improved CPU/GPU
    performance.

    Pipeline Logic:
        1. Convert to tensor format (ToImage + ToDtype)
        2. Promote 1-channel to 3-channel when ``force_rgb`` is True
           and the dataset is native grayscale
        3. Apply domain-aware augmentations (training only):
           geometric transforms disabled for anatomical datasets,
           color jitter reduced for texture-based datasets
        4. Normalize with dataset-specific statistics

    Args:
        aug_cfg: Augmentation sub-configuration
        img_size: Target image size
        ds_meta: Dataset metadata (channels, domain flags)
        force_rgb: Promote grayscale datasets to 3-channel RGB
        norm_mean: Pre-computed normalization mean (from DatasetConfig).
            When None, computed from ds_meta + force_rgb.
        norm_std: Pre-computed normalization std (from DatasetConfig).
            When None, computed from ds_meta + force_rgb.

    Returns:
        tuple[v2.Compose, v2.Compose]: (train_transform, val_transform)
    """
    native_rgb = ds_meta.in_channels == 3
    needs_promotion = force_rgb and not native_rgb

    # Normalization statistics: caller-provided values win (single source of
    # truth via DatasetConfig.mean/std); otherwise derive from metadata,
    # replicating the single grayscale channel when promoting to RGB.
    if norm_mean is not None and norm_std is not None:
        mean, std = list(norm_mean), list(norm_std)
    elif needs_promotion:
        mean, std = [ds_meta.mean[0]] * 3, [ds_meta.std[0]] * 3
    else:
        mean, std = list(ds_meta.mean), list(ds_meta.std)

    def base_ops() -> list[v2.Transform]:
        """
        Foundational operations shared by both pipelines.

        Returns:
            list of base transforms (tensor conversion + channel promotion)
        """
        common: list[v2.Transform] = [
            v2.ToImage(),  # PIL/ndarray -> tensor
            v2.ToDtype(torch.float32, scale=True),  # scale to [0,1]
        ]
        # 1-channel -> 3-channel for architecture compatibility
        if needs_promotion:
            common.append(v2.Grayscale(num_output_channels=3))
        return common

    # --- TRAINING PIPELINE ---
    # Domain-aware augmentations gated on is_anatomical / is_texture_based.
    aug_ops: list[v2.Transform] = []

    # Geometric: skipped for anatomical datasets (orientation is diagnostic)
    if not ds_meta.is_anatomical:
        aug_ops += [
            v2.RandomHorizontalFlip(p=aug_cfg.hflip),
            v2.RandomRotation(aug_cfg.rotation_angle),
        ]

    # Photometric: skipped for texture-based datasets (fine patterns are fragile);
    # saturation jitter only makes sense for natively RGB data.
    if not ds_meta.is_texture_based:
        aug_ops.append(
            v2.ColorJitter(
                brightness=aug_cfg.jitter_val,
                contrast=aug_cfg.jitter_val,
                saturation=aug_cfg.jitter_val if native_rgb else 0.0,
            )
        )

    train_transform = v2.Compose(
        [
            *base_ops(),
            *aug_ops,
            v2.RandomResizedCrop(
                size=img_size,
                scale=(aug_cfg.min_scale, 1.0),
                antialias=True,
                interpolation=v2.InterpolationMode.BILINEAR,
            ),
            v2.Normalize(mean=mean, std=std),
        ]
    )

    # --- VALIDATION/INFERENCE PIPELINE ---
    # Deterministic transformations only (no augmentation)
    val_transform = v2.Compose(
        [
            *base_ops(),
            v2.Resize(size=img_size, antialias=True),
            v2.Normalize(mean=mean, std=std),
        ]
    )

    return train_transform, val_transform