# orchard.data_handler

Data Handler Package.

This package manages the end-to-end data pipeline, from downloading raw NPZ files using the Dataset Registry to providing fully configured PyTorch DataLoaders.
`VisionDataset(images, labels, *, transform=None)`

Bases: `Dataset[tuple[Tensor, Tensor]]`

PyTorch Dataset for NPZ-based image data.

The constructor accepts raw NumPy arrays directly (no I/O). Use the classmethod factories to load from disk:

- `VisionDataset.from_npz(...)`: eager, loads the full split into RAM.
- `VisionDataset.lazy(...)`: memory-mapped, pages loaded on demand.

Initializes the dataset from pre-loaded arrays.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `images` | `NDArray[Any]` | Image array. | required |
| `labels` | `NDArray[Any]` | Label array; any shape that flattens to a 1-D vector. | required |
| `transform` | `Compose \| None` | Pipeline of Torchvision transforms. | `None` |

Source code in `orchard/data_handler/dataset.py`
`from_npz(path, split='train', *, transform=None, max_samples=None, seed=DEFAULT_SEED)` (classmethod)

Eagerly load a split from an NPZ archive into RAM.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `path` | `Path` | Path to the dataset NPZ archive. | required |
| `split` | `str` | Dataset split to load. | `'train'` |
| `transform` | `Compose \| None` | Pipeline of Torchvision transforms. | `None` |
| `max_samples` | `int \| None` | If set, limits the number of samples (subsampling). | `None` |
| `seed` | `int` | Random seed for deterministic subsampling. | `DEFAULT_SEED` |

Source code in `orchard/data_handler/dataset.py`
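The deterministic subsampling that `max_samples` and `seed` provide can be sketched as follows. This is a minimal illustration, not the package's actual implementation; the use of `numpy.random.default_rng` is an assumption.

```python
import numpy as np

def subsample(images: np.ndarray, labels: np.ndarray, max_samples: int, seed: int):
    """Pick a reproducible random subset of the data.

    The same seed always yields the same subset, which keeps
    experiments comparable across runs.
    """
    rng = np.random.default_rng(seed)  # seeded generator -> deterministic choice
    n = min(max_samples, len(images))
    idx = rng.choice(len(images), size=n, replace=False)
    return images[idx], labels[idx]

images = np.arange(100).reshape(100, 1)
labels = np.arange(100)
a_img, a_lab = subsample(images, labels, max_samples=10, seed=42)
b_img, b_lab = subsample(images, labels, max_samples=10, seed=42)
assert (a_img == b_img).all() and (a_lab == b_lab).all()  # same seed, same subset
```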
`lazy(path, split='train', *, transform=None, max_samples=None, seed=DEFAULT_SEED)` (classmethod)

Memory-mapped load from an NPZ archive (no full RAM copy).

Images are loaded page-by-page on demand. Suitable for large datasets that do not fit in RAM and for lightweight health checks.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `path` | `Path` | Path to the dataset NPZ archive. | required |
| `split` | `str` | Dataset split to load. | `'train'` |
| `transform` | `Compose \| None` | Pipeline of Torchvision transforms. | `None` |
| `max_samples` | `int \| None` | If set, limits the number of samples (subsampling). | `None` |
| `seed` | `int` | Random seed for deterministic subsampling. | `DEFAULT_SEED` |

Source code in `orchard/data_handler/dataset.py`
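The exact paging mechanism used by `lazy` is not reproduced here, but the general memory-mapping idea can be shown with plain NumPy. Note that compressed `.npz` members cannot be memory-mapped directly; memory mapping applies to uncompressed `.npy` data, which is the assumption in this sketch (the file name is illustrative).

```python
import tempfile
from pathlib import Path
import numpy as np

# Write an array to disk as an uncompressed .npy file (mmap-able).
tmp = Path(tempfile.mkdtemp())
arr = np.random.default_rng(0).integers(0, 255, size=(1000, 28, 28), dtype=np.uint8)
np.save(tmp / "train_images.npy", arr)

# Memory-map instead of reading everything: pages are faulted in
# lazily as individual samples are sliced.
mm = np.load(tmp / "train_images.npy", mmap_mode="r")
sample = np.asarray(mm[42])  # materialize just one sample in RAM

assert isinstance(mm, np.memmap)
assert sample.shape == (28, 28)
```

This is why lazy loading suits large datasets and quick health checks: only the samples actually touched are ever paged into memory.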
`__len__()`

`__getitem__(idx)`

Retrieves a standardized sample-label pair.

The image is converted to a PIL object to ensure compatibility with Torchvision v2 transforms before being returned as a PyTorch Tensor.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `idx` | `int` | Sample index. | required |

Returns:

| Type | Description |
|---|---|
| `tuple[Tensor, Tensor]` | A pair of `(image, label)` where `image` is a `Tensor` and `label` is a scalar long tensor. |

Source code in `orchard/data_handler/dataset.py`
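The NumPy-to-PIL conversion step can be illustrated with Pillow alone (the tensor conversion and transform pipeline are omitted; the sample values are synthetic):

```python
import numpy as np
from PIL import Image

# A fake 28x28 grayscale sample, as it might come out of an NPZ array.
raw = np.random.default_rng(0).integers(0, 255, size=(28, 28), dtype=np.uint8)

# Torchvision v2 transforms accept PIL images, so the raw array is
# wrapped in a PIL object before the transform pipeline runs.
pil = Image.fromarray(raw)

# Round-trip back to numpy: pixel values survive unchanged.
back = np.asarray(pil)
assert (back == raw).all()
assert pil.mode == "L"  # single-channel grayscale
```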
`DetectionDataset(images, annotations, *, transform=None)`

Bases: `Dataset[tuple[Tensor, dict[str, Tensor]]]`

PyTorch Dataset for detection tasks with bounding-box annotations.

Each sample returns an `(image, target)` pair where `target` is a dict with `boxes` (N, 4) in `[x1, y1, x2, y2]` format and `labels` (N,) as int64 class indices.

Initialize from pre-loaded arrays and an annotation list.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `images` | `NDArray[Any]` | Image array. | required |
| `annotations` | `list[dict[str, NDArray[Any]]]` | Per-image annotation dicts with `boxes` and `labels`. | required |
| `transform` | `Compose \| None` | Torchvision transform pipeline for images. | `None` |

Source code in `orchard/data_handler/detection_dataset.py`
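The expected annotation layout can be made concrete with a small validity check. NumPy arrays stand in for the tensors here; the helper and its values are illustrative, not part of the package.

```python
import numpy as np

def validate_annotation(ann: dict) -> None:
    """Sanity-check one per-image annotation dict.

    boxes: (N, 4) in [x1, y1, x2, y2] order; labels: (N,) integer class ids.
    """
    boxes, labels = ann["boxes"], ann["labels"]
    assert boxes.ndim == 2 and boxes.shape[1] == 4          # (N, 4)
    assert labels.shape == (boxes.shape[0],)                # one label per box
    # Every box must satisfy x2 > x1 and y2 > y1.
    assert (boxes[:, 2] > boxes[:, 0]).all()
    assert (boxes[:, 3] > boxes[:, 1]).all()

ann = {
    "boxes": np.array([[10, 20, 50, 60], [0, 0, 5, 5]], dtype=np.float32),
    "labels": np.array([3, 1], dtype=np.int64),
}
validate_annotation(ann)  # passes for a well-formed annotation
```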
`from_arrays(images, annotations, *, transform=None, max_samples=None, seed=DEFAULT_SEED)` (classmethod)

Build a DetectionDataset with optional subsampling.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `images` | `NDArray[Any]` | Image array. | required |
| `annotations` | `list[dict[str, NDArray[Any]]]` | Per-image annotation dicts. | required |
| `transform` | `Compose \| None` | Transform pipeline. | `None` |
| `max_samples` | `int \| None` | Limit on the number of samples. | `None` |
| `seed` | `int` | Random seed for deterministic subsampling. | `DEFAULT_SEED` |

Source code in `orchard/data_handler/detection_dataset.py`
`from_npz(image_path, annotation_path, split='train', *, transform=None, max_samples=None, seed=DEFAULT_SEED)` (classmethod)

Load a detection dataset from two NPZ archives: one for images, one for annotations.

The image NPZ has key `{split}_images`. The annotation NPZ has keys `{split}_boxes` (list of `(N_i, 4)` arrays) and `{split}_labels` (list of `(N_i,)` arrays), stored as object arrays.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `image_path` | `Path` | Path to the images NPZ. | required |
| `annotation_path` | `Path` | Path to the annotations NPZ. | required |
| `split` | `str` | Dataset split to load. | `'train'` |
| `transform` | `Compose \| None` | Transform pipeline. | `None` |
| `max_samples` | `int \| None` | Limit on the number of samples. | `None` |
| `seed` | `int` | Random seed. | `DEFAULT_SEED` |

Source code in `orchard/data_handler/detection_dataset.py`
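The object-array storage described above can be reproduced with NumPy directly. The key names follow the documented `{split}_boxes` / `{split}_labels` convention; the file name and box values are illustrative.

```python
import tempfile
from pathlib import Path
import numpy as np

tmp = Path(tempfile.mkdtemp())

# Ragged per-image boxes/labels are stored as object arrays so each
# image can carry a different number of boxes (N_i varies).
boxes = np.empty(2, dtype=object)
boxes[0] = np.array([[0, 0, 10, 10]], dtype=np.float32)                 # 1 box
boxes[1] = np.array([[5, 5, 20, 20], [1, 2, 3, 4]], dtype=np.float32)   # 2 boxes
labels = np.empty(2, dtype=object)
labels[0] = np.array([1], dtype=np.int64)
labels[1] = np.array([0, 2], dtype=np.int64)

np.savez(tmp / "annotations.npz", train_boxes=boxes, train_labels=labels)

# Object arrays are pickled inside the archive, so loading them back
# requires allow_pickle=True.
with np.load(tmp / "annotations.npz", allow_pickle=True) as npz:
    loaded_boxes = npz["train_boxes"]
    loaded_labels = npz["train_labels"]

assert loaded_boxes[1].shape == (2, 4)
assert loaded_labels[1].shape == (2,)
```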
`__len__()`

`__getitem__(idx)`

Retrieve an image and its bounding-box annotations.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `idx` | `int` | Sample index. | required |

Returns:

| Type | Description |
|---|---|
| `tuple[Tensor, dict[str, Tensor]]` | Tuple of `(image_tensor, target_dict)` where `target_dict` holds the `boxes` and `labels` tensors. |

Source code in `orchard/data_handler/detection_dataset.py`
`DatasetData(path, name, is_rgb, num_classes, annotation_path=None)` (dataclass)

Metadata container for a loaded dataset.

Stores path and format info instead of raw arrays to save RAM.

Attributes:

| Name | Type | Description |
|---|---|---|
| `path` | `Path` | Path to the images NPZ. |
| `name` | `str` | Dataset identifier. |
| `is_rgb` | `bool` | Whether images are RGB (3 channels). |
| `num_classes` | `int` | Number of categories. |
| `annotation_path` | `Path \| None` | Path to the annotations NPZ (detection datasets only; `None` for classification). |
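Based on the attributes above, the container can be pictured as a plain dataclass. This is a sketch matching the documented fields, not necessarily the exact source; the path and dataset name in the usage line are hypothetical.

```python
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path

@dataclass
class DatasetData:
    path: Path                            # images NPZ on disk
    name: str                             # dataset identifier
    is_rgb: bool                          # 3-channel RGB vs grayscale
    num_classes: int                      # number of categories
    annotation_path: Path | None = None   # detection datasets only

# Classification datasets leave annotation_path as None.
meta = DatasetData(Path("/tmp/example.npz"), "example", True, 8)
assert meta.annotation_path is None
```

Keeping only paths and format flags here (rather than the arrays themselves) is what lets the pipeline defer the heavy I/O to the dataset factories.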
`DataLoaderFactory(dataset_cfg, training_cfg, aug_cfg, num_workers, metadata, task_type='classification')`

Orchestrates the creation of optimized PyTorch DataLoaders.

This factory centralizes the configuration of the training, validation, and testing pipelines. It ensures that data transformations, class balancing, and hardware settings are synchronized across all splits.

Attributes:

| Name | Type | Description |
|---|---|---|
| `dataset_cfg` | `DatasetConfig` | Dataset sub-config. |
| `training_cfg` | `TrainingConfig` | Training sub-config. |
| `aug_cfg` | `AugmentationConfig` | Augmentation sub-config. |
| `num_workers` | `int` | Resolved worker count from the hardware config. |
| `metadata` | `DatasetData` | Data path and raw format information. |
| `ds_meta` | `DatasetMetadata` | Official dataset registry specifications. |
| `logger` | `Logger` | Module-specific logger. |

Initializes the factory with environment and dataset metadata.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `dataset_cfg` | `DatasetConfig` | Dataset sub-config (splits, classes, resolution). | required |
| `training_cfg` | `TrainingConfig` | Training sub-config (batch size, seed). | required |
| `aug_cfg` | `AugmentationConfig` | Augmentation sub-config (transforms pipeline). | required |
| `num_workers` | `int` | Resolved worker count from the hardware config. | required |
| `metadata` | `DatasetData` | Metadata from the data fetcher/downloader. | required |
| `task_type` | `str` | Task type (`'classification'` or `'detection'`). | `'classification'` |

Source code in `orchard/data_handler/loader.py`
`build(is_optuna=False)`

Constructs and returns the full suite of DataLoaders.

Assembles train/val/test splits with transforms, optional class balancing, and hardware-aware infrastructure settings.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `is_optuna` | `bool` | If True, use memory-conservative settings for hyperparameter tuning (fewer workers, no persistent workers). | `False` |

Returns:

| Type | Description |
|---|---|
| `tuple[DataLoader[Any], DataLoader[Any], DataLoader[Any]]` | A tuple of `(train_loader, val_loader, test_loader)`. |

Source code in `orchard/data_handler/loader.py`
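The memory-conservative switch can be sketched as a small kwargs selector. The cap of two workers is a hypothetical policy for illustration; the exact settings chosen inside `build` are not reproduced here.

```python
def loader_kwargs(num_workers: int, is_optuna: bool) -> dict:
    """Choose DataLoader infrastructure settings.

    During Optuna trials many loaders exist concurrently, so worker
    processes are capped and not kept alive between epochs.
    """
    workers = min(num_workers, 2) if is_optuna else num_workers
    return {
        "num_workers": workers,
        # Persistent workers only pay off outside short-lived trials.
        "persistent_workers": workers > 0 and not is_optuna,
        "pin_memory": True,
    }

full = loader_kwargs(8, is_optuna=False)
trial = loader_kwargs(8, is_optuna=True)
assert full["num_workers"] == 8 and full["persistent_workers"]
assert trial["num_workers"] == 2 and not trial["persistent_workers"]
```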
`detection_collate_fn(batch)`

Collate detection samples into list-based batches.

Unlike the default PyTorch collate (which stacks tensors), detection requires list-based batching because each image can have a different number of bounding boxes.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `batch` | `list[tuple[Tensor, dict[str, Any]]]` | List of `(image, target_dict)` tuples from the dataset. | required |

Returns:

| Type | Description |
|---|---|
| `tuple[list[Tensor], list[dict[str, Any]]]` | Tuple of (list of image tensors, list of target dicts). |

Source code in `orchard/data_handler/collate.py`
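The mechanics of list-based batching fit in a few lines. This sketch uses plain strings and dicts in place of tensors so the ragged-batch behavior is visible; it illustrates the idea rather than reproducing the package's function.

```python
def detection_collate(batch):
    """Keep images and targets as lists instead of stacking.

    Stacking would fail here: each target's 'boxes' entry has a
    different first dimension (variable box count per image).
    """
    images, targets = zip(*batch)
    return list(images), list(targets)

batch = [
    ("img0", {"boxes": [[0, 0, 1, 1]], "labels": [1]}),
    ("img1", {"boxes": [[0, 0, 2, 2], [1, 1, 3, 3]], "labels": [0, 2]}),
]
images, targets = detection_collate(batch)
assert images == ["img0", "img1"]
assert len(targets[1]["boxes"]) == 2  # ragged box counts preserved
```

Passing such a function as `collate_fn` to a DataLoader is the standard way to batch variable-length detection targets.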
`show_sample_images(loader, save_path, *, mean=None, std=None, arch_name='Model', fig_dpi=_DEFAULT_DPI, num_samples=16, title_prefix=None)`

Extract a batch from the DataLoader and save a grid of sample images.

Saves images with their corresponding labels to verify data integrity and augmentations.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `loader` | `DataLoader[Any]` | The PyTorch DataLoader to sample from. | required |
| `save_path` | `Path` | Full path (including filename) for the resulting image. | required |
| `mean` | `tuple[float, ...] \| None` | Per-channel mean for denormalization. | `None` |
| `std` | `tuple[float, ...] \| None` | Per-channel std for denormalization. | `None` |
| `arch_name` | `str` | Architecture name for the figure title. | `'Model'` |
| `fig_dpi` | `int` | DPI for the saved figure. | `_DEFAULT_DPI` |
| `num_samples` | `int` | Number of images to display in the grid. | `16` |
| `title_prefix` | `str \| None` | Optional string to prepend to the figure title. | `None` |

Source code in `orchard/data_handler/data_explorer.py`
`show_samples_for_dataset(loader, dataset_name, run_paths, *, mean=None, std=None, arch_name='Model', fig_dpi=_DEFAULT_DPI, num_samples=16, resolution=None)`

Generate a grid of sample images from a dataset and save it to the figures directory.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `loader` | `DataLoader[Any]` | PyTorch DataLoader to sample images from. | required |
| `dataset_name` | `str` | Name of the dataset, used in the filename and title. | required |
| `run_paths` | `RunPaths` | RunPaths instance to resolve the figure saving path. | required |
| `mean` | `tuple[float, ...] \| None` | Per-channel mean for denormalization. | `None` |
| `std` | `tuple[float, ...] \| None` | Per-channel std for denormalization. | `None` |
| `arch_name` | `str` | Architecture name for the figure title. | `'Model'` |
| `fig_dpi` | `int` | DPI for the saved figure. | `_DEFAULT_DPI` |
| `num_samples` | `int` | Number of images to include in the grid. | `16` |
| `resolution` | `int \| None` | Resolution to include in the filename to avoid overwriting. | `None` |

Source code in `orchard/data_handler/data_explorer.py`
`create_synthetic_dataset(num_classes=8, samples=100, resolution=28, channels=3, name='syntheticmnist')`

Create a synthetic NPZ-compatible dataset for testing.

This function generates random image data and labels, saves them to a temporary `.npz` file, and returns a `DatasetData` object that can be used with the existing data pipeline.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `num_classes` | `int` | Number of target categories. | `8` |
| `samples` | `int` | Number of training samples. | `100` |
| `resolution` | `int` | Image resolution (HxW). | `28` |
| `channels` | `int` | Number of color channels (3 for RGB). | `3` |
| `name` | `str` | Dataset name for identification. | `'syntheticmnist'` |

Returns:

| Name | Type | Description |
|---|---|---|
| `DatasetData` | `DatasetData` | A data object compatible with the existing pipeline. |

Example:

```python
data = create_synthetic_dataset(num_classes=8, samples=100)
train_loader, val_loader, test_loader = get_dataloaders(
    data, cfg.dataset, cfg.training, cfg.augmentation, cfg.num_workers
)
```

Source code in `orchard/data_handler/diagnostic/synthetic.py`
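The core of such a generator can be sketched with NumPy alone. The `train_images` / `train_labels` key names follow the `{split}_*` convention documented for the detection loader and are an assumption here; the file name is illustrative.

```python
import tempfile
from pathlib import Path
import numpy as np

def make_synthetic_npz(num_classes=8, samples=100, resolution=28, channels=3) -> Path:
    """Write a random classification dataset to a temporary NPZ file."""
    rng = np.random.default_rng(0)
    images = rng.integers(
        0, 256, size=(samples, resolution, resolution, channels), dtype=np.uint8
    )
    labels = rng.integers(0, num_classes, size=samples, dtype=np.int64)
    path = Path(tempfile.mkdtemp()) / "synthetic.npz"
    # Key names assumed to follow the {split}_images / {split}_labels layout.
    np.savez(path, train_images=images, train_labels=labels)
    return path

path = make_synthetic_npz()
with np.load(path) as npz:
    img_shape = npz["train_images"].shape
    max_label = int(npz["train_labels"].max())
assert img_shape == (100, 28, 28, 3)
assert max_label < 8
```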
`create_synthetic_grayscale_dataset(num_classes=8, samples=100, resolution=28)`

Create a synthetic grayscale NPZ dataset for testing.

Convenience function for creating single-channel (grayscale) synthetic data.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `num_classes` | `int` | Number of target categories. | `8` |
| `samples` | `int` | Number of training samples. | `100` |
| `resolution` | `int` | Image resolution (HxW). | `28` |

Returns:

| Name | Type | Description |
|---|---|---|
| `DatasetData` | `DatasetData` | A grayscale data object compatible with the pipeline. |

Source code in `orchard/data_handler/diagnostic/synthetic.py`
`create_temp_loader(dataset_path, batch_size=_DEFAULT_HEALTHCHECK_BATCH_SIZE)`

Load an NPZ dataset lazily and return a DataLoader for health checks.

This avoids loading the entire dataset into RAM at once, which is critical for large datasets (e.g., 224x224 images).

Source code in `orchard/data_handler/diagnostic/temp_loader.py`
`ensure_dataset_npz(metadata, retries=5, delay=5.0)`

Dispatcher that routes each dataset to its dedicated fetch pipeline.

Automatically detects the dataset type from `metadata.name` and delegates to the appropriate download/conversion module. Adding a new domain (e.g., a new resolution or source) only requires a new branch here and a corresponding fetch module.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `metadata` | `DatasetMetadata` | Metadata containing URL, MD5, name, and target path. | required |
| `retries` | `int` | Maximum number of download attempts (NPZ fetcher only). | `5` |
| `delay` | `float` | Delay in seconds between retries (NPZ fetcher only). | `5.0` |

Returns:

| Name | Type | Description |
|---|---|---|
| `Path` | `Path` | Path to the successfully validated `.npz` file. |

Source code in `orchard/data_handler/dispatcher.py`
`load_dataset(metadata)`

Ensure the dataset is present on disk and return its inspection results.

Downloads the dataset if missing, then inspects all samples to determine format properties (color mode, class count).

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `metadata` | `DatasetMetadata` | Dataset metadata (URL, MD5, name, path). | required |

Returns:

| Type | Description |
|---|---|
| `DatasetData` | Inspection results with format info derived from the full dataset. |

Source code in `orchard/data_handler/dispatcher.py`
`get_dataloaders(metadata, dataset_cfg, training_cfg, aug_cfg, num_workers, is_optuna=False, task_type='classification')`

Convenience function for creating train/val/test DataLoaders.

Wraps DataLoaderFactory for streamlined loader construction with automatic class balancing, hardware optimization, and Optuna support.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `metadata` | `DatasetData` | Dataset metadata from `load_dataset` (paths, splits). | required |
| `dataset_cfg` | `DatasetConfig` | Dataset sub-config (splits, classes, resolution). | required |
| `training_cfg` | `TrainingConfig` | Training sub-config (batch size, seed). | required |
| `aug_cfg` | `AugmentationConfig` | Augmentation sub-config (transforms pipeline). | required |
| `num_workers` | `int` | Resolved worker count from the hardware config. | required |
| `is_optuna` | `bool` | If True, use memory-conservative settings for hyperparameter tuning. | `False` |
| `task_type` | `str` | Task type (`'classification'` or `'detection'`). | `'classification'` |

Returns:

| Type | Description |
|---|---|
| `tuple[DataLoader[Any], DataLoader[Any], DataLoader[Any]]` | A 3-tuple of `(train_loader, val_loader, test_loader)`. |

Example:

```python
data = load_dataset(ds_meta)
loaders = get_dataloaders(
    data, cfg.dataset, cfg.training, cfg.augmentation, cfg.num_workers
)
```

Source code in `orchard/data_handler/loader.py`
`get_augmentations_description(aug_cfg, img_size, mixup_alpha, ds_meta=None)`

Generates a descriptive string of the active augmentations for logging.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `aug_cfg` | `AugmentationConfig` | Augmentation sub-configuration. | required |
| `img_size` | `int` | Target image size for the resized crop. | required |
| `mixup_alpha` | `float` | MixUp alpha (0.0 to disable). | required |
| `ds_meta` | `DatasetMetadata \| None` | Dataset metadata (if provided, domain flags are respected). | `None` |

Returns:

| Type | Description |
|---|---|
| `str` | Human-readable augmentation summary. |

Source code in `orchard/data_handler/transforms.py`
`get_pipeline_transforms(aug_cfg, img_size, ds_meta, *, force_rgb=True, norm_mean=None, norm_std=None)`

Constructs training and validation transformation pipelines.

Dynamically adapts to dataset characteristics (RGB vs. grayscale) and optionally promotes grayscale to 3-channel for pretrained-weight compatibility. Uses torchvision v2 transforms for improved CPU/GPU performance.

Pipeline logic:

- Convert to tensor format (`ToImage` + `ToDtype`)
- Promote 1-channel to 3-channel when `force_rgb` is True and the dataset is native grayscale
- Apply domain-aware augmentations (training only): geometric transforms disabled for anatomical datasets, color jitter reduced for texture-based datasets
- Normalize with dataset-specific statistics

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `aug_cfg` | `AugmentationConfig` | Augmentation sub-configuration. | required |
| `img_size` | `int` | Target image size. | required |
| `ds_meta` | `DatasetMetadata` | Dataset metadata (channels, domain flags). | required |
| `force_rgb` | `bool` | Promote grayscale datasets to 3-channel RGB. | `True` |
| `norm_mean` | `tuple[float, ...] \| None` | Pre-computed normalization mean (from DatasetConfig). When None, computed from `ds_meta` + `force_rgb`. | `None` |
| `norm_std` | `tuple[float, ...] \| None` | Pre-computed normalization std (from DatasetConfig). When None, computed from `ds_meta` + `force_rgb`. | `None` |

Returns:

| Type | Description |
|---|---|
| `tuple[Compose, Compose]` | `(train_transform, val_transform)` as `v2.Compose` pipelines. |

Source code in `orchard/data_handler/transforms.py`
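The grayscale-to-RGB promotion step can be illustrated in NumPy. Inside the actual pipeline this is done by a torchvision v2 transform on tensors; the channel-replication below shows the same effect on a raw array and is only a sketch.

```python
import numpy as np

def promote_to_rgb(img: np.ndarray) -> np.ndarray:
    """Replicate a single channel three times: (H, W, 1) -> (H, W, 3).

    Pretrained backbones expect 3-channel input, so native grayscale
    datasets are promoted when force_rgb is enabled.
    """
    assert img.ndim == 3 and img.shape[-1] == 1
    return np.repeat(img, 3, axis=-1)

gray = np.random.default_rng(0).integers(0, 255, size=(28, 28, 1), dtype=np.uint8)
rgb = promote_to_rgb(gray)
assert rgb.shape == (28, 28, 3)
assert (rgb[..., 0] == rgb[..., 1]).all()  # all three channels identical
```

Because the three channels are identical copies, per-channel normalization statistics must also be expanded to three values when a grayscale dataset is promoted, which is why `norm_mean` / `norm_std` depend on both `ds_meta` and `force_rgb`.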