Skip to content

diagnostic

orchard.data_handler.diagnostic

Diagnostic Utilities for Health Checks and Smoke Tests.

This private submodule provides lightweight data utilities used exclusively for pipeline validation (health checks, smoke tests, CI). These are not part of the production training pipeline.

SyntheticDetectionData(image_path, annotation_path, num_classes, name)

Container for synthetic detection dataset paths and metadata.

Attributes:

Name Type Description
image_path Path

Path to images NPZ.

annotation_path Path

Path to annotations NPZ.

num_classes int

Number of object classes (excluding background).

name str

Dataset identifier.

Source code in orchard/data_handler/diagnostic/synthetic_detection.py
def __init__(
    self,
    image_path: Path,
    annotation_path: Path,
    num_classes: int,
    name: str,
) -> None:
    self.image_path = image_path
    self.annotation_path = annotation_path
    self.num_classes = num_classes
    self.name = name

create_synthetic_dataset(num_classes=8, samples=100, resolution=28, channels=3, name='syntheticmnist')

Create a synthetic NPZ-compatible dataset for testing.

This function generates random image data and labels, saves them to a temporary .npz file, and returns a DatasetData object that can be used with the existing data pipeline.

Parameters:

Name Type Description Default
num_classes int

Number of target categories (default: 8)

8
samples int

Number of training samples (default: 100)

100
resolution int

Image resolution (HxW) (default: 28)

28
channels int

Number of color channels (default: 3 for RGB)

3
name str

Dataset name for identification (default: "syntheticmnist")

'syntheticmnist'

Returns:

Name Type Description
DatasetData DatasetData

A data object compatible with the existing pipeline

Example

>>> data = create_synthetic_dataset(num_classes=8, samples=100)
>>> train_loader, val_loader, test_loader = get_dataloaders(
...     data, cfg.dataset, cfg.training, cfg.augmentation, cfg.num_workers
... )

Source code in orchard/data_handler/diagnostic/synthetic.py
def create_synthetic_dataset(
    num_classes: int = 8,  # pragma: no mutate
    samples: int = 100,  # pragma: no mutate
    resolution: int = 28,  # pragma: no mutate
    channels: int = 3,  # pragma: no mutate
    name: str = "syntheticmnist",  # pragma: no mutate
) -> DatasetData:
    """
    Create a synthetic NPZ-compatible dataset for testing.

    Random uint8 images and integer labels are generated for the
    train/val/test splits, written to a temporary ``.npz`` file, and
    wrapped in a DatasetData object usable with the existing data pipeline.

    Args:
        num_classes: Number of target categories (default: 8)
        samples: Number of training samples (default: 100)
        resolution: Image resolution (HxW) (default: 28)
        channels: Number of color channels (default: 3 for RGB)
        name: Dataset name for identification (default: "syntheticmnist")

    Returns:
        DatasetData: A data object compatible with the existing pipeline

    Example:
        >>> data = create_synthetic_dataset(num_classes=8, samples=100)
        >>> train_loader, val_loader, test_loader = get_dataloaders(
        ...     data, cfg.dataset, cfg.training, cfg.augmentation, cfg.num_workers
        ... )
    """
    # pragma: no mutate start
    rng = np.random.default_rng(_SYNTHETIC_SEED)

    def _draw_split(count: int) -> tuple[np.ndarray, np.ndarray]:
        # Draw images first, then labels, so the RNG stream is consumed
        # in the same fixed order for every split (reproducible output).
        images = rng.integers(
            0,
            _SYNTHETIC_PIXEL_RANGE,
            (count, resolution, resolution, channels),
            dtype=np.uint8,
        )
        labels = rng.integers(0, num_classes, (count, 1), dtype=np.uint8)
        return images, labels

    # Training split.
    train_images, train_labels = _draw_split(samples)

    # Hold-out splits are a tenth of the training size, floored at the minimum.
    val_samples = max(MIN_SPLIT_SAMPLES, samples // 10)
    test_samples = max(MIN_SPLIT_SAMPLES, samples // 10)

    val_images, val_labels = _draw_split(val_samples)
    test_images, test_labels = _draw_split(test_samples)

    # Reserve a named temp file (kept on disk) so the pipeline can reopen
    # it by path after this function returns.
    temp_file = tempfile.NamedTemporaryFile(
        suffix=".npz", delete=False, prefix="synthetic_dataset_"
    )
    # pragma: no mutate end
    temp_path = Path(temp_file.name)
    temp_file.close()

    # Persist all six arrays under the key names the loader expects.
    np.savez(
        temp_path,
        train_images=train_images,
        train_labels=train_labels,
        val_images=val_images,
        val_labels=val_labels,
        test_images=test_images,
        test_labels=test_labels,
    )

    # Three channels means RGB; anything else is treated as non-RGB.
    return DatasetData(
        path=temp_path,
        name=name,
        is_rgb=channels == 3,
        num_classes=num_classes,
    )

create_synthetic_grayscale_dataset(num_classes=8, samples=100, resolution=28)

Create a synthetic grayscale NPZ dataset for testing.

Convenience function for creating single-channel (grayscale) synthetic data.

Parameters:

Name Type Description Default
num_classes int

Number of target categories (default: 8)

8
samples int

Number of training samples (default: 100)

100
resolution int

Image resolution (HxW) (default: 28)

28

Returns:

Name Type Description
DatasetData DatasetData

A grayscale data object compatible with the pipeline

Source code in orchard/data_handler/diagnostic/synthetic.py
def create_synthetic_grayscale_dataset(
    num_classes: int = 8,  # pragma: no mutate
    samples: int = 100,  # pragma: no mutate
    resolution: int = 28,  # pragma: no mutate
) -> DatasetData:
    """
    Create a synthetic grayscale NPZ dataset for testing.

    Thin wrapper over create_synthetic_dataset that pins the channel count
    to 1 and uses a grayscale-specific dataset name.

    Args:
        num_classes: Number of target categories (default: 8)
        samples: Number of training samples (default: 100)
        resolution: Image resolution (HxW) (default: 28)

    Returns:
        DatasetData: A grayscale data object compatible with the pipeline
    """
    return create_synthetic_dataset(
        num_classes=num_classes,
        samples=samples,
        resolution=resolution,
        name="syntheticmnist_gray",
        channels=1,  # single channel == grayscale
    )

create_synthetic_detection_dataset(num_classes=4, samples=50, resolution=64, channels=3, name='synthetic_detection')

Create a synthetic detection dataset for testing.

Generates random images with random bounding boxes and saves them as NPZ files (images + annotations separately).

Parameters:

Name Type Description Default
num_classes int

Number of object categories (default: 4).

4
samples int

Number of training images (default: 50).

50
resolution int

Image size in pixels (default: 64).

64
channels int

Color channels (default: 3).

3
name str

Dataset identifier (default: "synthetic_detection").

'synthetic_detection'

Returns:

Type Description
SyntheticDetectionData

SyntheticDetectionData with paths to generated NPZ files.

Source code in orchard/data_handler/diagnostic/synthetic_detection.py
def create_synthetic_detection_dataset(
    num_classes: int = 4,  # pragma: no mutate
    samples: int = 50,  # pragma: no mutate
    resolution: int = 64,  # pragma: no mutate
    channels: int = 3,  # pragma: no mutate
    name: str = "synthetic_detection",  # pragma: no mutate
) -> SyntheticDetectionData:
    """
    Create a synthetic detection dataset for testing.

    Random images with random bounding boxes are generated for the
    train/val/test splits and written to two temporary NPZ files: one
    holding the images, one holding the annotations.

    Args:
        num_classes: Number of object categories (default: 4).
        samples: Number of training images (default: 50).
        resolution: Image size in pixels (default: 64).
        channels: Color channels (default: 3).
        name: Dataset identifier (default: "synthetic_detection").

    Returns:
        SyntheticDetectionData with paths to generated NPZ files.
    """
    rng = np.random.default_rng(_SYNTHETIC_SEED)  # pragma: no mutate

    # Splits are drawn in a fixed order (train, val, test) so the RNG
    # stream — and therefore the generated data — is reproducible.
    train_imgs, train_boxes, train_labels = _generate_split(
        rng, samples, resolution, channels, num_classes
    )
    # pragma: no mutate start
    val_samples = max(MIN_SPLIT_SAMPLES, samples // 10)
    test_samples = max(MIN_SPLIT_SAMPLES, samples // 10)
    # pragma: no mutate end
    val_imgs, val_boxes, val_labels = _generate_split(
        rng, val_samples, resolution, channels, num_classes
    )
    test_imgs, test_boxes, test_labels = _generate_split(
        rng, test_samples, resolution, channels, num_classes
    )

    def _reserve_npz(prefix: str) -> Path:
        # Reserve a named temp file (kept on disk) and return its path.
        # pragma: no mutate start
        handle = tempfile.NamedTemporaryFile(suffix=".npz", delete=False, prefix=prefix)
        # pragma: no mutate end
        reserved = Path(handle.name)
        handle.close()
        return reserved

    def _as_object_array(rows: list[npt.NDArray[Any]]) -> npt.NDArray[Any]:
        # Per-image boxes/labels have variable length, so they must be
        # packed element-by-element into a dtype=object array.
        packed = np.empty(len(rows), dtype=object)
        for idx in range(len(rows)):
            packed[idx] = rows[idx]
        return packed

    # Images NPZ.
    img_path = _reserve_npz("det_images_")
    np.savez(
        img_path,
        train_images=train_imgs,
        val_images=val_imgs,
        test_images=test_imgs,
    )

    # Annotations NPZ (object arrays for variable-length boxes).
    ann_path = _reserve_npz("det_annotations_")
    np.savez(
        ann_path,
        train_boxes=_as_object_array(train_boxes),
        train_labels=_as_object_array(train_labels),
        val_boxes=_as_object_array(val_boxes),
        val_labels=_as_object_array(val_labels),
        test_boxes=_as_object_array(test_boxes),
        test_labels=_as_object_array(test_labels),
    )

    return SyntheticDetectionData(
        image_path=img_path,
        annotation_path=ann_path,
        num_classes=num_classes,
        name=name,
    )

create_temp_loader(dataset_path, batch_size=_DEFAULT_HEALTHCHECK_BATCH_SIZE)

Load a NPZ dataset lazily and return a DataLoader for health checks.

This avoids loading the entire dataset into RAM at once, which is critical for large datasets (e.g., 224x224 images).

Source code in orchard/data_handler/diagnostic/temp_loader.py
def create_temp_loader(
    dataset_path: Path, batch_size: int = _DEFAULT_HEALTHCHECK_BATCH_SIZE
) -> DataLoader[Any]:
    """
    Build a shuffled DataLoader over a lazily-opened NPZ dataset.

    The dataset is opened via ``VisionDataset.lazy`` so the whole file is
    not pulled into RAM at once, which is critical for large datasets
    (e.g., 224x224 images). Batches are loaded in-process (num_workers=0).
    """
    lazy_dataset = VisionDataset.lazy(dataset_path)
    return DataLoader(
        lazy_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
    )