Skip to content

fetchers

orchard.data_handler.fetchers

Domain-Specific Dataset Fetchers.

Each module in this sub-package handles the download and conversion logic for a single dataset domain (MedMNIST, Galaxy10, etc.), keeping the main fetcher dispatcher clean as new sources are added.

Design note: fetcher modules intentionally duplicate some logic (e.g. stratified splitting) rather than sharing a base class. Each fetcher is a self-contained adapter to an external resource whose URL, format, or availability may change without notice. Isolation ensures that breaking changes in one source never cascade to others, and that any single fetcher can be removed cleanly.

ensure_cifar_npz(metadata)

Ensures a CIFAR dataset is downloaded and converted to NPZ format.

Supports both CIFAR-10 and CIFAR-100 via metadata.name routing.

Parameters:

Name Type Description Default
metadata DatasetMetadata

DatasetMetadata with name ('cifar10' or 'cifar100') and path

required

Returns:

Type Description
Path

Path to validated NPZ file

Source code in orchard/data_handler/fetchers/cifar_converter.py
def ensure_cifar_npz(metadata: DatasetMetadata) -> Path:
    """
    Ensures a CIFAR dataset is downloaded and converted to NPZ format.

    Supports both CIFAR-10 and CIFAR-100 via metadata.name routing.

    Args:
        metadata: DatasetMetadata with name ('cifar10' or 'cifar100') and path

    Returns:
        Path to validated NPZ file
    """
    npz_file = metadata.path

    # Fast path: the converted archive is already on disk, nothing to do.
    if npz_file.exists():
        logger.debug(
            "%s%s %-18s: %s found at %s",
            LogStyle.INDENT,
            LogStyle.ARROW,
            "Dataset",
            metadata.display_name,
            npz_file.name,
        )
        return npz_file

    # Deferred import keeps torchvision out of the hot path until a
    # download is actually required.
    from torchvision.datasets import CIFAR10, CIFAR100

    # Route on metadata.name: anything other than 'cifar100' falls back to CIFAR-10.
    dataset_cls = CIFAR100 if metadata.name == "cifar100" else CIFAR10
    return _download_and_convert(metadata, dataset_cls)

ensure_galaxy10_npz(metadata)

Ensures Galaxy10 is downloaded and converted to NPZ format.

Parameters:

Name Type Description Default
metadata DatasetMetadata

DatasetMetadata with URL and path

required

Returns:

Type Description
Path

Path to validated NPZ file

Source code in orchard/data_handler/fetchers/galaxy10_converter.py
def ensure_galaxy10_npz(metadata: DatasetMetadata) -> Path:
    """
    Ensures Galaxy10 is downloaded and converted to NPZ format.

    Args:
        metadata: DatasetMetadata with URL and path

    Returns:
        Path to validated NPZ file
    """
    from ...core import md5_checksum

    npz_path = metadata.path
    # Sentinel value used in metadata before a real checksum has been recorded.
    placeholder = "placeholder_will_be_calculated_after_conversion"

    # Reuse an existing NPZ when its checksum matches, or when no real
    # checksum has been pinned in the metadata yet.
    if npz_path.exists():
        if metadata.md5_checksum in (md5_checksum(npz_path), placeholder):
            logger.debug(
                "%s%s %-18s: Galaxy10 found at %s",
                LogStyle.INDENT,
                LogStyle.ARROW,
                "Dataset",
                npz_path.name,
            )
            return npz_path
        logger.warning("Galaxy10 NPZ MD5 mismatch, regenerating...")
        npz_path.unlink()

    # Fetch the raw HDF5 archive next to the target NPZ location.
    h5_file = npz_path.parent / "Galaxy10_DECals.h5"
    download_galaxy10_h5(metadata.url, h5_file)

    # Convert, defaulting to 224 px when metadata carries no native resolution.
    convert_galaxy10_to_npz(
        h5_path=h5_file,
        output_npz=npz_path,
        target_size=metadata.native_resolution or 224,
    )

    # Report the freshly computed checksum of the converted file.
    final_md5 = md5_checksum(npz_path)
    logger.info("%s%s %-18s: %s", LogStyle.INDENT, LogStyle.ARROW, "MD5", final_md5)

    # Remind the maintainer to pin the real checksum once it is known.
    if metadata.md5_checksum == placeholder:
        logger.info(
            '%s%s %-18s: Update metadata.md5_checksum = "%s"',
            LogStyle.INDENT,
            LogStyle.ARROW,
            "Action Required",
            final_md5,
        )

    return npz_path

ensure_medmnist_npz(metadata, retries=5, delay=5.0)

Downloads a MedMNIST NPZ file with retries and MD5 validation.

Implements a three-phase strategy:
  1. Return immediately if a valid local copy already exists.
  2. Delete any corrupted local copy.
  3. Stream-download with retry loop and atomic file replacement.

Parameters:

Name Type Description Default
metadata DatasetMetadata

Metadata containing URL, MD5, name and target path.

required
retries int

Max number of download attempts.

5
delay float

Base delay (seconds) between retries (quadratic backoff on 429).

5.0

Returns:

Name Type Description
Path Path

Path to the successfully validated .npz file.

Raises:

Type Description
OrchardDatasetError

If all download attempts fail.

Source code in orchard/data_handler/fetchers/medmnist_fetcher.py
def ensure_medmnist_npz(
    metadata: DatasetMetadata,
    retries: int = 5,
    delay: float = 5.0,
) -> Path:
    """
    Downloads a MedMNIST NPZ file with retries and MD5 validation.

    Implements a three-phase strategy:
        1. Return immediately if a valid local copy already exists.
        2. Delete any corrupted local copy.
        3. Stream-download with retry loop and atomic file replacement.

    Args:
        metadata (DatasetMetadata): Metadata containing URL, MD5, name and target path.
        retries (int): Max number of download attempts.
        delay (float): Base delay (seconds) between retries (quadratic backoff on 429).

    Returns:
        Path: Path to the successfully validated .npz file.

    Raises:
        OrchardDatasetError: If all download attempts fail.
    """
    npz_path = metadata.path

    # Phase 1: a valid local copy short-circuits the download entirely.
    if _is_valid_npz(npz_path, metadata.md5_checksum):
        logger.debug(
            "%s%s %-18s: '%s' found at %s",
            LogStyle.INDENT,
            LogStyle.ARROW,
            "Dataset",
            metadata.name,
            npz_path.name,
        )
        return npz_path

    # Phase 2: clear out any stale or corrupted artefact before re-downloading.
    if npz_path.exists():
        logger.warning("Corrupted dataset found, deleting: %s", npz_path)
        npz_path.unlink()

    # Phase 3: stream to a temp file, validate, then atomically swap in.
    logger.info("%s%s %-18s: %s", LogStyle.INDENT, LogStyle.ARROW, "Downloading", metadata.name)
    npz_path.parent.mkdir(parents=True, exist_ok=True)
    partial = npz_path.with_suffix(".tmp")

    attempt = 0
    while attempt < retries:
        attempt += 1
        try:
            _stream_download(metadata.url, partial)

            if not _is_valid_npz(partial, metadata.md5_checksum):
                got = md5_checksum(partial)
                logger.error("MD5 mismatch: expected %s, got %s", metadata.md5_checksum, got)
                raise OrchardDatasetError("Downloaded file failed MD5 or header validation")

            # Path.replace is atomic on the same filesystem: readers never
            # observe a half-written dataset.
            partial.replace(npz_path)
            logger.info(
                "%s%s %-18s: %s", LogStyle.INDENT, LogStyle.SUCCESS, "Verified", metadata.name
            )
            return npz_path

        except (OrchardDatasetError, OSError) as err:
            # Drop the partial download before retrying or giving up.
            if partial.exists():
                partial.unlink()

            if attempt >= retries:
                logger.error("Download failed after %d attempts", retries)
                raise OrchardDatasetError(f"Could not download {metadata.name}") from err

            wait = _retry_delay(err, delay, attempt)
            logger.warning(
                "Attempt %d/%d failed: %s. Retrying in %ss...", attempt, retries, err, wait
            )
            time.sleep(wait)

    raise OrchardDatasetError("Unexpected error in dataset download logic.")  # pragma: no cover