Skip to content

galaxy10_converter

orchard.data_handler.fetchers.galaxy10_converter

Galaxy10 DECals Dataset Converter.

Downloads the Galaxy10 DECals HDF5 dataset and converts it to an NPZ file compatible with the Orchard ML pipeline, creating train/val/test splits in the process.

download_galaxy10_h5(url, target_h5, retries=3, timeout=600, chunk_size=8192)

Downloads Galaxy10 HDF5 file with retry logic.

Parameters:

Name Type Description Default
url str

Download URL

required
target_h5 Path

Path to save HDF5 file

required
retries int

Number of download attempts

3
timeout int

Download timeout in seconds

600
chunk_size int

Streaming chunk size in bytes.

8192
Source code in orchard/data_handler/fetchers/galaxy10_converter.py
def download_galaxy10_h5(
    url: str,
    target_h5: Path,
    retries: int = 3,
    timeout: int = 600,
    chunk_size: int = 8192,
) -> None:
    """
    Fetch the Galaxy10 HDF5 archive, retrying when an attempt fails.

    An existing ``target_h5`` is treated as a cache hit and nothing is
    downloaded. Otherwise the response body is streamed into a ``.tmp`` file
    next to the target, which is renamed onto ``target_h5`` only after the
    whole body has been written.

    Args:
        url: Download URL
        target_h5: Path to save HDF5 file
        retries: Number of download attempts
        timeout: Timeout in seconds handed to ``requests.get``
        chunk_size: Streaming chunk size in bytes.

    Raises:
        OrchardDatasetError: If the last attempt fails with an ``OSError``.
    """
    if target_h5.exists():
        # Cache hit: report it and skip the network entirely.
        logger.info(
            "%s%s %-18s: %s",
            LogStyle.INDENT,
            LogStyle.ARROW,
            "HDF5 Cache",
            target_h5.name,
        )
        return

    target_h5.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = target_h5.with_suffix(".tmp")

    for attempt_no in range(1, retries + 1):
        try:
            logger.info(
                "%s%s %-18s: Galaxy10 (attempt %d/%d)",
                LogStyle.INDENT,
                LogStyle.ARROW,
                "Downloading",
                attempt_no,
                retries,
            )

            with requests.get(url, timeout=timeout, stream=True) as response:
                response.raise_for_status()

                with open(tmp_path, "wb") as sink:
                    for block in response.iter_content(chunk_size=chunk_size):
                        if not block:
                            continue
                        sink.write(block)

            # Promote the finished download; a failure here is retried as well.
            tmp_path.replace(target_h5)
            logger.info(
                "%s%s %-18s: %s",
                LogStyle.INDENT,
                LogStyle.SUCCESS,
                "Downloaded",
                target_h5.name,
            )
            return

        except OSError as err:
            # Drop any partially written file before retrying or giving up.
            if tmp_path.exists():
                tmp_path.unlink()

            if attempt_no < retries:
                logger.warning("Download attempt %d failed: %s", attempt_no, err)
                continue

            raise OrchardDatasetError(
                f"Failed to download Galaxy10 after {retries} attempts"
            ) from err

    raise OrchardDatasetError("Unexpected error in Galaxy10 download")  # pragma: no cover

convert_galaxy10_to_npz(h5_path, output_npz, target_size=224, seed=42)

Converts Galaxy10 HDF5 to NPZ format with train/val/test splits.

Parameters:

Name Type Description Default
h5_path Path

Path to downloaded HDF5 file

required
output_npz Path

Path for output NPZ file

required
target_size int

Target image size (default 224)

224
seed int

Random seed for splits

42
Source code in orchard/data_handler/fetchers/galaxy10_converter.py
def convert_galaxy10_to_npz(
    h5_path: Path,
    output_npz: Path,
    target_size: int = 224,
    seed: int = 42,
) -> None:
    """
    Converts Galaxy10 HDF5 to NPZ format with train/val/test splits.

    Reads the ``images`` and ``ans`` datasets from the HDF5 file. When the
    image height or width differs from ``target_size`` every image is resized
    to ``target_size`` x ``target_size`` with bilinear resampling. Labels are
    stored as an ``int64`` column vector. The splits returned by
    ``_create_splits`` are written to a compressed NPZ archive under the keys
    ``{train,val,test}_images`` and ``{train,val,test}_labels``.

    Args:
        h5_path: Path to downloaded HDF5 file
        output_npz: Path for output NPZ file
        target_size: Target image size (default 224)
        seed: Random seed for splits
    """
    logger.info(
        "%s%s %-18s: Galaxy10 → NPZ (%dx%d)",
        LogStyle.INDENT,
        LogStyle.ARROW,
        "Converting",
        target_size,
        target_size,
    )

    # Both datasets are copied into memory here, so the HDF5 handle can be
    # released before the slower resize and compression steps.
    with h5py.File(h5_path, "r") as f:
        images = np.array(f["images"])
        labels = np.array(f["ans"])

    logger.info(
        "%s%s %-18s: %d images (%s)",
        LogStyle.INDENT,
        LogStyle.ARROW,
        "Loaded",
        len(images),
        images.shape,
    )

    # Resize if needed
    src_height, src_width = images.shape[1], images.shape[2]
    if src_height != target_size or src_width != target_size:
        logger.info(
            "%s%s %-18s: %dx%d → %dx%d",
            LogStyle.INDENT,
            LogStyle.ARROW,
            "Resizing",
            src_height,
            src_width,
            target_size,
            target_size,
        )

        # Prefer Image.Resampling.BILINEAR where Pillow provides it and fall
        # back to the legacy Image.BILINEAR constant otherwise. Resolved once
        # instead of once per image.
        resample = getattr(Image, "Resampling", Image).BILINEAR
        new_size = (target_size, target_size)

        images = np.array(
            [
                np.array(Image.fromarray(img.astype(np.uint8)).resize(new_size, resample))
                for img in images
            ],
            dtype=np.uint8,
        )

    # One label per row, as an (N, 1) int64 column.
    labels = labels.astype(np.int64).reshape(-1, 1)

    # Create splits (nominally 70/15/15; the ratios live in _create_splits)
    train_imgs, train_labels, val_imgs, val_labels, test_imgs, test_labels = _create_splits(
        images, labels, seed=seed
    )

    # Save as NPZ
    np.savez_compressed(
        output_npz,
        train_images=train_imgs,
        train_labels=train_labels,
        val_images=val_imgs,
        val_labels=val_labels,
        test_images=test_imgs,
        test_labels=test_labels,
    )

    logger.info(
        "%s%s %-18s: %s",
        LogStyle.INDENT,
        LogStyle.SUCCESS,
        "NPZ Created",
        output_npz.name,
    )
    logger.info(
        "%s%s %-18s: Train: %d, Val: %d, Test: %d",
        LogStyle.INDENT,
        LogStyle.ARROW,
        "Splits",
        len(train_imgs),
        len(val_imgs),
        len(test_imgs),
    )

ensure_galaxy10_npz(metadata)

Ensures Galaxy10 is downloaded and converted to NPZ format.

Parameters:

Name Type Description Default
metadata DatasetMetadata

DatasetMetadata with URL and path

required

Returns:

Type Description
Path

Path to validated NPZ file

Source code in orchard/data_handler/fetchers/galaxy10_converter.py
def ensure_galaxy10_npz(metadata: DatasetMetadata) -> Path:
    """
    Ensures Galaxy10 is downloaded and converted to NPZ format.

    An existing NPZ is reused when its MD5 matches ``metadata.md5_checksum``
    or when the metadata still carries the placeholder checksum. On a
    mismatch the NPZ is deleted and rebuilt: the HDF5 file is downloaded next
    to it (or reused if already present) and converted at
    ``metadata.native_resolution`` (224 when unset).

    The MD5 of a freshly converted file is always logged. If the metadata
    holds a real checksum and the new file still does not match it, a warning
    is logged, because the next call will delete and regenerate the file
    again.

    Args:
        metadata: DatasetMetadata with URL and path

    Returns:
        Path to the NPZ file
    """
    from ...core import md5_checksum

    # Sentinel kept in the metadata until a real checksum has been recorded.
    placeholder_md5 = "placeholder_will_be_calculated_after_conversion"

    target_npz = metadata.path
    expected_md5 = metadata.md5_checksum
    checksum_pending = expected_md5 == placeholder_md5

    # Check if NPZ already exists
    if target_npz.exists():
        actual_md5 = md5_checksum(target_npz)
        if actual_md5 == expected_md5 or checksum_pending:
            logger.debug(
                "%s%s %-18s: Galaxy10 found at %s",
                LogStyle.INDENT,
                LogStyle.ARROW,
                "Dataset",
                target_npz.name,
            )
            return target_npz

        logger.warning("Galaxy10 NPZ MD5 mismatch, regenerating...")
        target_npz.unlink()

    # Download HDF5
    h5_path = target_npz.parent / "Galaxy10_DECals.h5"
    download_galaxy10_h5(metadata.url, h5_path)

    # Convert to NPZ
    target_size = metadata.native_resolution or 224
    convert_galaxy10_to_npz(
        h5_path=h5_path,
        output_npz=target_npz,
        target_size=target_size,
    )

    # Report MD5
    actual_md5 = md5_checksum(target_npz)
    logger.info("%s%s %-18s: %s", LogStyle.INDENT, LogStyle.ARROW, "MD5", actual_md5)

    if checksum_pending:
        logger.info(
            '%s%s %-18s: Update metadata.md5_checksum = "%s"',
            LogStyle.INDENT,
            LogStyle.ARROW,
            "Action Required",
            actual_md5,
        )
    elif actual_md5 != expected_md5:
        # Without this the mismatch is silent and every later call would
        # delete and rebuild the file again.
        logger.warning(
            "Galaxy10 NPZ MD5 %s does not match expected %s after conversion",
            actual_md5,
            expected_md5,
        )

    return target_npz