Skip to content

evaluation_pipeline

orchard.evaluation.evaluation_pipeline

Final Evaluation Pipeline.

Top-level orchestrator that chains inference, visualization, and reporting into a single run_final_evaluation call. Coordinates:

  1. Test-set inference via evaluator.evaluate_model (with optional TTA).
  2. Artifact generation — confusion matrix, training curves, prediction grid.
  3. Structured report (Excel/CSV/JSON) via reporting.create_structured_report.
  4. Metric logging to the experiment tracker (MLflow) when enabled.

This module is the last stage of the training lifecycle, invoked by ModelTrainer after best-weight restoration.

run_final_evaluation(model, test_loader, train_losses, val_metrics_history, class_names, paths, training, dataset, augmentation, evaluation, arch_name, aug_info='N/A', tracker=None)

Execute the complete evaluation pipeline.

Coordinates full-set inference (with TTA support), visualizes metrics, and generates the final structured report.

Parameters:

Name Type Description Default
model Module

Trained model for evaluation (already on target device).

required
test_loader DataLoader[Any]

DataLoader for test set.

required
train_losses list[float]

Training loss history per epoch.

required
val_metrics_history list[Mapping[str, float]]

Validation metrics history per epoch.

required
class_names list[str]

List of class label strings.

required
paths RunPaths

RunPaths for artifact output.

required
training TrainingConfig

Training sub-config (use_tta, hyperparameters for report).

required
dataset DatasetConfig

Dataset sub-config (resolution, metadata, normalization).

required
augmentation AugmentationConfig

Augmentation sub-config (TTA transforms).

required
evaluation EvaluationConfig

Evaluation sub-config (plot flags, report format).

required
arch_name str

Architecture identifier (e.g. "resnet_18").

required
aug_info str

Augmentation description string for report.

'N/A'
tracker TrackerProtocol | None

Optional experiment tracker for final metrics.

None

Returns:

Type Description
tuple[float, float, float]

tuple[float, float, float]: A 3-tuple of:

  • macro_f1 -- Macro-averaged F1 score
  • test_acc -- Test set accuracy
  • test_auc -- Test set AUC (NaN if computation failed)
Source code in orchard/evaluation/evaluation_pipeline.py
def run_final_evaluation(
    model: nn.Module,
    test_loader: DataLoader[Any],
    train_losses: list[float],
    val_metrics_history: list[Mapping[str, float]],
    class_names: list[str],
    paths: RunPaths,
    training: TrainingConfig,
    dataset: DatasetConfig,
    augmentation: AugmentationConfig,
    evaluation: EvaluationConfig,
    arch_name: str,
    aug_info: str = "N/A",  # pragma: no mutate
    tracker: TrackerProtocol | None = None,
) -> tuple[float, float, float]:
    """
    Run the full post-training evaluation sequence.

    Performs test-set inference (honouring the TTA setting), renders the
    configured diagnostic figures, and writes the structured final report.

    Args:
        model: Trained model, already moved to its target device.
        test_loader: DataLoader over the held-out test set.
        train_losses: Per-epoch training loss history.
        val_metrics_history: Per-epoch validation metric dictionaries.
        class_names: Ordered class label strings.
        paths: RunPaths describing where artifacts are written.
        training: Training sub-config (TTA flag, report hyperparameters).
        dataset: Dataset sub-config (resolution, metadata, normalization).
        augmentation: Augmentation sub-config (TTA transforms).
        evaluation: Evaluation sub-config (plot flags, report format).
        arch_name: Architecture identifier (e.g. ``"resnet_18"``).
        aug_info: Human-readable augmentation summary for the report.
        tracker: Optional experiment tracker receiving final metrics.

    Returns:
        tuple[float, float, float]: ``(macro_f1, test_acc, test_auc)`` where
        ``test_auc`` may be NaN when the upstream AUC computation failed.
    """
    # The trainer already placed the model; reuse its device for inference.
    eval_device = next(model.parameters()).device

    # Sanitise the architecture name for use in filenames ("a/b" -> "a_b"),
    # and build the suffix shared by every figure filename.
    safe_arch = arch_name.replace("/", "_")
    fig_suffix = f"{safe_arch}_{dataset.resolution}"

    # --- 1) Inference & metrics over the complete test set ---
    all_preds, all_labels, test_metrics, macro_f1 = evaluate_model(
        model,
        test_loader,
        device=eval_device,
        use_tta=training.use_tta,
        is_anatomical=dataset.effective_is_anatomical,
        is_texture_based=dataset.effective_is_texture_based,
        aug_cfg=augmentation,
        resolution=dataset.resolution,
    )

    # --- 2) Visualizations ---
    # One shared context object keeps all figures styled consistently.
    plot_ctx = PlotContext(
        arch_name=arch_name,
        resolution=dataset.resolution,
        fig_dpi=evaluation.fig_dpi,
        plot_style=evaluation.plot_style,
        cmap_confusion=evaluation.cmap_confusion,
        grid_cols=evaluation.grid_cols,
        n_samples=evaluation.n_samples,
        fig_size_predictions=evaluation.fig_size_predictions,
        mean=dataset.mean,
        std=dataset.std,
        use_tta=training.use_tta,
        is_anatomical=dataset.effective_is_anatomical,
        is_texture_based=dataset.effective_is_texture_based,
    )

    # Confusion matrix (opt-in via config).
    if evaluation.save_confusion_matrix:
        plot_confusion_matrix(
            all_labels=all_labels,
            all_preds=all_preds,
            classes=class_names,
            out_path=paths.get_fig_path(f"confusion_matrix_{fig_suffix}.png"),
            ctx=plot_ctx,
        )

    # Training curves are always produced.
    accuracy_history = [epoch[METRIC_ACCURACY] for epoch in val_metrics_history]
    plot_training_curves(
        train_losses=train_losses,
        val_accuracies=accuracy_history,
        out_path=paths.get_fig_path(f"training_curves_{fig_suffix}.png"),
        ctx=plot_ctx,
    )

    # Qualitative prediction grid, sampled lazily from the loader (opt-in).
    if evaluation.save_predictions_grid:
        show_predictions(
            model=model,
            loader=test_loader,
            device=eval_device,
            classes=class_names,
            save_path=paths.get_fig_path(f"sample_predictions_{fig_suffix}.png"),
            ctx=plot_ctx,
        )

    # --- 3) Structured reporting (xlsx/csv/json per config) ---
    session_log = paths.logs / "session.log"
    structured_report = create_structured_report(
        val_metrics=val_metrics_history,
        test_metrics=test_metrics,
        macro_f1=macro_f1,
        train_losses=train_losses,
        best_path=paths.best_model_path,
        log_path=session_log,
        arch_name=arch_name,
        dataset=dataset,
        training=training,
        aug_info=aug_info,
    )
    structured_report.save(paths.final_report_path, fmt=evaluation.report_format)

    test_acc = test_metrics[METRIC_ACCURACY]
    test_auc = test_metrics[METRIC_AUC]

    # Forward headline metrics to the experiment tracker when one is attached.
    if tracker is not None:
        tracker.log_test_metrics(test_acc=test_acc, macro_f1=macro_f1)

    logger.info("%s%s Final Evaluation Phase Complete.", LogStyle.INDENT, LogStyle.SUCCESS)

    return macro_f1, test_acc, test_auc