"""Dataset evaluator.

Adapted from Facebook's detectron2.
https://github.com/facebookresearch/detectron2
"""
import datetime
import logging
import time
from typing import Sequence, Union

from medsegpy.data import DataLoader
from medsegpy.data.im_gens import Generator, GeneratorState
from medsegpy.utils.logger import log_every_n_seconds


class DatasetEvaluator:
    """Base class for a dataset evaluator.

    The function :func:`inference_on_dataset` runs the model over all
    samples in the dataset and uses a DatasetEvaluator to process the
    inputs/outputs.

    This class will accumulate information about the inputs/outputs
    (by :meth:`process`) and produce evaluation results at the end
    (by :meth:`evaluate`).
    """

    def reset(self):
        """Preparation for a new round of evaluation.

        Should be called before starting a round of evaluation.
        """
        pass

    def process(self, inputs, outputs):
        """Process an input/output pair.

        Args:
            inputs (List[Dict]): the inputs that are used to call the model.
                Can also contain scan-specific fields; these fields should
                start with "scan_".
            outputs (List[Dict]): list of outputs from the model. Each dict
                should contain at least the following keys:

                * "y_true": ground truth results.
                * "y_pred": predicted probabilities.
                * "time_elapsed": amount of time to load data and run the
                  model.
        """
        pass

    def evaluate(self):
        """Evaluate/summarize the performance after processing all
        input/output pairs.

        Returns:
            dict: A new evaluator class can return a dict of arbitrary
                format as long as the user can process the results.
                In our train_net.py, we expect the following format:

                * key: the name of the task (e.g., bbox)
                * value: a dict of {metric name: score}, e.g. {"AP50": 80}
        """
        pass
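
# --- Illustrative example (not part of the original module) ---
# A minimal sketch of a concrete evaluator, showing how the
# reset/process/evaluate protocol fits together. The class name, the 0.5
# probability threshold, and the use of numpy are assumptions made for
# illustration; only the "y_true"/"y_pred" field names follow the
# `DatasetEvaluator.process` contract documented above.
import numpy as np


class ExampleDiceEvaluator(DatasetEvaluator):
    """Accumulates a mean Dice overlap across scans (illustrative only)."""

    def reset(self):
        self._dice_scores = []

    def process(self, inputs, outputs):
        for output in outputs:
            y_true = np.asarray(output["y_true"]).astype(bool)
            # Threshold predicted probabilities at 0.5 (assumed convention).
            y_pred = np.asarray(output["y_pred"]) >= 0.5
            intersection = np.count_nonzero(y_true & y_pred)
            total = np.count_nonzero(y_true) + np.count_nonzero(y_pred)
            # Define Dice = 1 when both masks are empty.
            dice = 2.0 * intersection / total if total > 0 else 1.0
            self._dice_scores.append(dice)

    def evaluate(self):
        # Same {task: {metric name: score}} format expected by train_net.py.
        return {"segmentation": {"Dice": float(np.mean(self._dice_scores))}}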
def inference_on_dataset(
    model,
    data_loader: Union[DataLoader, Generator],
    evaluator: Union[DatasetEvaluator, Sequence[DatasetEvaluator]],
):
    """Run the model on the data_loader and evaluate the metrics with the
    evaluator(s).

    The model will be used in eval mode.

    Args:
        model (keras.Model): the Keras model to run.
        data_loader: an iterable object with a length. The elements it
            generates will be the inputs to the model.
        evaluator: the evaluator(s) to run. Pass an empty sequence if you
            only want to benchmark, but don't want to do any evaluation.

    Returns:
        dict: A mapping from each evaluator's class name to the return
            value of that evaluator's `evaluate()`.
    """
    if isinstance(evaluator, DatasetEvaluator):
        evaluator = [evaluator]
    for e in evaluator:
        e.reset()

    num_warmup = 1
    start_time = time.perf_counter()
    total_compute_time = 0
    total_processing_time = 0
    total_inference_time = 0

    # Support both the legacy Generator interface and the DataLoader API.
    if isinstance(data_loader, Generator):
        iter_loader = data_loader.img_generator_test
        total = data_loader.num_scans(GeneratorState.TESTING)
    else:
        iter_loader = data_loader.inference
        total = data_loader.num_scans()

    start_compute_time = time.perf_counter()
    logger = logging.getLogger(__name__)
    for idx, (input, output) in enumerate(iter_loader(model)):
        total_compute_time += time.perf_counter() - start_compute_time

        start_processing_time = time.perf_counter()
        for e in evaluator:
            e.process([input], [output])
        total_processing_time += time.perf_counter() - start_processing_time
        total_inference_time += output["time_elapsed"]

        # Discount the warm-up iteration when estimating per-scan time.
        iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
        seconds_per_scan = total_compute_time / iters_after_start
        seconds_per_inference = total_inference_time / iters_after_start
        seconds_per_processing = total_processing_time / iters_after_start
        if idx >= num_warmup * 2 or seconds_per_scan > 5:
            total_seconds_per_img = (
                time.perf_counter() - start_time
            ) / iters_after_start
            eta = datetime.timedelta(
                seconds=int(total_seconds_per_img * (total - idx - 1))
            )
            log_every_n_seconds(
                logging.INFO,
                "Inference done {}/{}. {:.4f} s / scan ({:.4f} inference, "
                "{:.4f} processing). ETA={}".format(
                    idx + 1,
                    total,
                    seconds_per_scan,
                    seconds_per_inference,
                    seconds_per_processing,
                    str(eta),
                ),
                n=5,
            )
        start_compute_time = time.perf_counter()

    eval_start = time.perf_counter()
    logger.info("Begin evaluation...")
    # An evaluator may return None when not in the main process. Replace it
    # with an empty dict to make it easier for downstream code to handle.
    results = {}
    for e in evaluator:
        result = e.evaluate()
        results[e.__class__.__name__] = result if result is not None else {}
    total_eval_time = time.perf_counter() - eval_start
    logger.info(
        "Time Elapsed: {:.4f} seconds".format(total_compute_time + total_eval_time)
    )
    return results
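
# --- Illustrative usage (not part of the original module) ---
# A sketch of how the pieces tie together: build a trained model and a
# test DataLoader, then run evaluation. `build_model`, `build_loader`,
# `cfg`, and the weights path are hypothetical placeholders; substitute
# your project's actual builders and artifacts.
#
#   model = build_model(cfg)                       # hypothetical builder
#   model.load_weights("weights.h5")               # Keras-style weight loading
#   test_loader = build_loader(cfg, split="test")  # hypothetical builder
#   results = inference_on_dataset(
#       model, test_loader, ExampleDiceEvaluator()
#   )
#   print(results["ExampleDiceEvaluator"]["segmentation"]["Dice"])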