# Source code for medsegpy.utils.dl_utils

import os
import subprocess

from keras.utils import multi_gpu_model

from medsegpy.modeling.model import Model


def get_weights(experiment_dir):
    """Gets the weights file saved at the highest epoch.

    Candidate files are regular files directly inside ``experiment_dir`` whose
    name ends with ``.h5`` and contains ``weights``. The epoch is parsed from
    the second dot-separated field of the file name, up to its first dash
    (e.g. ``weights.012-0.3456.h5`` -> epoch 12).

    Assumes that only the best weights are stored, so the file with the
    highest epoch is taken to be the one with the lowest validation loss.
    TODO: remove this assumption.

    Candidate files whose name does not carry a parsable epoch (for example
    ``weights.h5``) are skipped instead of aborting the search.

    Args:
        experiment_dir (str): Experiment directory where weights are stored.

    Returns:
        str: Path to weights h5 file.

    Raises:
        FileNotFoundError: If no weights file with a parsable epoch is found.
    """
    max_epoch = -1
    best_file = ""
    for file in os.listdir(experiment_dir):
        file_fullpath = os.path.join(experiment_dir, file)
        # Only regular h5 files that look like weights checkpoints qualify.
        if not (file.endswith(".h5") and "weights" in file and os.path.isfile(file_fullpath)):
            continue

        # The name ends with ".h5", so it always has at least two
        # dot-separated fields; only the int conversion can fail.
        train_info = file.split(".")[1]
        try:
            epoch = int(train_info.split("-")[0])
        except ValueError:
            continue

        # Keep the file with the largest epoch seen so far.
        if epoch > max_epoch:
            max_epoch = epoch
            best_file = file_fullpath

    if not best_file:
        raise FileNotFoundError("No weights file found in %s" % experiment_dir)

    return best_file


def _check_results_file(base_path):
    """Collect every ``results.txt`` found at or below ``base_path``.

    Args:
        base_path (str): Directory to search. ``None``, an empty string, or a
            path that is not an existing directory yields an empty list.

    Returns:
        List[str]: Paths to the ``results.txt`` files found. The file directly
            inside ``base_path`` (if any) comes first, followed by those found
            in its subdirectories.
    """
    if base_path is None or base_path == "" or not os.path.isdir(base_path):
        return []

    candidate = os.path.join(base_path, "results.txt")
    found = [candidate] if os.path.isfile(candidate) else []

    # Descend into each subdirectory and gather whatever it reports.
    for entry in os.listdir(base_path):
        child = os.path.join(base_path, entry)
        if not os.path.isdir(child):
            continue
        found += _check_results_file(child)

    return found


def get_valid_subdirs(root_dir: str, exist_ok: bool = False):
    """Recursively search for experiments that are ready to be tested.

    Different experiments live in different folders. Based on training protocol,
    we assume that a valid experiment has completed training if its folder
    contains files "config.ini" and "pik_data.dat".

    To avoid recomputing experiments with results, `exist_ok=False` by default:
    an experiment is then skipped when a "results.txt" file exists anywhere
    under its "test_results" folder.

    Args:
        root_dir (str): Root folder to search. ``None``, an empty path, or a
            path that is not an existing directory yields an empty list.
        exist_ok (:obj:`bool`, optional): If `True`, recompute results for
            experiments, i.e. include experiments that already have results.

    Return:
        List[str]: Experiment directories to test.
    """
    if not root_dir or not os.path.isdir(root_dir):
        return []

    subdirs = []
    config_path = os.path.join(root_dir, "config.ini")
    pik_data_path = os.path.join(root_dir, "pik_data.dat")

    # 1. Check if this directory is itself a valid experiment - it must
    #    contain both the config file and the pik data file.
    if os.path.isfile(config_path) and os.path.isfile(pik_data_path):
        test_results_dirpath = os.path.join(root_dir, "test_results")
        # Existing results only matter when they must not be recomputed, so
        # the results scan is skipped entirely when exist_ok is set.
        if exist_ok or not _check_results_file(test_results_dirpath):
            subdirs.append(root_dir)

    # 2. Recursively search through other subdirectories, forwarding the
    #    caller's `exist_ok` unchanged so every level applies the same policy.
    for file in os.listdir(root_dir):
        possible_dir = os.path.join(root_dir, file)
        if os.path.isdir(possible_dir):
            subdirs.extend(get_valid_subdirs(possible_dir, exist_ok))

    return subdirs


def get_available_gpus(num_gpus: int = None):
    """Get gpu ids for gpus that are >95% free.

    Tensorflow does not support checking free memory on gpus.
    This is a crude method that relies on `nvidia-smi` to
    determine which gpus are occupied and which are free.
    A gpu counts as free when its occupied memory is below 5% of its total.

    Args:
        num_gpus: Number of requested gpus. If not specified,
            ids of all available gpu(s) are returned.

    Returns:
        List[int]: List of gpu ids that are free. Length
            will equal `num_gpus`, if specified. If `num_gpus` is 0,
            `[-1]` is returned without querying `nvidia-smi`.

    Raises:
        TypeError: If `num_gpus` is neither `None` nor an int.
        ValueError: If `num_gpus` is negative, or if more gpus are requested
            than are currently free.
    """
    # Validate explicitly: an `assert` would be stripped under `python -O`.
    if not isinstance(num_gpus, (type(None), int)):
        raise TypeError(
            "num_gpus must be an int or None, got {}".format(type(num_gpus).__name__)
        )
    if num_gpus is not None and num_gpus < 0:
        raise ValueError("num_gpus must be non-negative, got {}".format(num_gpus))

    # NOTE(review): the original comment calls -1 the "built-in tensorflow
    # gpu id"; presumably it signals that no gpu should be used - confirm.
    if num_gpus == 0:
        return [-1]

    num_requested_gpus = num_gpus

    # One non-empty line is printed per gpu.
    gpu_list = subprocess.check_output("nvidia-smi --list-gpus", shell=True).decode()
    num_gpus = sum(1 for line in gpu_list.splitlines() if line.strip())

    out_str = subprocess.check_output("nvidia-smi | grep MiB", shell=True).decode()
    # First 2 * num_gpu elements correspond to memory for gpus
    # Order: (occupied-0, total-0, occupied-1, total-1, ...)
    # Each token ends in "MiB"; strip that suffix to get the number.
    mems = [float(x[:-3]) for x in out_str.split() if "MiB" in x]
    gpu_percent_occupied_mem = [
        mems[2 * gpu_id] / mems[2 * gpu_id + 1] for gpu_id in range(num_gpus)
    ]

    available_gpus = [gpu_id for gpu_id, mem in enumerate(gpu_percent_occupied_mem) if mem < 0.05]
    if num_requested_gpus and num_requested_gpus > len(available_gpus):
        raise ValueError(
            "Requested {} gpus, only {} are free".format(num_requested_gpus, len(available_gpus))
        )

    return available_gpus[:num_requested_gpus] if num_requested_gpus else available_gpus


def num_gpus():
    """Count the devices listed in the ``CUDA_VISIBLE_DEVICES`` variable.

    Returns:
        int: 0 when the variable is unset or empty, otherwise the number of
            comma-separated entries it holds.
    """
    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
    if not visible_devices:
        return 0

    # NOTE(review): entries are counted, not interpreted, so a value such as
    # "-1" counts as one device - confirm this is intended.
    device_ids = visible_devices.split(",")
    return len(device_ids)


class ModelMGPU(Model):
    """Multi-gpu wrapper that routes load/save attributes to the serial model.

    The instance takes over the attributes of the model returned by
    ``multi_gpu_model(ser_model, gpus)`` and keeps ``ser_model`` in
    ``_smodel``.
    """

    def __init__(self, ser_model, gpus):
        parallel_model = multi_gpu_model(ser_model, gpus)
        # Adopt the parallel model's instance state wholesale rather than
        # running the base-class initializer.
        self.__dict__.update(parallel_model.__dict__)
        self._smodel = ser_model

    def __getattribute__(self, attrname):
        """Override load and save methods to be used from the serial-model. The
        serial-model holds references to the weights in the multi-gpu model.

        Any attribute whose name contains "load" or "save" is looked up on the
        serial model; everything else uses the normal lookup.
        """
        targets_serial_model = "load" in attrname or "save" in attrname
        if not targets_serial_model:
            return super().__getattribute__(attrname)

        return getattr(self._smodel, attrname)


class _NoOpScope:
    """Context manager that does nothing on entry or exit."""

    def __enter__(self):
        """Return this scope object as the ``with ... as`` target."""
        scope = self
        return scope

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """Do nothing; an exception raised inside the block keeps propagating."""
        return None


class NoOpStrategy:
    """Strategy object whose ``scope()`` yields a do-nothing context manager.

    NOTE(review): presumably a stand-in for a distribution strategy when none
    is needed - confirm against callers.
    """

    def scope(self):
        """Return a new ``_NoOpScope`` instance."""
        noop_scope = _NoOpScope()
        return noop_scope