# Source code for autotorch.scheduler.hyperband

import pickle
import logging
import threading
import numpy as np
import multiprocessing as mp

from .fifo import FIFOScheduler
from .hyperband_stopping import HyperbandStopping_Manager
from .hyperband_promotion import HyperbandPromotion_Manager
from .reporter import DistStatusReporter, DistSemaphore
from ..utils import DeprecationHelper

__all__ = ['HyperbandScheduler', 'HyperbandStopping_Manager', 'HyperbandPromotion_Manager']

logger = logging.getLogger(__name__)


class HyperbandScheduler(FIFOScheduler):
    r"""Implements different variants of asynchronous Hyperband

    See 'type' for the different variants. One implementation detail: when
    using multiple brackets, tasks are allocated to brackets randomly,
    according to a softmax probability.

    Parameters
    ----------
    train_fn : callable
        A task launch function for training.
    args : object, optional
        Default arguments for launching train_fn.
    resource : dict
        Computation resources.  For example, `{'num_cpus':2, 'num_gpus':1}`
    searcher : object, optional
        Autotorch searcher.  For example, :class:`autotorch.searcher.RandomSearcher`
    time_attr : str
        A training result attr to use for comparing time.
        Note that you can pass in something non-temporal such as
        `training_epoch` as a measure of progress, the only requirement
        is that the attribute should increase monotonically.
    reward_attr : str
        The training result objective value attribute. As with `time_attr`, this may refer to any objective value.
        Stopping procedures will use this attribute.
    max_t : float
        max time units per task.  Trials will be stopped after max_t time units (determined by
        time_attr) have passed.
    grace_period : float
        Only stop tasks at least this old in time.  Also: min_t. The units are the same as the attribute named by
        `time_attr`.
    reduction_factor : float
        Used to set halving rate and amount.  This is simply a unit-less scalar.
    brackets : int
        Number of brackets. Each bracket has a different
        grace period, all share max_t and reduction_factor.
        If brackets == 1, we just run successive halving, for
        brackets > 1, we run Hyperband.
    type : str
        Type of Hyperband scheduler:
            stopping:
                See :class:`HyperbandStopping_Manager`. Tasks and config evals are
                tightly coupled. A task is stopped at a milestone if worse than
                most others, otherwise it continues. As implemented in Ray/Tune:
                https://ray.readthedocs.io/en/latest/tune-schedulers.html#asynchronous-hyperband
            promotion:
                See :class:`HyperbandPromotion_Manager`. A config eval may be
                associated with multiple tasks over its lifetime. It is never
                terminated, but may be paused. Whenever a task becomes available,
                it may promote a config to the next milestone, if better than most
                others. If no config can be promoted, a new one is chosen. This
                variant may benefit from pause&resume, which is not directly
                supported here. As proposed in this paper (termed ASHA):
                https://arxiv.org/abs/1810.05934
    keep_size_ratios : bool
        Implemented for type 'promotion' only. If True,
        promotions are done only if the (current estimate of the) size ratio
        between a rung and the next rung is 1 / reduction_factor or better.
        This prevents higher rungs from becoming more populated than they
        would be in synchronous Hyperband. A drawback is that promotions to
        higher rungs take longer.
    maxt_pending : bool
        Relevant only if a model-based searcher is used.
        If True, register pending config at level max_t
        whenever a new evaluation is started. This has a direct effect on
        the acquisition function (for model-based variant), which operates
        at level max_t. On the other hand, it decreases the variance of the
        latent process there.
        NOTE: This could also be removed...
    dist_ip_addrs : list of str
        IP addresses of remote machines.

    Examples
    --------
    >>> import numpy as np
    >>> import autotorch as at
    >>> 
    >>> @at.args(
    ...     lr=at.Real(1e-3, 1e-2, log=True),
    ...     wd=at.Real(1e-3, 1e-2))
    >>> def train_fn(args, reporter):
    ...     print('lr: {}, wd: {}'.format(args.lr, args.wd))
    ...     for e in range(10):
    ...         dummy_accuracy = 1 - np.power(1.8, -np.random.uniform(e, 2*e))
    ...         reporter(epoch=e, accuracy=dummy_accuracy, lr=args.lr, wd=args.wd)
    >>> scheduler = at.scheduler.HyperbandScheduler(train_fn,
    ...                                             resource={'num_cpus': 2, 'num_gpus': 0},
    ...                                             num_trials=20,
    ...                                             reward_attr='accuracy',
    ...                                             time_attr='epoch',
    ...                                             grace_period=1)
    >>> scheduler.run()
    >>> scheduler.join_jobs()
    >>> scheduler.get_training_curves(plot=True)
    """
    def __init__(self, train_fn, args=None, resource=None,
                 searcher=None, search_options=None,
                 checkpoint='./exp/checkpoint.ag',
                 resume=False, num_trials=None,
                 time_out=None, max_reward=1.0,
                 time_attr="epoch",
                 reward_attr="accuracy",
                 max_t=100, grace_period=10,
                 reduction_factor=4, brackets=1,
                 visualizer='none',
                 type='stopping',
                 dist_ip_addrs=None,
                 keep_size_ratios=False,
                 maxt_pending=False):
        """Initialize the parent scheduler and build the Hyperband manager.

        The generic scheduling arguments are forwarded unchanged to the
        parent constructor. ``time_attr``, ``reward_attr``, ``max_t``,
        ``grace_period``, ``reduction_factor`` and ``brackets`` are handed to
        the manager selected by ``type``, which is stored in
        ``self.terminator``. ``keep_size_ratios`` is only passed on for
        ``type == 'promotion'``.

        Raises
        ------
        AssertionError
            If ``type`` is neither ``'stopping'`` nor ``'promotion'``.
        """
        super().__init__(
            train_fn=train_fn,
            args=args,
            resource=resource,
            searcher=searcher,
            search_options=search_options,
            checkpoint=checkpoint,
            resume=resume,
            num_trials=num_trials,
            time_out=time_out,
            max_reward=max_reward,
            time_attr=time_attr,
            reward_attr=reward_attr,
            visualizer=visualizer,
            dist_ip_addrs=dist_ip_addrs)

        self.max_t = max_t
        self.type = type
        self.maxt_pending = maxt_pending

        # Positional arguments shared by both manager flavours.
        manager_args = (time_attr, reward_attr, max_t, grace_period,
                        reduction_factor, brackets)
        if type == 'stopping':
            self.terminator = HyperbandStopping_Manager(*manager_args)
            return
        if type == 'promotion':
            self.terminator = HyperbandPromotion_Manager(
                *manager_args, keep_size_ratios=keep_size_ratios)
            return
        raise AssertionError(
            "type '{}' not supported, must be 'stopping' or 'promotion'".format(
                type))

[docs]    def add_job(self, task, **kwargs):
        """Adding a training task to the scheduler.

        Args:
            task (:class:`autotorch.scheduler.Task`): a new training task

        Relevant entries in kwargs:

            - bracket: HB bracket to be used. Has been sampled in _promote_config
            - new_config: If True, task starts new config eval, otherwise it promotes
              a config (only if type == 'promotion')

        Only if new_config == False:

            - config_key: Internal key for config
            - resume_from: config promoted from this milestone
            - milestone: config promoted to this milestone (next from resume_from)
        """
        cls = HyperbandScheduler
        if not task.resources.is_ready:
            cls.RESOURCE_MANAGER._request(task.resources)
        # reporter and terminator
        reporter = DistStatusReporter(remote=task.resources.node)
        task.args['reporter'] = reporter

        milestones = self.terminator.on_task_add(task, **kwargs)
        if kwargs.get('new_config', True):
            first_milestone = milestones[-1]
            logger.debug("Adding new task (first milestone = {}):\n{}".format(
                first_milestone, task))
            self.searcher.register_pending(
                task.args['config'], first_milestone)
            if self.maxt_pending:
                # Also introduce pending evaluation for resource max_t
                final_milestone = milestones[0]
                if final_milestone != first_milestone:
                    self.searcher.register_pending(
                        task.args['config'], final_milestone)
        else:
            # Promotion of config
            # This is a signal towards train_fn, in case it supports
            # pause and resume:
            task.args['resume_from'] = kwargs['resume_from']
            next_milestone = kwargs['milestone']
            logger.debug("Promotion task (next milestone = {}):\n{}".format(
                next_milestone, task))
            self.searcher.register_pending(
                task.args['config'], next_milestone)

        # main process
        job = cls._start_distributed_job(task, cls.RESOURCE_MANAGER)
        # reporter thread
        rp = threading.Thread(target=self._run_reporter,
                              args=(task, job, reporter, self.searcher, self.terminator),
                              daemon=False)
        rp.start()
        task_dict = self._dict_from_task(task)
        task_dict.update({'Task': task, 'Job': job, 'ReporterThread': rp})
        # checkpoint thread
        if self._checkpoint is not None:
            def _save_checkpoint_callback(fut):
                self._cleaning_tasks()
                self.save()
            job.add_done_callback(_save_checkpoint_callback)

        with self.LOCK:
            self.scheduled_tasks.append(task_dict)

    def _run_reporter(self, task, task_job, reporter, searcher, terminator):
        """Process the results reported by a running task until it ends.

        Runs in the reporter thread started by ``add_job``. While ``task_job``
        is not done, each iteration fetches one report from ``reporter`` and:

        - on a ``'traceback'`` entry: logs it, releases the task with
          ``reporter.move_on()``, removes it from the terminator and stops;
        - on a truthy ``'done'`` entry: releases the task, notifies the
          terminator of completion (with the last regular result) and stops;
        - otherwise: asks the terminator what to do, records the (annotated)
          result, optionally updates the searcher, and either lets the task
          continue or terminates it.

        After the loop, the last regular result is passed to the searcher
        with ``done=True`` unless that same result has already been used for
        a searcher update.

        Parameters
        ----------
        task
            The task being monitored; ``task.task_id`` and
            ``task.args['config']`` are read.
        task_job
            Handle of the running job; only ``done()`` is used.
        reporter
            Reporter attached to the task (``fetch``, ``move_on``,
            ``terminate``).
        searcher
            Searcher receiving ``update`` and ``register_pending`` calls.
        terminator
            Hyperband manager deciding whether the task continues.
        """
        last_result = None
        last_updated = None
        while not task_job.done():
            reported_result = reporter.fetch()
            if 'traceback' in reported_result:
                # No exception is being handled in this thread, so log the
                # reported traceback text as a plain error message.
                logger.error(reported_result['traceback'])
                reporter.move_on()
                terminator.on_task_remove(task)
                break

            if reported_result.get('done', False):
                reporter.move_on()
                terminator.on_task_complete(task, last_result)
                break
            # Call before _add_training_results, since we may be able to report
            # extra information from the bracket
            task_continues, update_searcher, next_milestone, bracket_id, rung_counts = \
                terminator.on_task_report(task, reported_result)
            reported_result['bracket'] = bracket_id
            if rung_counts is not None:
                for k, v in rung_counts.items():
                    key = 'count_at_{}'.format(k)
                    reported_result[key] = v
            self._add_training_result(
                task.task_id, reported_result, config=task.args['config'])
            if update_searcher and task_continues:
                # Update searcher with intermediate result
                # Note: If task_continues is False here, we also call
                # searcher.update, but outside the loop.
                searcher.update(
                    config=task.args['config'],
                    reward=reported_result[self._reward_attr],
                    **reported_result)
                last_updated = reported_result
                if next_milestone is not None:
                    searcher.register_pending(
                        task.args['config'], next_milestone)
            last_result = reported_result
            if task_continues:
                reporter.move_on()
            else:
                # Note: The 'terminated' signal is sent even in the promotion
                # variant. It means that the *task* terminates, while the evaluation
                # of the config is just paused
                last_result['terminated'] = True
                if self.type == 'stopping' or \
                        reported_result[self._time_attr] >= self.max_t:
                    act_str = 'terminating'
                else:
                    act_str = 'pausing'
                logger.debug(
                    'Stopping task (%s evaluation, resource = %s):\n%s',
                    act_str, reported_result[self._time_attr], task)
                terminator.on_task_remove(task)
                reporter.terminate()
                break
        # Pass all of last_result to searcher (unless this has already been
        # done). If no regular result was ever received, both names are still
        # None and nothing is sent.
        if last_result is not last_updated:
            last_result['done'] = True
            searcher.update(
                config=task.args['config'],
                reward=last_result[self._reward_attr], **last_result)

[docs]    def state_dict(self, destination=None):
        """Returns a dictionary containing a whole state of the Scheduler

        Examples
        --------
        >>> at.save(scheduler.state_dict(), 'checkpoint.ag')
        """
        destination = super().state_dict(destination)
        destination['terminator'] = pickle.dumps(self.terminator)
        return destination

[docs]    def load_state_dict(self, state_dict):
        """Load from the saved state dict.

        Examples
        --------
        >>> scheduler.load_state_dict(at.load('checkpoint.ag'))
        """
        super().load_state_dict(state_dict)
        self.terminator = pickle.loads(state_dict['terminator'])
        logger.info('Loading Terminator State {}'.format(self.terminator))

    def _promote_config(self):
        config, extra_kwargs = self.terminator.on_task_schedule()
        return config, extra_kwargs

    def __repr__(self):
        reprstr = self.__class__.__name__ + '(' +  \
            'terminator: ' + str(self.terminator)
        return reprstr