From bf71b169b076ece3d0a84fe8da33f7fa67c00d8d Mon Sep 17 00:00:00 2001 From: Parth Raut Date: Wed, 7 Feb 2024 21:50:32 -0500 Subject: [PATCH] finished draft for HFGPLO, fixed import errors on usage --- test.py | 23 ++++------------ zeus/optimizer/__init__.py | 1 + zeus/optimizer/power_limit.py | 52 ++++++++++++++--------------------- 3 files changed, 26 insertions(+), 50 deletions(-) diff --git a/test.py b/test.py index ebd1354e..b268c0c7 100644 --- a/test.py +++ b/test.py @@ -1,25 +1,12 @@ -from zeus.monitor import ZeusMonitor -from zeus.optimizer import GlobalPowerLimitOptimizer - -if __name__ == '__main__': - monitor = ZeusMonitor(gpu_indices=[0,1,2,3]) - - monitor.begin_window() - - measurement = monitor.end_window("heavy computation") +from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl, PreTrainedModel +from zeus.optimizer import HFGPLO +from zeus.monitor import ZeusMonitor - print(f"Energy: {measurement.total_energy} J") - print(f"Time : {measurement.time} s") - - - plo = GlobalPowerLimitOptimizer(monitor) - - # training loop +import pdb - plo.on_step_begin() +pdb.set_trace() - diff --git a/zeus/optimizer/__init__.py b/zeus/optimizer/__init__.py index d7209120..6c4c2cea 100644 --- a/zeus/optimizer/__init__.py +++ b/zeus/optimizer/__init__.py @@ -15,3 +15,4 @@ """A collection of optimizers for various knobs.""" from zeus.optimizer.power_limit import GlobalPowerLimitOptimizer +from zeus.optimizer.power_limit import HFGPLO \ No newline at end of file diff --git a/zeus/optimizer/power_limit.py b/zeus/optimizer/power_limit.py index 94810f58..3fdb891b 100644 --- a/zeus/optimizer/power_limit.py +++ b/zeus/optimizer/power_limit.py @@ -481,11 +481,29 @@ def _save_profile(self) -> None: # only import when type checking if TYPE_CHECKING: - from transformers import Trainer, TrainerCallback, TrainingArguments, TrainerState, TrainerControl, PreTrainedModel + from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl, PreTrainedModel + +# to avoid hard dependency on HuggingFace Transformers, import classes dynamically +def import_hf_classes(): + try: + from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl, PreTrainedModel + return TrainerCallback, TrainingArguments, TrainerState, TrainerControl, PreTrainedModel + except ImportError: + return None + def make_hf(cls: Type[Callback], name: str | None = None) -> Type[TrainerCallback]: + # Attempt to import HuggingFace classes + hf_classes = import_hf_classes() + if hf_classes is None: + raise ImportError("Hugging Face is not installed. Please install it to use this feature.") + + TrainerCallback, TrainingArguments, TrainerState, TrainerControl, PreTrainedModel = hf_classes + class Wrapper(TrainerCallback): + # goal: help(HFGPLO) should show the init signature of GlobalPowerLimitOptimizer + # if that doesn't work, then standard class def __init__(self, *args, **kwargs) -> None: self.plo = cls(*args, **kwargs) # keep it args, kwargs, or specify to GlobalPowerLimitOptimizer? @@ -502,43 +520,13 @@ def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: Tr self.plo.on_epoch_end() def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, model: PreTrainedModel, **kwargs) -> None: - self.plo.on_evaluate() # what to set metric to? - - # NO MATCH - def on_init_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, model: PreTrainedModel, **kwargs) -> None: - # self.plo.on_init_end() no match to zeus callback, should be overridden? - pass - - # NO MATCH - def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, model: PreTrainedModel, **kwargs) -> None: - # self.plo.on_log() no match - pass - - # NO MATCH - def on_predict(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, model: PreTrainedModel, **kwargs) -> None: - # self.plo.on_predict() no match - pass - - # NO MATCH - def on_prediction_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, model: PreTrainedModel, **kwargs) -> None: - # self.plo.on_prediction_step() no match - pass - - # NO MATCH - def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, model: PreTrainedModel, **kwargs) -> None: - # self.plo.on_save() no match - pass + self.plo.on_evaluate() # what to set metric to? think it is called with metric, look into it def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, model: PreTrainedModel, **kwargs) -> None: self.plo.on_step_begin() def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, model: PreTrainedModel, **kwargs) -> None: self.plo.on_step_end() - - # NO MATCH - def on_substep_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, model: PreTrainedModel, **kwargs) -> None: - # self.plo.on_substep_end() no match - pass def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, model: PreTrainedModel, **kwargs) -> None: self.plo.on_train_begin()