From 2dbcfa8a0cefce9e98cbf6341319dc3526f43831 Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Wed, 13 Dec 2023 15:52:59 -0500
Subject: [PATCH] fix(cli): correct set arguments for `openllm import` and `openllm build` (#775)

* fix(cli): correct set arguments for `openllm import` and `openllm build`

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: update changelog

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
---
 changelog.d/774.feature.md                    |  5 ++
 changelog.d/775.change.md                     |  1 +
 openllm-python/src/openllm_cli/_factory.py    | 52 +++++++++----------
 openllm-python/src/openllm_cli/entrypoint.py  | 28 ++++------
 .../src/openllm_cli/extension/list_models.py  |  2 +-
 5 files changed, 40 insertions(+), 48 deletions(-)
 create mode 100644 changelog.d/774.feature.md
 create mode 100644 changelog.d/775.change.md

diff --git a/changelog.d/774.feature.md b/changelog.d/774.feature.md
new file mode 100644
index 000000000..4ab3cb4b7
--- /dev/null
+++ b/changelog.d/774.feature.md
@@ -0,0 +1,5 @@
+Mixtral is now fully supported on BentoCloud.
+
+```bash
+openllm start mistralai/Mixtral-8x7B-Instruct-v0.1
+```
diff --git a/changelog.d/775.change.md b/changelog.d/775.change.md
new file mode 100644
index 000000000..5d6bf98b1
--- /dev/null
+++ b/changelog.d/775.change.md
@@ -0,0 +1 @@
+`openllm import` and `openllm build` now accept the same set of arguments as `openllm start`
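A quick usage sketch of the change described in `changelog.d/775.change.md`: the same optimization flags are now meant to be accepted by all three commands. The model id and option values below are illustrative, and the flag spellings (`--dtype`, `--backend`, `--quantize`, `--serialisation`) are assumed from the option helpers wired up in `_factory.py`:

```bash
# Illustrative model id; any model accepted by `openllm start` should work here.
openllm start facebook/opt-1.3b --backend pt --dtype float16 --quantize int8 --serialisation safetensors

# After this patch, `openllm import` and `openllm build` are meant to take the same set of flags.
openllm import facebook/opt-1.3b --backend pt --dtype float16 --quantize int8 --serialisation safetensors
openllm build facebook/opt-1.3b --backend pt --dtype float16 --quantize int8 --serialisation safetensors
```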
diff --git a/openllm-python/src/openllm_cli/_factory.py b/openllm-python/src/openllm_cli/_factory.py
index 75b27f801..309fbfc07 100644
--- a/openllm-python/src/openllm_cli/_factory.py
+++ b/openllm-python/src/openllm_cli/_factory.py
@@ -11,6 +11,7 @@
   LiteralBackend,
   LiteralSerialisation,
   ParamSpec,
+  AnyCallable,
   get_literal_args,
 )
 from openllm_core.utils import DEBUG, compose, dantic, resolve_user_filepath
@@ -25,7 +26,7 @@ class _OpenLLM_GenericInternalConfig(LLMConfig):
 
   class GenerationConfig:
     top_k: int = 15
-    top_p: float = 0.9
+    top_p: float = 0.78
     temperature: float = 0.75
     max_new_tokens: int = 128
 
@@ -118,21 +119,22 @@ def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...
     ctx.params[_adapter_mapping_key][adapter_id] = name
   return None
 
+def optimization_decorator(fn: FC, *, factory=click, _eager=True) -> FC | list[AnyCallable]:
+  shared = [
+    dtype_option(factory=factory), model_version_option(factory=factory),  #
+    backend_option(factory=factory), quantize_option(factory=factory),  #
+    serialisation_option(factory=factory),
+  ]
+  if not _eager: return shared
+  return compose(*shared)(fn)
 
 def start_decorator(fn: FC) -> FC:
   composed = compose(
     _OpenLLM_GenericInternalConfig.parse,
-    _http_server_args,
-    cog.optgroup.group('General LLM Options', help='The following options are related to running LLM Server.'),
-    dtype_option(factory=cog.optgroup),
-    model_version_option(factory=cog.optgroup),
-    cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
-    workers_per_resource_option(factory=cog.optgroup),
-    cors_option(factory=cog.optgroup),
-    backend_option(factory=cog.optgroup),
+    parse_serve_args(),
     cog.optgroup.group(
-      'LLM Optimization Options',
-      help='''Optimization related options.
+      'LLM Options',
+      help='''The following options are related to running LLM Server as well as optimization options.
 
 OpenLLM supports running model k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
@@ -140,10 +142,12 @@ def start_decorator(fn: FC) -> FC:
 
 - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
 - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
-      ''',
+      ''',
     ),
-    quantize_option(factory=cog.optgroup),
-    serialisation_option(factory=cog.optgroup),
+    cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
+    workers_per_resource_option(factory=cog.optgroup),
+    cors_option(factory=cog.optgroup),
+    *optimization_decorator(fn, factory=cog.optgroup, _eager=False),
     cog.optgroup.option(
       '--device',
       type=dantic.CUDA,
@@ -200,8 +204,6 @@ def decorator(f: t.Callable[Concatenate[int, t.Optional[str], P], LLMConfig]) ->
     return group(f)
 
   return decorator
 
-_http_server_args = parse_serve_args()
-
 def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | None], FC]:
   '''General ``@click`` decorator with some sauce.
@@ -234,7 +236,8 @@ def adapter_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callab
     multiple=True,
     callback=_id_callback,
     metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]',
-  )
+    **attrs,
+  )(f)
 
 
 def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
@@ -291,8 +294,7 @@ def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[
     envvar='OPENLLM_BACKEND',
     show_envvar=True,
     help='Runtime to use for both serialisation/inference engine.',
-    **attrs,
-  )(f)
+    **attrs)(f)
 
 def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_argument(
@@ -329,15 +331,9 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
     '''
     + (
       '''
-    > [!NOTE] that this will set the mode for serving within deployment.'''
-      if build
-      else ''
-    )
-    + '''
-    > [!NOTE] that quantization are currently only available in *PyTorch* models.''',
-    **attrs,
-  )(f)
-
+    > [!NOTE] that this will set the mode for serving within deployment.''' if build else ''
+    ),
+    **attrs)(f)
 
 def workers_per_resource_option(
   f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any
diff --git a/openllm-python/src/openllm_cli/entrypoint.py b/openllm-python/src/openllm_cli/entrypoint.py
index b93cec6fd..57451d971 100644
--- a/openllm-python/src/openllm_cli/entrypoint.py
+++ b/openllm-python/src/openllm_cli/entrypoint.py
@@ -46,15 +46,11 @@
 from ._factory import (
   FC,
   _AnyCallable,
-  backend_option,
-  dtype_option,
   machine_option,
   model_name_argument,
-  model_version_option,
   parse_config_options,
-  quantize_option,
-  serialisation_option,
   start_decorator,
+  optimization_decorator,
 )
 
 if t.TYPE_CHECKING:
@@ -590,13 +586,11 @@ class ImportModelOutput(t.TypedDict):
   metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]',
   help='Deprecated. Use positional argument instead.',
 )
-@model_version_option
-@backend_option
-@quantize_option
-@serialisation_option
+@optimization_decorator
 def import_command(
   model_id: str,
   deprecated_model_id: str | None,
+  dtype: LiteralDtype,
   model_version: str | None,
   backend: LiteralBackend | None,
   quantize: LiteralQuantise | None,
@@ -649,6 +643,7 @@ def import_command(
     model_version=model_version,
     quantize=quantize,
     backend=backend,
+    dtype=dtype,
     serialisation=t.cast(
       LiteralSerialisation,
       first_not_none(
@@ -712,8 +707,6 @@ class BuildBentoOutput(t.TypedDict):
   metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]',
   help='Deprecated. Use positional argument instead.',
 )
-@dtype_option
-@backend_option
 @click.option(
   '--bento-version',
   type=str,
   default=None,
   help='Optional bento version for this BentoLLM. Default is the model revision.',
 )
 @click.option('--overwrite', is_flag=True, help='Overwrite existing Bento for given LLM if it already exists.')
-@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Optimisation options')  # type: ignore[misc]
-@quantize_option(factory=cog.optgroup, build=True)
 @click.option(
   '--enable-features',
   multiple=True,
   nargs=1,
   metavar='FEATURE[,FEATURE]',
   help='Enable additional features for building this LLM Bento. Available: {}'.format(
     ', '.join(OPTIONAL_DEPENDENCIES)
   ),
 )
+@optimization_decorator
 @click.option(
   '--adapter-id',
   default=None,
   multiple=True,
   metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]',
   help="Optional adapters id to be included within the Bento. Note that if you are using relative path, '--build-ctx' must be passed.",
 )
 @click.option('--build-ctx', help='Build context. This is required if --adapter-id uses relative path', default=None)
-@model_version_option
 @click.option(
   '--dockerfile-template',
   default=None,
   type=click.File(),
   help='Optional custom dockerfile template to be used with this BentoLLM.',
 )
-@serialisation_option
 @cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Utilities options')  # type: ignore[misc]
 @cog.optgroup.option(
   '--containerize',
@@ -787,20 +777,20 @@ def build_command(
   deprecated_model_id: str | None,
   bento_version: str | None,
   overwrite: bool,
+  dtype: LiteralDtype,
+  model_version: str | None,
+  backend: LiteralBackend | None,
   quantize: LiteralQuantise | None,
+  serialisation: LiteralSerialisation | None,
   machine: bool,
-  dtype: LiteralDtype,
   enable_features: tuple[str, ...] | None,
   adapter_id: tuple[str, ...],
   build_ctx: str | None,
-  backend: LiteralBackend | None,
-  model_version: str | None,
   dockerfile_template: t.TextIO | None,
   max_model_len: int | None,
   gpu_memory_utilization: float,
   containerize: bool,
   push: bool,
-  serialisation: LiteralSerialisation | None,
   force_push: bool,
   **_: t.Any,
 ) -> BuildBentoOutput:
diff --git a/openllm-python/src/openllm_cli/extension/list_models.py b/openllm-python/src/openllm_cli/extension/list_models.py
index f2a25d14e..61e4d26aa 100644
--- a/openllm-python/src/openllm_cli/extension/list_models.py
+++ b/openllm-python/src/openllm_cli/extension/list_models.py
@@ -18,7 +18,7 @@
 @click.command('list_models', context_settings=termui.CONTEXT_SETTINGS)
 @model_name_argument(required=False, shell_complete=model_complete_envvar)
 def cli(model_name: str | None) -> DictStrAny:
-  '''This is equivalent to openllm models --show-available less the nice table.'''
+  '''List available models in the local store to be used with OpenLLM.'''
   models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
   ids_in_local_store = {
     k: [