From 2dbcfa8a0cefce9e98cbf6341319dc3526f43831 Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Wed, 13 Dec 2023 15:52:59 -0500
Subject: [PATCH] fix(cli): correct set arguments for `openllm import` and `openllm build` (#775)

* fix(cli): correct set arguments for `openllm import` and `openllm build`

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: update changelog

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
---
 changelog.d/774.feature.md                    |  5 ++
 changelog.d/775.change.md                     |  1 +
 openllm-python/src/openllm_cli/_factory.py    | 52 +++++++++----------
 openllm-python/src/openllm_cli/entrypoint.py  | 28 ++++------
 .../src/openllm_cli/extension/list_models.py  |  2 +-
 5 files changed, 40 insertions(+), 48 deletions(-)
 create mode 100644 changelog.d/774.feature.md
 create mode 100644 changelog.d/775.change.md

diff --git a/changelog.d/774.feature.md b/changelog.d/774.feature.md
new file mode 100644
index 000000000..4ab3cb4b7
--- /dev/null
+++ b/changelog.d/774.feature.md
@@ -0,0 +1,5 @@
+Mixtral is now fully supported on BentoCloud.
+
+```bash
+openllm start mistralai/Mixtral-8x7B-Instruct-v0.1
+```
diff --git a/changelog.d/775.change.md b/changelog.d/775.change.md
new file mode 100644
index 000000000..5d6bf98b1
--- /dev/null
+++ b/changelog.d/775.change.md
@@ -0,0 +1 @@
+`openllm import` and `openllm build` now accept the same set of arguments as `openllm start`
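A quick usage sketch of the change described in `changelog.d/775.change.md`: the same optimization flags are now meant to be accepted by all three commands. The model id and option values below are illustrative, and the flag spellings (`--dtype`, `--backend`, `--quantize`, `--serialisation`) are assumed from the option helpers wired up in `_factory.py`:

```bash
# Illustrative model id; any model accepted by `openllm start` should work here.
openllm start facebook/opt-1.3b --backend pt --dtype float16 --quantize int8 --serialisation safetensors

# After this patch, `openllm import` and `openllm build` are meant to take the same set of flags.
openllm import facebook/opt-1.3b --backend pt --dtype float16 --quantize int8 --serialisation safetensors
openllm build facebook/opt-1.3b --backend pt --dtype float16 --quantize int8 --serialisation safetensors
```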
diff --git a/openllm-python/src/openllm_cli/_factory.py b/openllm-python/src/openllm_cli/_factory.py
index 75b27f801..309fbfc07 100644
--- a/openllm-python/src/openllm_cli/_factory.py
+++ b/openllm-python/src/openllm_cli/_factory.py
@@ -11,6 +11,7 @@
   LiteralBackend,
   LiteralSerialisation,
   ParamSpec,
+  AnyCallable,
   get_literal_args,
 )
 from openllm_core.utils import DEBUG, compose, dantic, resolve_user_filepath
@@ -25,7 +26,7 @@ class _OpenLLM_GenericInternalConfig(LLMConfig):
 
   class GenerationConfig:
     top_k: int = 15
-    top_p: float = 0.9
+    top_p: float = 0.78
     temperature: float = 0.75
     max_new_tokens: int = 128
 
@@ -118,21 +119,22 @@ def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...
     ctx.params[_adapter_mapping_key][adapter_id] = name
   return None
 
+def optimization_decorator(fn: FC, *, factory=click, _eager=True) -> FC | list[AnyCallable]:
+  shared = [
+    dtype_option(factory=factory), model_version_option(factory=factory),  #
+    backend_option(factory=factory), quantize_option(factory=factory),  #
+    serialisation_option(factory=factory),
+  ]
+  if not _eager: return shared
+  return compose(*shared)(fn)
 
 def start_decorator(fn: FC) -> FC:
   composed = compose(
     _OpenLLM_GenericInternalConfig.parse,
-    _http_server_args,
-    cog.optgroup.group('General LLM Options', help='The following options are related to running LLM Server.'),
-    dtype_option(factory=cog.optgroup),
-    model_version_option(factory=cog.optgroup),
-    cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
-    workers_per_resource_option(factory=cog.optgroup),
-    cors_option(factory=cog.optgroup),
-    backend_option(factory=cog.optgroup),
+    parse_serve_args(),
     cog.optgroup.group(
-      'LLM Optimization Options',
-      help='''Optimization related options.
+      'LLM Options',
+      help='''The following options are related to running LLM Server as well as optimization options.
 
 OpenLLM supports running model k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
@@ -140,10 +142,12 @@ def start_decorator(fn: FC) -> FC:
 
 - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
 - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
-      ''',
+      ''',
     ),
-    quantize_option(factory=cog.optgroup),
-    serialisation_option(factory=cog.optgroup),
+    cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
+    workers_per_resource_option(factory=cog.optgroup),
+    cors_option(factory=cog.optgroup),
+    *optimization_decorator(fn, factory=cog.optgroup, _eager=False),
     cog.optgroup.option(
       '--device',
       type=dantic.CUDA,
@@ -200,8 +204,6 @@ def decorator(f: t.Callable[Concatenate[int, t.Optional[str], P], LLMConfig]) ->
     return group(f)
 
   return decorator
 
-_http_server_args = parse_serve_args()
-
 def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | None], FC]:
   '''General ``@click`` decorator with some sauce.
@@ -234,7 +236,8 @@ def adapter_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callab
     multiple=True,
     callback=_id_callback,
     metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]',
-  )
+    **attrs,
+  )(f)
 
 
 def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
@@ -291,8 +294,7 @@ def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[
     envvar='OPENLLM_BACKEND',
     show_envvar=True,
     help='Runtime to use for both serialisation/inference engine.',
-    **attrs,
-  )(f)
+    **attrs)(f)
 
 def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_argument(
@@ -329,15 +331,9 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
     '''
     + (
       '''
-    > [!NOTE] that this will set the mode for serving within deployment.'''
-      if build
-      else ''
-    )
-    + '''
-    > [!NOTE] that quantization are currently only available in *PyTorch* models.''',
-    **attrs,
-  )(f)
-
+    > [!NOTE] that this will set the mode for serving within deployment.''' if build else ''
+    ),
+    **attrs)(f)
 
 def workers_per_resource_option(
   f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any
diff --git a/openllm-python/src/openllm_cli/entrypoint.py b/openllm-python/src/openllm_cli/entrypoint.py
index b93cec6fd..57451d971 100644
--- a/openllm-python/src/openllm_cli/entrypoint.py
+++ b/openllm-python/src/openllm_cli/entrypoint.py
@@ -46,15 +46,11 @@
 from ._factory import (
   FC,
   _AnyCallable,
-  backend_option,
-  dtype_option,
   machine_option,
   model_name_argument,
-  model_version_option,
   parse_config_options,
-  quantize_option,
-  serialisation_option,
   start_decorator,
+  optimization_decorator,
 )
 
 if t.TYPE_CHECKING:
@@ -590,13 +586,11 @@ class ImportModelOutput(t.TypedDict):
   metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]',
   help='Deprecated. Use positional argument instead.',
 )
-@model_version_option
-@backend_option
-@quantize_option
-@serialisation_option
+@optimization_decorator
 def import_command(
   model_id: str,
   deprecated_model_id: str | None,
+  dtype: LiteralDtype,
   model_version: str | None,
   backend: LiteralBackend | None,
   quantize: LiteralQuantise | None,
@@ -649,6 +643,7 @@ def import_command(
     model_version=model_version,
     quantize=quantize,
     backend=backend,
+    dtype=dtype,
     serialisation=t.cast(
       LiteralSerialisation,
       first_not_none(
@@ -712,8 +707,6 @@ class BuildBentoOutput(t.TypedDict):
   metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]',
   help='Deprecated. Use positional argument instead.',
 )
-@dtype_option
-@backend_option
 @click.option(
   '--bento-version',
   type=str,
   default=None,
   help='Optional bento version for this BentoLLM. Default is the model revision.',
 )
 @click.option('--overwrite', is_flag=True, help='Overwrite existing Bento for given LLM if it already exists.')
-@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Optimisation options')  # type: ignore[misc]
-@quantize_option(factory=cog.optgroup, build=True)
 @click.option(
   '--enable-features',
   multiple=True,
   nargs=1,
   metavar='FEATURE[,FEATURE]',
   help='Enable additional features for building this LLM Bento. Available: {}'.format(
     ', '.join(OPTIONAL_DEPENDENCIES)
   ),
 )
+@optimization_decorator
 @click.option(
   '--adapter-id',
   default=None,
   multiple=True,
   metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]',
   help="Optional adapters id to be included within the Bento. Note that if you are using relative path, '--build-ctx' must be passed.",
 )
 @click.option('--build-ctx', help='Build context. This is required if --adapter-id uses relative path', default=None)
-@model_version_option
 @click.option(
   '--dockerfile-template',
   default=None,
   type=click.File(),
   help='Optional custom dockerfile template to be used with this BentoLLM.',
 )
-@serialisation_option
 @cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Utilities options')  # type: ignore[misc]
 @cog.optgroup.option(
   '--containerize',
@@ -787,20 +777,20 @@ def build_command(
   deprecated_model_id: str | None,
   bento_version: str | None,
   overwrite: bool,
+  dtype: LiteralDtype,
+  model_version: str | None,
+  backend: LiteralBackend | None,
   quantize: LiteralQuantise | None,
+  serialisation: LiteralSerialisation | None,
   machine: bool,
-  dtype: LiteralDtype,
   enable_features: tuple[str, ...] | None,
   adapter_id: tuple[str, ...],
   build_ctx: str | None,
-  backend: LiteralBackend | None,
-  model_version: str | None,
   dockerfile_template: t.TextIO | None,
   max_model_len: int | None,
   gpu_memory_utilization: float,
   containerize: bool,
   push: bool,
-  serialisation: LiteralSerialisation | None,
   force_push: bool,
   **_: t.Any,
 ) -> BuildBentoOutput:
diff --git a/openllm-python/src/openllm_cli/extension/list_models.py b/openllm-python/src/openllm_cli/extension/list_models.py
index f2a25d14e..61e4d26aa 100644
--- a/openllm-python/src/openllm_cli/extension/list_models.py
+++ b/openllm-python/src/openllm_cli/extension/list_models.py
@@ -18,7 +18,7 @@
 @click.command('list_models', context_settings=termui.CONTEXT_SETTINGS)
 @model_name_argument(required=False, shell_complete=model_complete_envvar)
 def cli(model_name: str | None) -> DictStrAny:
-  '''This is equivalent to openllm models --show-available less the nice table.'''
+  '''List available models in the local store to be used with OpenLLM.'''
   models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
   ids_in_local_store = {
     k: [