-
Notifications
You must be signed in to change notification settings - Fork 631
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add fastdeploy server and client component (#1169)
* add backend support for fastdeploy server * fix * add code * fix * fix * add fastdeploy server component * add fastdeploy server and client * add exception description * fix * add model repository judgement * add component tab for fastdeploy client * update more tasks in fastdeploy client * sort filenames * backup config * noqa for autogenerated file * add data validation * add __init__ for package * add calculating layout for frontend * add alive server detection and optimize client * add alive server detection and optimize client * add alive server detection and optimize client * add metrics in gradio client * update presentation * Change return value to None for frontend performance data when server not ready * add get_server_config and download_pretrain_model api * add get_server_config and download_pretrain_model api * add unit for metric table * add unit for metric table * fix a bug * add judgement pretrained model download * add judgement pretrained model download * add version info for frontend * rename download model * fix a bug * add fastdeploy model list * optimize for choose configuration files * modify according to frontend need * fix name in config to model name * optimize for server list and alive judgement * keep server name as string type * optimize process judgement logic * optimize for deleting resource files * add rename resource file * fix * fix a bug * optimize code structure * optimize code structure * remove chinese tips and remove fastdeploy-python in requirements
- Loading branch information
Showing
14 changed files
with
5,145 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,4 +12,8 @@ multiprocess | |
packaging | ||
x2paddle | ||
rarfile | ||
onnx >= 1.6.0 | ||
gradio | ||
tritonclient[all] | ||
attrdict | ||
psutil | ||
onnx >= 1.6.0 |
14 changes: 14 additions & 0 deletions
14
visualdl/component/inference/fastdeploy_client/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# ======================================================================= |
409 changes: 409 additions & 0 deletions
409
visualdl/component/inference/fastdeploy_client/client_app.py
Large diffs are not rendered by default.
Oops, something went wrong.
304 changes: 304 additions & 0 deletions
304
visualdl/component/inference/fastdeploy_client/http_client_manager.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,304 @@ | ||
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# ======================================================================= | ||
import json | ||
import re | ||
|
||
import numpy as np | ||
import requests | ||
import tritonclient.http as httpclient | ||
from attrdict import AttrDict | ||
from tritonclient.utils import InferenceServerException | ||
|
||
|
||
def convert_http_metadata_config(metadata): | ||
metadata = AttrDict(metadata) | ||
|
||
return metadata | ||
|
||
|
||
def prepare_request(inputs_meta, inputs_data, outputs_meta): | ||
''' | ||
inputs_meta: inputs meta information from model. name: info | ||
inputs_data: users input data. name: data | ||
''' | ||
# Set the input data | ||
inputs = [] | ||
for input_dict in inputs_meta: | ||
input_name = input_dict['name'] | ||
if input_name not in inputs_data: | ||
raise RuntimeError( | ||
'Error: input name {} required for model not existed.'.format( | ||
input_name)) | ||
if input_dict['datatype'] == 'FP32': | ||
inputs_data[input_name] = inputs_data[input_name].astype( | ||
np.float32 | ||
) / 255 # image data returned by gradio is uint8, convert to fp32 | ||
if len(input_dict['shape'] | ||
) == 3 and input_dict['shape'][0] == 3: # NCHW | ||
inputs_data[input_name] = inputs_data[input_name][0].transpose( | ||
2, 0, 1) | ||
elif len(input_dict['shape'] | ||
) == 4 and input_dict['shape'][1] == 3: # NCHW | ||
inputs_data[input_name] = inputs_data[input_name].transpose( | ||
0, 3, 1, 2) | ||
infer_input = httpclient.InferInput( | ||
input_name, inputs_data[input_name].shape, input_dict['datatype']) | ||
infer_input.set_data_from_numpy(inputs_data[input_name]) | ||
inputs.append(infer_input) | ||
outputs = [] | ||
for output_dict in outputs_meta: | ||
infer_output = httpclient.InferRequestedOutput(output_dict.name) | ||
outputs.append(infer_output) | ||
return inputs, outputs | ||
|
||
|
||
metrics_table_head = """ | ||
<style> | ||
table, th {{ | ||
border:0.1px solid black; | ||
}} | ||
</style> | ||
<div> | ||
<table style="width:100%"> | ||
<tr> | ||
<th rowspan="2">模型名称</th> | ||
<th colspan="4">执行统计</th> | ||
<th colspan="5">延迟统计</th> | ||
</tr> | ||
<tr> | ||
<th>请求处理成功数</th> | ||
<th>请求处理失败数</th> | ||
<th>推理batch数</th> | ||
<th>推理样本数</th> | ||
<th>请求处理时间(ms)</th> | ||
<th>任务队列等待时间(ms)</th> | ||
<th>输入处理时间(ms)</th> | ||
<th>模型推理时间(ms)</th> | ||
<th>输出处理时间(ms)</th> | ||
</tr> | ||
{} | ||
</table> | ||
</div> | ||
<br> | ||
<br> | ||
<br> | ||
<br> | ||
<br> | ||
<div> | ||
<table style="width:100%"> | ||
<tr> | ||
<th rowspan="2">GPU</th> | ||
<th colspan="4">性能指标</th> | ||
<th colspan="2">显存</th> | ||
</tr> | ||
<tr> | ||
<th>利用率(%)</th> | ||
<th>功率(W)</th> | ||
<th>功率限制(W)</th> | ||
<th>耗电量(W)</th> | ||
<th>总量(GB)</th> | ||
<th>已使用(GB)</th> | ||
</tr> | ||
{} | ||
</table> | ||
</div> | ||
""" | ||
|
||
|
||
def get_metric_data(server_addr, metric_port): # noqa:C901 | ||
''' | ||
Get metrics data from fastdeploy server, and transform it into html table. | ||
Args: | ||
server_addr(str): fastdeployserver ip address | ||
metric_port(int): fastdeployserver metrics port | ||
Returns: | ||
htmltable(str): html table to show metrics data | ||
''' | ||
model_table = {} | ||
gpu_table = {} | ||
metric_column_name = { | ||
"Model": { | ||
"nv_inference_request_success", "nv_inference_request_failure", | ||
"nv_inference_count", "nv_inference_exec_count", | ||
"nv_inference_request_duration_us", | ||
"nv_inference_queue_duration_us", | ||
"nv_inference_compute_input_duration_us", | ||
"nv_inference_compute_infer_duration_us", | ||
"nv_inference_compute_output_duration_us" | ||
}, | ||
"GPU": { | ||
"nv_gpu_power_usage", "nv_gpu_power_limit", | ||
"nv_energy_consumption", "nv_gpu_utilization", | ||
"nv_gpu_memory_total_bytes", "nv_gpu_memory_used_bytes" | ||
}, | ||
"CPU": { | ||
"nv_cpu_utilization", "nv_cpu_memory_total_bytes", | ||
"nv_cpu_memory_used_bytes" | ||
} | ||
} | ||
try: | ||
res = requests.get("http://{}:{}/metrics".format( | ||
server_addr, metric_port)) | ||
except Exception: | ||
return metrics_table_head.format('', '') | ||
metric_content = res.text | ||
for content in metric_content.split('\n'): | ||
if content.startswith('#'): | ||
continue | ||
else: | ||
res = re.match(r'(\w+){(.*)} (\w+)', | ||
content) # match output by server metrics interface | ||
if not res: | ||
continue | ||
metric_name = res.group(1) | ||
model = res.group(2) | ||
value = res.group(3) | ||
infos = {} | ||
for info in model.split(','): | ||
k, v = info.split('=') | ||
v = v.strip('"') | ||
infos[k] = v | ||
if metric_name in [ | ||
"nv_inference_request_duration_us", | ||
"nv_inference_queue_duration_us", | ||
"nv_inference_compute_input_duration_us", | ||
"nv_inference_compute_infer_duration_us", | ||
"nv_inference_compute_output_duration_us" | ||
]: | ||
value = str(float(value) / 1000) | ||
elif metric_name in [ | ||
"nv_gpu_memory_total_bytes", "nv_gpu_memory_used_bytes" | ||
]: | ||
value = str(float(value) / 1024 / 1024 / 1024) | ||
for key, metric_names in metric_column_name.items(): | ||
if metric_name in metric_names: | ||
if key == 'Model': | ||
model_name = infos['model'] | ||
if model_name not in model_table: | ||
model_table[model_name] = {} | ||
model_table[model_name][metric_name] = value | ||
elif key == 'GPU': | ||
gpu_name = infos['gpu_uuid'] | ||
if gpu_name not in gpu_table: | ||
gpu_table[gpu_name] = {} | ||
gpu_table[gpu_name][metric_name] = value | ||
elif key == 'CPU': | ||
pass | ||
model_data_list = [] | ||
gpu_data_list = [] | ||
model_data_metric_names = [ | ||
"nv_inference_request_success", "nv_inference_request_failure", | ||
"nv_inference_exec_count", "nv_inference_count", | ||
"nv_inference_request_duration_us", "nv_inference_queue_duration_us", | ||
"nv_inference_compute_input_duration_us", | ||
"nv_inference_compute_infer_duration_us", | ||
"nv_inference_compute_output_duration_us" | ||
] | ||
gpu_data_metric_names = [ | ||
"nv_gpu_utilization", "nv_gpu_power_usage", "nv_gpu_power_limit", | ||
"nv_energy_consumption", "nv_gpu_memory_total_bytes", | ||
"nv_gpu_memory_used_bytes" | ||
] | ||
for k, v in model_table.items(): | ||
data = [] | ||
data.append(k) | ||
for data_metric in model_data_metric_names: | ||
data.append(v[data_metric]) | ||
model_data_list.append(data) | ||
for k, v in gpu_table.items(): | ||
data = [] | ||
data.append(k) | ||
for data_metric in gpu_data_metric_names: | ||
data.append(v[data_metric]) | ||
gpu_data_list.append(data) | ||
model_data = '\n'.join([ | ||
"<tr>" + '\n'.join(["<td>" + item + "</td>" | ||
for item in data]) + "</tr>" | ||
for data in model_data_list | ||
]) | ||
gpu_data = '\n'.join([ | ||
"<tr>" + '\n'.join(["<td>" + item + "</td>" | ||
for item in data]) + "</tr>" | ||
for data in gpu_data_list | ||
]) | ||
return metrics_table_head.format(model_data, gpu_data) | ||
|
||
|
||
class HttpClientManager: | ||
def __init__(self): | ||
self.clients = {} # server url: httpclient | ||
|
||
def _create_client(self, server_url): | ||
if server_url in self.clients: | ||
return self.clients[server_url] | ||
try: | ||
fastdeploy_client = httpclient.InferenceServerClient(server_url) | ||
self.clients[server_url] = fastdeploy_client | ||
return fastdeploy_client | ||
except Exception: | ||
raise RuntimeError( | ||
'Can not connect to server {}, please check your \ | ||
server address'.format(server_url)) | ||
|
||
def infer(self, server_url, model_name, model_version, inputs): | ||
fastdeploy_client = self._create_client(server_url) | ||
input_metadata, output_metadata = self.get_model_meta( | ||
server_url, model_name, model_version) | ||
inputs, outputs = prepare_request(input_metadata, inputs, | ||
output_metadata) | ||
response = fastdeploy_client.infer( | ||
model_name, inputs, model_version=model_version, outputs=outputs) | ||
|
||
results = {} | ||
for output in output_metadata: | ||
result = response.as_numpy(output.name) # datatype: numpy | ||
if output.datatype == 'BYTES': # datatype: bytes | ||
try: | ||
value = result | ||
if len(result.shape) == 1: | ||
value = result[0] | ||
elif len(result.shape) == 2: | ||
value = result[0][0] | ||
elif len(result.shape) == 3: | ||
value = result[0][0][0] | ||
result = json.loads(value) # datatype: json | ||
except Exception: | ||
pass | ||
else: | ||
result = result[0] | ||
results[output.name] = result | ||
return results | ||
|
||
def raw_infer(self, server_url, model_name, model_version, raw_input): | ||
url = 'http://{}/v2/models/{}/versions/{}/infer'.format( | ||
server_url, model_name, model_version) | ||
res = requests.post(url, data=json.dumps(json.loads(raw_input))) | ||
return json.dumps(res.json()) | ||
|
||
def get_model_meta(self, server_url, model_name, model_version): | ||
fastdeploy_client = self._create_client(server_url) | ||
try: | ||
model_metadata = fastdeploy_client.get_model_metadata( | ||
model_name=model_name, model_version=model_version) | ||
except InferenceServerException as e: | ||
raise RuntimeError("Failed to retrieve the metadata: " + str(e)) | ||
|
||
model_metadata = convert_http_metadata_config(model_metadata) | ||
|
||
input_metadata = model_metadata.inputs | ||
output_metadata = model_metadata.outputs | ||
return input_metadata, output_metadata |
Oops, something went wrong.