diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b93702 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +*.pyc +/build/ +/dist/ +/fleece_worker.egg-info/ +/.vscode/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..aab9f7b --- /dev/null +++ b/README.md @@ -0,0 +1,38 @@ +## Installation + +### Install From PyPI +``` +pip install fleece-worker +``` + +### Install From Source +``` +pip install -e . +``` + +## Connect to a controller + +``` +python -m fleece-worker -c -t +``` +Optional: `--worker-nickname abc`, `--heartbeat-interval 10`, `-w ` + +For example: + +``` +python -m fleece-worker -c https://serving-api.colearn.cloud:8443 -t +``` + +## Try it out (deprecated) + +``` +python -m fleece-worker +``` + +``` +curl localhost:8080/forward -H 'Content-Type: application/json' -d '{"task_id":"123","step":0,"round":0,"plan":[["http://127.0.0.1:8080",["llama-2-7b-chat-slice/tok_embeddings", "llama-2-7b-chat-slice/layers.0", "llama-2-7b-chat-slice/layers.1", "llama-2-7b-chat-slice/layers.2", "llama-2-7b-chat-slice/layers.3", "llama-2-7b-chat-slice/layers.4", "llama-2-7b-chat-slice/layers.5", "llama-2-7b-chat-slice/layers.6", "llama-2-7b-chat-slice/layers.7", "llama-2-7b-chat-slice/layers.8", "llama-2-7b-chat-slice/layers.9", "llama-2-7b-chat-slice/layers.10", "llama-2-7b-chat-slice/layers.11", "llama-2-7b-chat-slice/layers.12", "llama-2-7b-chat-slice/layers.13", "llama-2-7b-chat-slice/layers.14", "llama-2-7b-chat-slice/layers.15", "llama-2-7b-chat-slice/layers.16", "llama-2-7b-chat-slice/layers.17", "llama-2-7b-chat-slice/layers.18", "llama-2-7b-chat-slice/layers.19", "llama-2-7b-chat-slice/layers.20", "llama-2-7b-chat-slice/layers.21", "llama-2-7b-chat-slice/layers.22", "llama-2-7b-chat-slice/layers.23", "llama-2-7b-chat-slice/layers.24", "llama-2-7b-chat-slice/layers.25", "llama-2-7b-chat-slice/layers.26", "llama-2-7b-chat-slice/layers.27", "llama-2-7b-chat-slice/layers.28", "llama-2-7b-chat-slice/layers.29", "llama-2-7b-chat-slice/layers.30", "llama-2-7b-chat-slice/layers.31", "llama-2-7b-chat-slice/norm", "llama-2-7b-chat-slice/output"]]],"payload":[[1, 518, 25580, 29962, 825, 338, 278, 9522, 412, 310, 1122, 11586, 895, 29973, 518, 29914, 25580, 29962]]}' +``` +``` +curl localhost:8080/forward -H 'Content-Type: application/json' -d '{"task_id":"123","step":0,"round":0,"plan":[["http://127.0.0.1:8080",["llama-2-7b-chat-slice/tok_embeddings", "llama-2-7b-chat-slice/layers.0", "llama-2-7b-chat-slice/layers.1", "llama-2-7b-chat-slice/layers.2", "llama-2-7b-chat-slice/layers.3", "llama-2-7b-chat-slice/layers.4", "llama-2-7b-chat-slice/layers.5", "llama-2-7b-chat-slice/layers.6", "llama-2-7b-chat-slice/layers.7", "llama-2-7b-chat-slice/layers.8", "llama-2-7b-chat-slice/layers.9", "llama-2-7b-chat-slice/layers.10", "llama-2-7b-chat-slice/layers.11", "llama-2-7b-chat-slice/layers.12", "llama-2-7b-chat-slice/layers.13", "llama-2-7b-chat-slice/layers.14", "llama-2-7b-chat-slice/layers.15", "llama-2-7b-chat-slice/layers.16", "llama-2-7b-chat-slice/layers.17", "llama-2-7b-chat-slice/layers.18", "llama-2-7b-chat-slice/layers.19", "llama-2-7b-chat-slice/layers.20", "llama-2-7b-chat-slice/layers.21", "llama-2-7b-chat-slice/layers.22", "llama-2-7b-chat-slice/layers.23", "llama-2-7b-chat-slice/layers.24", "llama-2-7b-chat-slice/layers.25", "llama-2-7b-chat-slice/layers.26", "llama-2-7b-chat-slice/layers.27", "llama-2-7b-chat-slice/layers.28", "llama-2-7b-chat-slice/layers.29", "llama-2-7b-chat-slice/layers.30", "llama-2-7b-chat-slice/layers.31", 
"llama-2-7b-chat-slice/norm", "llama-2-7b-chat-slice/output"]]],"payload":[[1, 518, 25580, 29962, 825, 338, 278, 9522, 412, 310, 1122, 11586, 895, 29973, 518, 29914, 25580, 29962], [1, 518, 25580, 29962, 3532, 14816, 29903, 6778, 13, 2499, 1994, 1234, 411, 5952, 18282, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 29902, 626, 2675, 304, 3681, 29892, 825, 881, 306, 1074, 29973, 518, 29914, 25580, 29962], [1, 518, 25580, 29962, 3532, 14816, 29903, 6778, 13, 2499, 1994, 1234, 411, 953, 3848, 275, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 5328, 304, 748, 515, 1522, 823, 292, 304, 23526, 29973, 518, 29914, 25580, 29962]]}' +``` +> note that the model will be automatically downloaded to `~/.cache` diff --git a/bench.py b/bench.py new file mode 100644 index 0000000..bbd3605 --- /dev/null +++ b/bench.py @@ -0,0 +1,57 @@ +import torch +import pandas as pd +import argparse + +fleece_worker = __import__("fleece-worker") + +worker = fleece_worker.Worker() + +worker.start_layer_forward_engine() + +parser = argparse.ArgumentParser(description='Run the estimation') +parser.add_argument('--model', '-m', type=str, required=True) +args = parser.parse_args() +model = args.model +df_layers = pd.read_csv("./specs/fleece_layers.csv") +layer_names = [] +for idx, row in df_layers.iterrows(): + if not row["From_model"] == model: + continue + layer_name = row["Layer_name"] + if row["Repetition"] == 1: + layer_names.append(layer_name) + else: + for i in range(min(row["Repetition"], 5)): + layer_names.append(f"{layer_name}.{i}") +# example 1 +print("[") +worker.preload_layers(layer_names) +h = torch.tensor([[1, 518, 25580, 29962, 825, 338, 278, 9522, 412, 310, 1122, 11586, 895, 29973, 518, 29914, 25580, 29962]], device="cuda") +start_pos = 0 +is_new_task = start_pos == 0 +kv_cache_dict = dict() +for _ in range(16): + bsz = h.shape[0] + seqlen = h.shape[1] + _, kv_cache_dict = worker.layers_forward(h, layer_names, bsz, is_new_task, 0, start_pos, seqlen, kv_cache_dict) + is_new_task = False + start_pos += seqlen + h = torch.tensor([[29962]], device="cuda") + +# # example 2 +hidden_dim = 4092 if model == "llama-2-7b-chat-slice" else 8192 if model == "llama-2-70b-chat-slice" else 8192 +layer_names = [f"{model}/layers.0", f"{model}/layers.1"] +worker.preload_layers(layer_names) +h = torch.randn((1, 18, 8192), dtype=torch.float16, device="cuda") +start_pos = 0 +is_new_task = start_pos == 0 +kv_cache_dict = dict() +for _ in range(16): + bsz = h.shape[0] + seqlen = h.shape[1] + _, kv_cache_dict = worker.layers_forward(h, layer_names, bsz, is_new_task, 0, start_pos, seqlen, kv_cache_dict) + is_new_task = False + start_pos += seqlen + h = torch.randn((1, 1, 8192), dtype=torch.float16, device="cuda") + +print("]") \ No newline at end of file diff --git a/fleece-worker/__init__.py b/fleece-worker/__init__.py new file mode 100644 index 0000000..80001ee --- /dev/null +++ b/fleece-worker/__init__.py @@ -0,0 +1,3 @@ +__version__ = "0.1.0" + +from .worker import Worker diff --git a/fleece-worker/__main__.py b/fleece-worker/__main__.py new file mode 100644 index 0000000..9bdb747 --- /dev/null +++ b/fleece-worker/__main__.py @@ -0,0 +1,187 @@ +from typing import List, Tuple, Optional +from fastapi import FastAPI, HTTPException +from peerrtc.peer import Peer +from pydantic import BaseModel +import anyio +import uvicorn +from .worker import Worker +from .__init__ import __version__ +import argparse +import requests +import json +import torch +import concurrent.futures +from anyio.from_thread import BlockingPortal + +app = FastAPI() 
+worker = Worker() + + +class LayersRequest(BaseModel): + layer_names: List[str] + + +def preload_layers(req: LayersRequest): + try: + worker.preload_layers(req.layer_names) + return None + except Exception as e: + print(e) + raise HTTPException(status_code=500, detail="Internal Server Error") + + +def unload_layers(req: LayersRequest): + try: + worker.unload_layers(req.layer_names) + return None + except Exception as e: + print(e) + raise HTTPException(status_code=500, detail="Internal Server Error") + + +class ForwardRequest(BaseModel): + task_id: str + plan: List[Tuple[str, List[str]]] + step: int + round: int = -1 + payload: Optional[List] = None + max_total_len: int = 2048 + temperature: float = 0.0 + top_p: float = 0.9 + task_manager_url: Optional[str] = None + signature: Optional[str] = None + timestamp: Optional[int] = None + + +executor = concurrent.futures.ThreadPoolExecutor(max_workers=64) + + +def forward(req: ForwardRequest): + try: + executor.submit(worker.forward, req.task_id, req.plan, req.step, req.round, req.payload, req.max_total_len, req.temperature, req.top_p, + req.task_manager_url, req.signature, req.timestamp) + return None + except Exception as e: + print(e) + raise HTTPException(status_code=500, detail="Internal Server Error") + + +class GetInfoRequest(BaseModel): + node_list: List[str] = [] + timeout: int = 30 + + +class GetInfoResponse(BaseModel): + worker_nickname: str + gpu_mem_info: Tuple[int, int] = [0, 0] + latency_list: List[Optional[float]] = [] + + +def get_info(req: GetInfoRequest) -> GetInfoResponse: + try: + worker_nickname, gpu_mem_info, latency_list = worker.get_info( + req.node_list, req.timeout + ) + return GetInfoResponse( + worker_nickname=worker_nickname, + gpu_mem_info=gpu_mem_info, + latency_list=latency_list, + ) + except Exception as e: + print(e) + raise HTTPException(status_code=500, detail="Internal Server Error") + + +async def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("-c", "--controller-url") + parser.add_argument("-w", "--worker-url") + parser.add_argument("-t", "--api-token") + parser.add_argument("--port") + parser.add_argument("--worker-nickname") + parser.add_argument("--heartbeat-interval") + args = parser.parse_args() + if args.worker_url is not None: + worker_url = args.worker_url + parsed = worker_url.split(':') + if len(parsed) >= 3: + port = int(parsed[2]) + else: + port = 8080 + else: + worker_url = "none" + port = 8080 + if args.port is not None: + port = int(args.port) + worker.port = port + if args.api_token is not None: + worker.api_token = args.api_token + if args.worker_nickname is not None: + worker.worker_nickname = args.worker_nickname + if args.heartbeat_interval is not None: + worker.heartbeat_interval = int(args.heartbeat_interval) + if args.controller_url is not None: + worker.controller_url = args.controller_url + data = {"url": worker_url, "version": __version__} + if worker.worker_nickname is not None: + data["nickname"] = worker.worker_nickname + if torch.cuda.is_available(): + model = torch.cuda.get_device_name() + memory = torch.cuda.mem_get_info() + data["gpu_model"] = model + data["gpu_total_memory"] = memory[1] + data["gpu_remaining_memory"] = memory[0] + else: + data["gpu_model"] = "CPU" + data["gpu_total_memory"] = 0 + data["gpu_remaining_memory"] = 0 + r = requests.post(f"{args.controller_url}/register_worker", + json=data, + headers={"api-token": worker.api_token}) + res = json.loads(r.content) + worker.worker_id = res["id"] + worker.pull_worker_url() + 
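The handlers above (`preload_layers`, `unload_layers`, `forward`, `get_info`) are also registered as HTTP routes further down when a public worker URL is configured, so they can be exercised directly over HTTP in addition to the README's `/forward` example. A minimal sketch using `requests`, assuming a worker listening on the default port 8080:

```python
import requests

base = "http://127.0.0.1:8080"  # assumed worker address; adjust to your -w / --port settings

# Preload two layer slices (mirrors the LayersRequest model).
requests.post(f"{base}/preload_layers", json={
    "layer_names": [
        "llama-2-7b-chat-slice/layers.0",
        "llama-2-7b-chat-slice/layers.1",
    ],
})

# Query nickname, GPU memory and latency measurements
# (mirrors GetInfoRequest / GetInfoResponse).
r = requests.post(f"{base}/get_info", json={"node_list": [], "timeout": 30})
print(r.json())
```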
worker.start_heartbeat_daemon() + worker.start_layer_forward_engine() + + print("Worker ID: ", worker.worker_id) + + r = requests.get( + f"{args.controller_url}/get_network_servers", + headers={"api-token": worker.api_token} + ) + + servers = json.loads(r.content) + signaling = servers["signaling"]["url"] + turns = servers["turn"] + async with BlockingPortal() as portal: + worker.async_portal = portal + async with anyio.create_task_group() as tg: + worker.peer = Peer( + worker.worker_id, + signaling, + [(turn["url"], turn["username"], turn["password"]) for turn in turns], + { + "preload_layers": preload_layers, + "unload_layers": unload_layers, + "forward": forward, + "get_info": get_info, + }, + tg, + ) + + # start the FastAPI server when public IP is available + if worker_url != "none": + app.add_api_route("/preload_layers", preload_layers, methods=["POST"]) + app.add_api_route("/unload_layers", unload_layers, methods=["POST"]) + app.add_api_route("/forward", forward, methods=["POST"]) + app.add_api_route("/get_info", get_info, methods=["POST"]) + + uviconfig = uvicorn.Config(app, host="0.0.0.0", port=port, access_log=True) + uviserver = uvicorn.Server(uviconfig) + tg.start_soon(uviserver.serve) + await portal.sleep_until_stopped() + + +if __name__ == '__main__': + anyio.run(main) diff --git a/fleece-worker/model.py b/fleece-worker/model.py new file mode 100644 index 0000000..9cd2a27 --- /dev/null +++ b/fleece-worker/model.py @@ -0,0 +1,502 @@ +# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. + +import math +from dataclasses import dataclass +from typing import Optional, Tuple, List + +import torch +import torch.nn.functional as F +from torch import nn + + +@dataclass +class ModelArgs: + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: Optional[int] = None + vocab_size: int = -1 # defined later by tokenizer + multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 + ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 + + max_batch_size: int = 32 + max_seq_len: int = 2048 + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim: int, eps: float = 1e-6): + """ + Initialize the RMSNorm normalization layer. + + Args: + dim (int): The dimension of the input tensor. + eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6. + + Attributes: + eps (float): A small value added to the denominator for numerical stability. + weight (nn.Parameter): Learnable scaling parameter. + + """ + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x): + """ + Apply the RMSNorm normalization to the input tensor. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The normalized tensor. + + """ + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + @torch.inference_mode() + def forward(self, x): + """ + Forward pass through the RMSNorm layer. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The output tensor after applying RMSNorm. + + """ + output = self._norm(x.float()).type_as(x) + return output * self.weight + + +def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): + """ + Precompute the frequency tensor for complex exponentials (cis) with given dimensions. + + This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' + and the end index 'end'. 
The 'theta' parameter scales the frequencies. + The returned tensor contains complex values in complex64 data type. + + Args: + dim (int): Dimension of the frequency tensor. + end (int): End index for precomputing frequencies. + theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0. + + Returns: + torch.Tensor: Precomputed frequency tensor with complex exponentials. + + + + + """ + freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) + t = torch.arange(end, device=freqs.device) # type: ignore + freqs = torch.outer(t, freqs).float() # type: ignore + freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 + return freqs_cis + + +def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): + """ + Reshape frequency tensor for broadcasting it with another tensor. + + This function reshapes the frequency tensor to have the same shape as the target tensor 'x' + for the purpose of broadcasting the frequency tensor during element-wise operations. + + Args: + freqs_cis (torch.Tensor): Frequency tensor to be reshaped. + x (torch.Tensor): Target tensor for broadcasting compatibility. + + Returns: + torch.Tensor: Reshaped frequency tensor. + + Raises: + AssertionError: If the frequency tensor doesn't match the expected shape. + AssertionError: If the target tensor 'x' doesn't have the expected number of dimensions. + """ + ndim = x.ndim + assert 0 <= 1 < ndim + assert freqs_cis.shape == (x.shape[1], x.shape[-1]) + shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] + return freqs_cis.view(*shape) + + +def apply_rotary_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Apply rotary embeddings to input tensors using the given frequency tensor. + + This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided + frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor + is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are + returned as real tensors. + + Args: + xq (torch.Tensor): Query tensor to apply rotary embeddings. + xk (torch.Tensor): Key tensor to apply rotary embeddings. + freqs_cis (torch.Tensor): Precomputed frequency tensor for complex exponentials. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings. + + + + """ + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) + xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) + freqs_cis = reshape_for_broadcast(freqs_cis, xq_) + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) + xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) + return xq_out.type_as(xq), xk_out.type_as(xk) + + +def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor: + """torch.repeat_interleave(x, dim=2, repeats=n_rep)""" + bs, slen, n_kv_heads, head_dim = x.shape + if n_rep == 1: + return x + return ( + x[:, :, :, None, :] + .expand(bs, slen, n_kv_heads, n_rep, head_dim) + .reshape(bs, slen, n_kv_heads * n_rep, head_dim) + ) + + +class Attention(nn.Module): + """Multi-head attention module.""" + + def __init__(self, args: ModelArgs): + """ + Initialize the Attention module. + + Args: + args (ModelArgs): Model configuration parameters. + + Attributes: + n_kv_heads (int): Number of key and value heads. + n_local_heads (int): Number of local query heads. 
+ n_local_kv_heads (int): Number of local key and value heads. + n_rep (int): Number of repetitions for local heads. + head_dim (int): Dimension size of each attention head. + wq (ColumnParallelLinear): Linear transformation for queries. + wk (ColumnParallelLinear): Linear transformation for keys. + wv (ColumnParallelLinear): Linear transformation for values. + wo (RowParallelLinear): Linear transformation for output. + cache_k (torch.Tensor): Cached keys for attention. + cache_v (torch.Tensor): Cached values for attention. + + """ + super().__init__() + self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads + # model_parallel_size = fs_init.get_model_parallel_world_size() + self.n_local_heads = args.n_heads # // model_parallel_size + self.n_local_kv_heads = self.n_kv_heads # // model_parallel_size + self.n_rep = self.n_local_heads // self.n_local_kv_heads + self.head_dim = args.dim // args.n_heads + + self.wq = torch.nn.utils.skip_init(nn.Linear, + args.dim, + args.n_heads * self.head_dim, + bias=False, + ) + self.wk = torch.nn.utils.skip_init(nn.Linear, + args.dim, + self.n_kv_heads * self.head_dim, + bias=False, + ) + self.wv = torch.nn.utils.skip_init(nn.Linear, + args.dim, + self.n_kv_heads * self.head_dim, + bias=False, + ) + self.wo = torch.nn.utils.skip_init(nn.Linear, + args.n_heads * self.head_dim, + args.dim, + bias=False, + ) + + # self.cache_k = torch.zeros( + # ( + # args.max_batch_size, + # args.max_seq_len, + # self.n_local_kv_heads, + # self.head_dim, + # ) + # ) + # self.cache_v = torch.zeros( + # ( + # args.max_batch_size, + # args.max_seq_len, + # self.n_local_kv_heads, + # self.head_dim, + # ) + # ) + + @torch.inference_mode() + def forward( + self, + x: torch.Tensor, + x_list: List[torch.Tensor], + start_pos_list: List[int], + global_freqs_cis: torch.Tensor, + kv_cache_list: List[Tuple[torch.Tensor, torch.Tensor]], + ): + """ + Forward pass of the attention module. + + Args: + x (torch.Tensor): Input tensor. + start_pos (int): Starting position for caching. + freqs_cis (torch.Tensor): Precomputed frequency tensor. + mask (torch.Tensor, optional): Attention mask tensor. + + Returns: + torch.Tensor: Output tensor after attention. 
+ + """ + xq_, xk_, xv_ = self.wq(x), self.wk(x), self.wv(x) + + start = 0 + output_list = [] + for i, x in enumerate(x_list): + bsz, seqlen, _ = x.shape + xq = xq_[start:start+(bsz * seqlen)].view(bsz, seqlen, self.n_local_heads, self.head_dim) + xk = xk_[start:start+(bsz * seqlen)].view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) + xv = xv_[start:start+(bsz * seqlen)].view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) + start += (bsz * seqlen) + + start_pos = start_pos_list[i] + kv_cache = kv_cache_list[i] + freqs_cis = global_freqs_cis[start_pos: start_pos + seqlen] + mask = None + if seqlen > 1: + mask = torch.full( + (1, 1, seqlen, seqlen), float("-inf"), device=x.device + ) + mask = torch.triu(mask, diagonal=start_pos + 1).type_as(x) + + xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis) + + # self.cache_k = self.cache_k.to(xq) + # self.cache_v = self.cache_v.to(xq) + + cache_k, cache_v = kv_cache + cache_k[:bsz, start_pos: start_pos + seqlen] = xk + cache_v[:bsz, start_pos: start_pos + seqlen] = xv + + keys = cache_k[:bsz, : start_pos + seqlen] + values = cache_v[:bsz, : start_pos + seqlen] + + # repeat k/v heads if n_kv_heads < n_heads + keys = repeat_kv(keys, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) + values = repeat_kv(values, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) + + xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + keys = keys.transpose(1, 2) + values = values.transpose(1, 2) + scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim) + if mask is not None: + scores = scores + mask # (bs, n_local_heads, seqlen, cache_len + seqlen) + scores = F.softmax(scores.float(), dim=-1).type_as(xq) + output = torch.matmul(scores, values) # (bs, n_local_heads, seqlen, head_dim) + output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1) + output_list.append(output) + _, _, sz = output_list[0].shape + output = torch.cat([x.view(-1, sz) for x in output_list]) + return self.wo(output) + + +class FeedForward(nn.Module): + def __init__( + self, + dim: int, + hidden_dim: int, + multiple_of: int, + ffn_dim_multiplier: Optional[float], + ): + """ + Initialize the FeedForward module. + + Args: + dim (int): Input dimension. + hidden_dim (int): Hidden dimension of the feedforward layer. + multiple_of (int): Value to ensure hidden dimension is a multiple of this value. + ffn_dim_multiplier (float, optional): Custom multiplier for hidden dimension. Defaults to None. + + Attributes: + w1 (ColumnParallelLinear): Linear transformation for the first layer. + w2 (RowParallelLinear): Linear transformation for the second layer. + w3 (ColumnParallelLinear): Linear transformation for the third layer. + + """ + super().__init__() + hidden_dim = int(2 * hidden_dim / 3) + # custom dim factor multiplier + if ffn_dim_multiplier is not None: + hidden_dim = int(ffn_dim_multiplier * hidden_dim) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + self.w1 = torch.nn.utils.skip_init(nn.Linear, + dim, hidden_dim, bias=False, + ) + self.w2 = torch.nn.utils.skip_init(nn.Linear, + hidden_dim, dim, bias=False, + ) + self.w3 = torch.nn.utils.skip_init(nn.Linear, + dim, hidden_dim, bias=False, + ) + + @torch.inference_mode() + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + +class TransformerBlock(nn.Module): + def __init__(self, args: ModelArgs): + """ + Initialize a TransformerBlock. + + Args: + layer_id (int): Identifier for the layer. + args (ModelArgs): Model configuration parameters. 
+ + Attributes: + n_heads (int): Number of attention heads. + dim (int): Dimension size of the model. + head_dim (int): Dimension size of each attention head. + attention (Attention): Attention module. + feed_forward (FeedForward): FeedForward module. + layer_id (int): Identifier for the layer. + attention_norm (RMSNorm): Layer normalization for attention output. + ffn_norm (RMSNorm): Layer normalization for feedforward output. + + """ + super().__init__() + self.n_heads = args.n_heads + self.dim = args.dim + self.head_dim = args.dim // args.n_heads + self.attention = Attention(args) + self.feed_forward = FeedForward( + dim=args.dim, + hidden_dim=4 * args.dim, + multiple_of=args.multiple_of, + ffn_dim_multiplier=args.ffn_dim_multiplier, + ) + # self.layer_id = layer_id + self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) + self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps) + + @torch.inference_mode() + def forward( + self, + x_list: List[torch.Tensor], + start_pos_list: List[int], + global_freqs_cis: torch.Tensor, + kv_cache_list: List[Tuple[torch.Tensor, torch.Tensor]], + ): + """ + Perform a forward pass through the TransformerBlock. + + Args: + x (torch.Tensor): Input tensor. + start_pos (int): Starting position for attention caching. + freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies. + mask (torch.Tensor, optional): Masking tensor for attention. Defaults to None. + + Returns: + torch.Tensor: Output tensor after applying attention and feedforward layers. + + """ + _, _, sz = x_list[0].shape + x = torch.cat([x.view(-1, sz) for x in x_list]) + h = x + self.attention.forward( + self.attention_norm(x), x_list, start_pos_list, global_freqs_cis, kv_cache_list + ) + out = h + self.feed_forward.forward(self.ffn_norm(h)) + out_list = [] + start = 0 + for x in x_list: + bsz, seqlen, sz = x.shape + out_list.append(out[start:start+(bsz * seqlen)].view(bsz, seqlen, sz)) + start += (bsz * seqlen) + return out_list + + +# class Transformer(nn.Module): +# def __init__(self, params: ModelArgs): +# """ +# Initialize a Transformer model. + +# Args: +# params (ModelArgs): Model configuration parameters. + +# Attributes: +# params (ModelArgs): Model configuration parameters. +# vocab_size (int): Vocabulary size. +# n_layers (int): Number of layers in the model. +# tok_embeddings (ParallelEmbedding): Token embeddings. +# layers (torch.nn.ModuleList): List of Transformer blocks. +# norm (RMSNorm): Layer normalization for the model output. +# output (ColumnParallelLinear): Linear layer for final output. +# freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies. + +# """ +# super().__init__() +# self.params = params +# self.vocab_size = params.vocab_size +# self.n_layers = params.n_layers + +# self.tok_embeddings = torch.nn.utils.skip_init(nn.Embedding, +# params.vocab_size, params.dim +# ) + +# self.layers = torch.nn.ModuleList() +# for _ in range(params.n_layers): +# self.layers.append(TransformerBlock(params)) + +# self.norm = RMSNorm(params.dim, eps=params.norm_eps) +# self.output = torch.nn.utils.skip_init(nn.Linear, +# params.dim, params.vocab_size, bias=False, +# ) + +# self.freqs_cis = precompute_freqs_cis( +# # Note that self.params.max_seq_len is multiplied by 2 because the token limit for the Llama 2 generation of models is 4096. +# # Adding this multiplier instead of using 4096 directly allows for dynamism of token lengths while training or fine-tuning. 
+# self.params.dim // self.params.n_heads, self.params.max_seq_len * 2 +# ) + +# @torch.inference_mode() +# def forward(self, tokens: torch.Tensor, start_pos: int): +# """ +# Perform a forward pass through the Transformer model. + +# Args: +# tokens (torch.Tensor): Input token indices. +# start_pos (int): Starting position for attention caching. + +# Returns: +# torch.Tensor: Output logits after applying the Transformer model. + +# """ +# _bsz, seqlen = tokens.shape +# h = self.tok_embeddings(tokens) +# self.freqs_cis = self.freqs_cis.to(h.device) +# freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen] + +# mask = None +# if seqlen > 1: +# mask = torch.full( +# (1, 1, seqlen, seqlen), float("-inf"), device=tokens.device +# ) +# mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h) + +# for layer in self.layers: +# h = layer(h, start_pos, freqs_cis, mask) +# h = self.norm(h) +# output = self.output(h).float() +# return output diff --git a/fleece-worker/tokenizer.py b/fleece-worker/tokenizer.py new file mode 100644 index 0000000..bb70677 --- /dev/null +++ b/fleece-worker/tokenizer.py @@ -0,0 +1,67 @@ +# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. + +import os +from logging import getLogger +from typing import List + +from sentencepiece import SentencePieceProcessor + + +logger = getLogger() + + +class Tokenizer: + """tokenizing and encoding/decoding text using SentencePiece.""" + def __init__(self, model_path: str): + """ + Initializes the Tokenizer with a SentencePiece model. + + Args: + model_path (str): The path to the SentencePiece model file. + """ + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + logger.info(f"Reloaded SentencePiece model from {model_path}") + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.pad_id() + logger.info( + f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" + ) + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + def encode(self, s: str, bos: bool, eos: bool) -> List[int]: + """ + Encodes a string into a list of token IDs. + + Args: + s (str): The input string to be encoded. + bos (bool): Whether to prepend the beginning-of-sequence token. + eos (bool): Whether to append the end-of-sequence token. + + Returns: + List[int]: A list of token IDs. + """ + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + """ + Decodes a list of token IDs into a string. + + Args: + t (List[int]): The list of token IDs to be decoded. + + Returns: + str: The decoded string. 
+ """ + return self.sp_model.decode(t) diff --git a/fleece-worker/worker.py b/fleece-worker/worker.py new file mode 100644 index 0000000..f01bede --- /dev/null +++ b/fleece-worker/worker.py @@ -0,0 +1,819 @@ +from typing import List, Optional, Tuple, Dict, Any, Set +import os +import torch +from torch import nn +from .model import ModelArgs, TransformerBlock, RMSNorm, precompute_freqs_cis +from peerrtc.peer import Peer +import requests +import threading +import concurrent.futures +import time +import socket +from urllib.parse import urlparse +import json +from cryptography.hazmat.primitives.asymmetric import ec +from cryptography.hazmat.primitives import hashes +import queue + +torch.set_default_device("cpu") + +llama_2_7b_args = {"dim": 4096, "multiple_of": 256, "n_heads": 32, "n_layers": 32, "norm_eps": 1e-06, "vocab_size": 32000} +llama_2_13b_args = {"dim": 5120, "multiple_of": 256, "n_heads": 40, "n_layers": 40, "norm_eps": 1e-05, "vocab_size": 32000} +llama_2_70b_args = {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": 32000} + +if torch.cuda.is_available(): + main_device = "cuda" + main_dtype = torch.float16 + torch.set_default_dtype(torch.float16) +else: + main_device = "cpu" + main_dtype = torch.float32 + torch.set_default_dtype(torch.float32) +global_freqs_cis = precompute_freqs_cis(128, 4096).to(main_device) +# tokenizer = Tokenizer(model_path="/home/ubuntu/llama/tokenizer.model") +# print(tokenizer.bos_id) 1 +# print(tokenizer.eos_id) 2 +# print(tokenizer.pad_id) -1 +# print(tokenizer.n_words) 32000 + + +def parse_layer_name(layer_name: str): + s = layer_name.split('/') + return s[0], s[1] + + +KV_CACHE_BLOCK = 512 + + +def get_kv_cache_length(cur, seqlen): + while cur < seqlen: + cur += KV_CACHE_BLOCK + return cur + + +def get_kv_cache(x, start_pos, kv_cache, model): + bsz, seqlen, _ = x.shape + if kv_cache is None: + length = get_kv_cache_length(0, start_pos + seqlen) + cache_k = torch.zeros( + ( + bsz, + length, + model.attention.n_local_kv_heads, + model.attention.head_dim, + ), + device=main_device + ) + cache_v = torch.zeros( + ( + bsz, + length, + model.attention.n_local_kv_heads, + model.attention.head_dim, + ), + device=main_device + ) + return (cache_k, cache_v) + old_cache_k, old_cache_v = kv_cache + if start_pos + seqlen > old_cache_k.shape[1]: + length = get_kv_cache_length(old_cache_k.shape[1], start_pos + seqlen) + cache_k = torch.zeros( + ( + bsz, + length, + model.attention.n_local_kv_heads, + model.attention.head_dim, + ), + device=main_device + ) + cache_v = torch.zeros( + ( + bsz, + length, + model.attention.n_local_kv_heads, + model.attention.head_dim, + ), + device=main_device + ) + cache_k[:, :start_pos, :, :], cache_v[:, :start_pos, :, :] = old_cache_k[:, :start_pos, :, :], old_cache_v[:, :start_pos, :, :] + del_tensor(old_cache_k) + del_tensor(old_cache_v) + del kv_cache + return (cache_k, cache_v) + else: + return kv_cache + + +def del_tensor(t): + t.detach() + t.grad = None + t.untyped_storage().resize_(0) + + +executor = concurrent.futures.ThreadPoolExecutor(max_workers=400) +executor_forward = concurrent.futures.ThreadPoolExecutor(max_workers=40) + + +def requests_post(url, headers=None, json=None, worker=None, to_worker_id=None): + try: + if to_worker_id is not None: + st = time.monotonic() + r = requests.post(url, headers=headers, json=json) + assert r.status_code == 200 + if to_worker_id is not None: + en = time.monotonic() + latency = (en-st)*1000 + 
worker.perf_network.append((to_worker_id, latency)) + except Exception: + if worker is not None: + worker.cancel_task(json["task_id"]) + + +def send_request(url, headers=None, json=None, exec=None, worker=None, to_worker_id=None): + if exec is None: + executor.submit(requests_post, url, headers, json, worker, to_worker_id) + else: + exec.submit(requests_post, url, headers, json, worker, to_worker_id) + + +executor_latency_test = concurrent.futures.ThreadPoolExecutor(max_workers=40) + + +def latency_test(host: str, port: int, timeout=60): + st = time.monotonic() + try: + s = socket.create_connection((host, port), timeout=timeout) + s.shutdown(socket.SHUT_RD) + except socket.timeout: + return None + except OSError: + return None + en = time.monotonic() + return (en-st)*1000 + + +def measure_latency(node_list: List[str], timeout): + # executor_latency_test + jobs = [] + for node in node_list: + parsed_url = urlparse(node) + host = parsed_url.hostname + if parsed_url.port is not None: + port = parsed_url.port + elif parsed_url.scheme == "http": + port = 80 + elif parsed_url.scheme == "https": + port = 443 + else: + port = 22 + jobs.append(executor_latency_test.submit(latency_test, host, port, timeout)) + ans = [] + for job in jobs: + ans.append(job.result()) + return ans + + +class LayerForward: + def __init__( + self, + h: torch.Tensor, + layer_names: List, + bsz: int, + is_new_task: bool, + round: int, + start_pos: int, + seqlen: int, + kv_cache_dict: Dict, + call_back_queue: queue.Queue, + ): + self.h = h + self.layer_names = layer_names + self.bsz = bsz + self.is_new_task = is_new_task + self.round = round + self.start_pos = start_pos + self.seqlen = seqlen + self.kv_cache_dict = kv_cache_dict + self.call_back_queue = call_back_queue + + +class Worker: + def __init__( + self, + worker_id: str = None, + # mirror_url: str = "TODO", + cache_dir: str = "~/.cache/fleece-worker/models", + ): + self.worker_id = worker_id + # self.mirror_url = mirror_url + self.controller_url = None + self.api_token = None + self.worker_nickname = worker_id + self.heartbeat_interval = 300 + self.tm_pubkeys = {} + self.worker_urls = {} + self.perf_computation = [] + self.perf_network = [] + self.peer: Optional[Peer] = None + self.async_portal = None + + self.cache_dir = os.path.expanduser(cache_dir) + self.layers = dict() + self.task_info: Dict[(str, int), Tuple[int, Dict[str, Any]]] = dict() + self.mutex = threading.Lock() + self.task_prompt_tokens: Dict[str, torch.Tensor] = dict() + self.task_eos_reached: Dict[str, torch.Tensor] = dict() + self.task_local_steps: Dict[str, List[int]] = dict() + self.task_update_queue: Dict[str, queue.Queue[Tuple[int, List[int]]]] = dict() + self.layer_forward_engine_queue: queue.Queue[LayerForward] = queue.Queue() + self.canceled_task: Set[str] = set() + + def fetch_layer(self, full_layer_name): + model_name, layer_name = parse_layer_name(full_layer_name) + if model_name.startswith("dummy"): + return None + path = os.path.join(self.cache_dir, model_name, f"{layer_name}.pt") + if not os.path.exists(path): # TODO lock + os.makedirs(os.path.join(self.cache_dir, model_name), exist_ok=True) + with requests.get(f"https://huggingface.co/colearn/{model_name}/resolve/main/{layer_name}.pt", stream=True) as r: + r.raise_for_status() + with open(path, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + f.close() + return path + + def preload_layers(self, layer_names: List[str]): + with self.mutex: # TODO + ths = [] + for full_layer_name in layer_names: + if 
full_layer_name in self.layers: + continue + th = executor.submit(self.fetch_layer, full_layer_name) + ths.append((full_layer_name, th)) + for full_layer_name, th in ths: + path = th.result() + model_name, layer_name = parse_layer_name(full_layer_name) + if model_name.startswith("dummy"): + continue + if model_name.startswith("llama-2-7b"): + model_args = ModelArgs(**llama_2_7b_args) + elif model_name.startswith("llama-2-13b"): + model_args = ModelArgs(**llama_2_13b_args) + elif model_name.startswith("llama-2-70b"): + model_args = ModelArgs(**llama_2_70b_args) + else: + raise NotImplementedError("Unknown model") + if layer_name == "tok_embeddings": + l = torch.nn.utils.skip_init(nn.Embedding, model_args.vocab_size, model_args.dim) + elif layer_name.startswith("layer"): + l = TransformerBlock(model_args) + elif layer_name == "norm": + l = RMSNorm(model_args.dim, eps=model_args.norm_eps) + elif layer_name == "output": + l = torch.nn.utils.skip_init(nn.Linear, model_args.dim, model_args.vocab_size, bias=False) + else: + raise NotImplementedError("Unknown layers") + l.load_state_dict(torch.load(path, map_location="cpu")) + l.to(main_device) + self.layers[full_layer_name] = l + + def unload_layers(self, layer_names: List[str]): + for full_layer_name in layer_names: + if full_layer_name not in self.layers: + continue # TODO continue or warning? + del self.layers[full_layer_name] + torch.cuda.empty_cache() + + def cancel_task(self, task_id: str): + self.del_task(task_id) + self.canceled_task.add(task_id) + + def del_task(self, task_id: str): + steps = self.task_local_steps.pop(task_id, None) + if steps is None: + return + if task_id in self.task_prompt_tokens: + del self.task_prompt_tokens[task_id] + if task_id in self.task_eos_reached: + del self.task_eos_reached[task_id] + if task_id in self.task_update_queue: + self.task_update_queue[task_id].put((None, None)) + for step in steps: + _, kv_cache_dict = self.task_info[(task_id, step)] + for _, kv_cache in kv_cache_dict.items(): + k_cache, v_cache = kv_cache + del_tensor(k_cache) + del_tensor(v_cache) + del self.task_info[(task_id, step)] + torch.cuda.empty_cache() + + def pull_worker_url(self): + r = requests.get(f"{self.controller_url}/get_worker_list", + headers={"api-token": self.api_token}) + res = json.loads(r.content) + for worker in res["workers"]: + self.worker_urls[worker["worker_id"]] = worker["url"] + + def get_worker_url(self, worker_id): + if worker_id not in self.worker_urls: + self.pull_worker_url() + return self.worker_urls.get(worker_id) + + def verify(self, tm_url, task_id, plan, timestamp, signature_hex): + public_key_bytes = bytes.fromhex(self.tm_pubkeys[tm_url]) + public_key = ec.EllipticCurvePublicKey.from_encoded_point( + ec.SECP256K1(), public_key_bytes + ) + signed_bytes = task_id.encode()+str(timestamp).encode() + for x in plan: + signed_bytes += x[0].encode() + for y in x[1]: + signed_bytes += y.encode() + try: + public_key.verify( + bytes.fromhex(signature_hex), + signed_bytes, + ec.ECDSA(hashes.SHA256()) + ) + return True + except: + return False + + def send_forward(self, to_worker_id, data): + if to_worker_id == self.worker_id: + executor.submit(self.forward, **data) + return + url = self.get_worker_url(to_worker_id) + if (url is not None and url != "none") and to_worker_id != self.worker_id: + if to_worker_id == self.worker_id: + # self.forward(**data) + send_request( + f"http://127.0.0.1:{self.port}/forward", + json=data, + exec=executor_forward, + worker=self, + to_worker_id=to_worker_id) + else: + send_request( 
+ f"{self.get_worker_url(to_worker_id)}/forward", + json=data, + exec=executor_forward, + worker=self, + to_worker_id=to_worker_id) + else: + async def send(): + connection = await self.peer.connect(to_worker_id) + reply = await connection.send("forward", data) + if reply.status_code != 200: + self.cancel_task(data["task_id"]) + self.async_portal.call(self.peer.tg.start_soon, send) + + def layer_forward_engine_step(self, task_list: List[LayerForward]): + task = task_list[0] + with torch.inference_mode(): + input_shapes = [list(t.h.shape) for t in task_list] + st = time.monotonic() + latency_dict = {} + for full_layer_name in task.layer_names: + layer_st = time.monotonic() + model_name, layer_name = parse_layer_name(full_layer_name) + if model_name.startswith("dummy"): + for t in task_list: + if layer_name == "output": + t.h = torch.zeros((t.bsz, 1, 32000), dtype=main_dtype, device=main_device) + t.h[:, :, t.round+10] = 1.0 + if t.round >= 320: + t.h = torch.zeros((t.bsz, 1, 32000), dtype=main_dtype, device=main_device) + t.h[:, :, 2] = 1.0 + # time.sleep(0.01) + continue + if layer_name == "tok_embeddings": + h = torch.cat([t.h.view(-1) for t in task_list]) + h = self.layers[full_layer_name](h) + sz = h.shape[-1] + start = 0 + for t in task_list: + bsz, seqlen = t.h.shape + t.h = h[start:start+(bsz * seqlen)].view(bsz, seqlen, sz) + start += (bsz * seqlen) + elif layer_name.startswith("layers."): + kv_cache_list = [] + for t in task_list: + if t.is_new_task: + if torch.cuda.is_available(): + gpu_mem_info = torch.cuda.mem_get_info() + if gpu_mem_info[0]/gpu_mem_info[1] < 0.05 and gpu_mem_info[0] < 2e9: + return None, None # TODO need fix + kv_cache_list.append(get_kv_cache(t.h, t.start_pos, None, self.layers[full_layer_name])) + else: + kv_cache_list.append(get_kv_cache(t.h, t.start_pos, t.kv_cache_dict[full_layer_name], self.layers[full_layer_name])) + h_list = self.layers[full_layer_name]([t.h for t in task_list], [t.start_pos for t in task_list], global_freqs_cis, kv_cache_list) + for i, t in enumerate(task_list): + t.h = h_list[i] + t.kv_cache_dict[full_layer_name] = kv_cache_list[i] + elif layer_name == "norm": + _, _, sz = task_list[0].h.shape + h = torch.cat([t.h.view(-1, sz) for t in task_list]) + h = self.layers[full_layer_name](h) + start = 0 + for t in task_list: + bsz, seqlen, sz = t.h.shape + t.h = h[start:start+(bsz * seqlen)].view(bsz, seqlen, sz) + start += (bsz * seqlen) + elif layer_name == "output": + _, _, sz = task_list[0].h.shape + h = torch.cat([t.h.view(-1, sz) for t in task_list]) + h = self.layers[full_layer_name](h) + sz = h.shape[-1] + start = 0 + for t in task_list: + bsz, seqlen, _ = t.h.shape + t.h = h[start:start+(bsz * seqlen)].view(bsz, seqlen, sz) + start += (bsz * seqlen) + else: + raise NotImplementedError("Unknown layers") + layer_en = time.monotonic() + latency = (layer_en-layer_st)*1000 + latency_dict[layer_name] = latency + #print(layer_name, input_shapes, latency) + en = time.monotonic() + latency = (en-st)*1000 + self.perf_computation.append(((str(task.layer_names), str(input_shapes)), latency)) + # print(((str(task.layer_names), str(input_shapes)), latency)) + print(f"{json.dumps(latency_dict)},") + for task in task_list: + task.call_back_queue.put((task.h, task.kv_cache_dict)) + + def layer_forward_engine(self): + q = self.layer_forward_engine_queue + while True: + task_list = [] + task = q.get() + total_bsz = task.bsz + task_list.append(task) + while True: + try: + task2 = q.get(block=False) + if task2.layer_names == task.layer_names: + 
task_list.append(task2) + total_bsz += task2.bsz + if total_bsz >= 16: + break + else: + q.put(task2) + break + except queue.Empty: + break + # print("layer_forward_engine_step: ", len(task_list)) + self.layer_forward_engine_step(task_list) + + def start_layer_forward_engine(self): + heartbeat_thread = threading.Thread(target=self.layer_forward_engine) + heartbeat_thread.daemon = True + heartbeat_thread.start() + + def layers_forward(self, h, layer_names, bsz, is_new_task, round, start_pos, seqlen, kv_cache_dict): + q = queue.Queue() + self.layer_forward_engine_queue.put(LayerForward(h, layer_names, bsz, is_new_task, round, start_pos, seqlen, kv_cache_dict, q)) + h, kv_cache_dict = q.get() + del q + return h, kv_cache_dict + + def send_update_task(self, task_manager_url, task_id, step): + q = self.task_update_queue[task_id] + while True: + output_tokens_list = [] + round, output_tokens = q.get() + if output_tokens is None: + break + output_tokens_list.append(output_tokens) + ret_flag = False + while True: + try: + _, output_tokens = q.get(block=False) + if output_tokens is None: + ret_flag = True + break + else: + output_tokens_list.append(output_tokens) + except queue.Empty: + break + # print("requests_post", round, output_tokens_list) + requests_post( + f"{task_manager_url}/update_task", + headers={"worker-id": self.worker_id, "api-token": self.api_token}, + json={ + "task_id": task_id, + "plan_current_step": step, + "plan_current_round": round, + "output_tokens": output_tokens_list, + }, + worker=self) + if ret_flag: + break + # TODO del self.task_update_queue[task_id]? + + def new_task_update(self, task_manager_url, task_id, _step, round, output_tokens): + if task_manager_url is not None: + self.task_update_queue[task_id].put((round, output_tokens)) + + def forward_same_node(self, delta_round, h, layer_names, bsz, is_new_task, round, start_pos, seqlen, kv_cache_dict, temperature, top_p, max_total_len, eos_reached, prompt_tokens, task_manager_url, task_id, step): + ans_tokens = [] + try: + for i in range(delta_round): + h, kv_cache_dict = self.layers_forward(h, layer_names, bsz, is_new_task, round+i, start_pos, seqlen, kv_cache_dict) + # last node + if temperature > 0: + probs = torch.softmax(h[:, -1] / temperature, dim=-1) + next_token = sample_top_p(probs, top_p) + else: + next_token = torch.argmax(h[:, -1], dim=-1) + next_token = next_token.reshape(-1) + if start_pos > max_total_len: + next_token = torch.tensor([2] * bsz, device=main_device) # FIXME fake max length limit + # print(next_token) + next_token = next_token + # eos_reached + if is_new_task: + eos_reached = torch.tensor([False] * bsz, device=main_device) + eos_reached |= next_token == 2 # eos_id + if all(eos_reached) or i == delta_round-1: + return h, kv_cache_dict, ans_tokens, eos_reached + + # loop + ans_tokens.append(next_token) + start_pos = start_pos+seqlen + seqlen = 1 + is_new_task = False + + # first node + tokens = torch.zeros((bsz, 1), dtype=torch.long, device=main_device) + for k, t in enumerate(prompt_tokens): + if len(t) > start_pos: + tokens[k, :] = torch.tensor([t[start_pos]], dtype=torch.long, device=main_device) + else: + tokens[k, :] = next_token[k] + h = tokens + return h, kv_cache_dict, ans_tokens, eos_reached + finally: + # update_task + for i, output_tokens in enumerate(ans_tokens): + self.new_task_update(task_manager_url, task_id, step, round+i, output_tokens.tolist()) + + def forward(self, + task_id: str, + plan: List[Tuple[str, List[str]]], + step: int, + round: int = -1, + payload: Optional[List] = 
None, + max_total_len: int = 2048, + temperature: float = 0.0, + top_p: float = 0.9, + task_manager_url: Optional[str] = None, + signature: Optional[str] = None, + timestamp: Optional[int] = None, + ): + # self.verify(task_manager_url, task_id, plan, timestamp, signature) + + index = step + is_new_task = round == 0 + if payload is None or task_id in self.canceled_task: + self.del_task(task_id) + if index < len(plan)-1: + # next node + self.send_forward( + plan[index+1][0], + data={ + "task_id": task_id, + "plan": plan, + "step": step+1, + "task_manager_url": task_manager_url, + "signature": signature, + "timestamp": timestamp, + }) + return + + if is_new_task: + if task_id in self.task_local_steps: + self.task_local_steps[task_id].append(step) + else: + self.task_local_steps[task_id] = [step] + self.task_info[(task_id, step)] = (0, dict()) + else: + if not task_id in self.task_local_steps: + return + start_pos, kv_cache_dict = self.task_info[(task_id, step)] + + # first node + if index == 0: + bsz = len(payload) + if is_new_task: + min_prompt_len = min(len(t) for t in payload) + self.task_prompt_tokens[task_id] = payload + tokens = torch.zeros((bsz, min_prompt_len), dtype=torch.long) + for k, t in enumerate(payload): + tokens[k, :] = torch.tensor(t[:min_prompt_len], dtype=torch.long) + h = tokens.to(main_device) + else: + prompt_tokens = self.task_prompt_tokens[task_id] + tokens = torch.zeros((bsz, 1), dtype=torch.long) + for k, t in enumerate(prompt_tokens): + if len(t) > start_pos: + tokens[k, :] = torch.tensor([t[start_pos]], dtype=torch.long) + else: + tokens[k, :] = torch.tensor([payload[k]], dtype=torch.long) + h = tokens.to(main_device) + # print(h) + bsz, seqlen = h.shape + else: + h = torch.tensor(payload, dtype=main_dtype, device=main_device) + if len(h.shape) > 2: + bsz, seqlen, _ = h.shape + else: + bsz, seqlen = h.shape + + # last node init + if index == len(plan)-1 and is_new_task: + self.task_eos_reached[task_id] = torch.tensor([False] * bsz) + self.task_update_queue[task_id] = queue.Queue() + executor.submit(self.send_update_task, task_manager_url, task_id, step) + + # forward + _, layer_names = plan[index] + self.preload_layers(layer_names) # preload + if len(plan) == 1: + delta_round = 16 + eos_reached = self.task_eos_reached[task_id].to(main_device) + prompt_tokens = self.task_prompt_tokens[task_id] + h, kv_cache_dict, tokens, eos_reached = self.forward_same_node(delta_round, h, layer_names, bsz, is_new_task, round, start_pos, seqlen, + kv_cache_dict, temperature, top_p, max_total_len, eos_reached, prompt_tokens, task_manager_url, task_id, step) + self.task_eos_reached[task_id] = eos_reached.to("cpu") + delta_round = len(tokens)+1 + round = round+delta_round-1 + start_pos = start_pos+delta_round-1 + else: + h, kv_cache_dict = self.layers_forward(h, layer_names, bsz, is_new_task, round, start_pos, seqlen, kv_cache_dict) + if h is None: + return + else: + self.task_info[(task_id, step)] = (start_pos+seqlen, kv_cache_dict) + + # last node + if index == len(plan)-1: + if temperature > 0: + probs = torch.softmax(h[:, -1] / temperature, dim=-1) + next_token = sample_top_p(probs, top_p) + else: + next_token = torch.argmax(h[:, -1], dim=-1) + next_token = next_token.reshape(-1) + if start_pos > max_total_len: + next_token = torch.tensor([2] * bsz) # FIXME fake max length limit + # print(next_token) + next_token = next_token.to("cpu") + + # eos_reached + self.task_eos_reached[task_id] |= next_token == 2 # eos_id + if not all(self.task_eos_reached[task_id]): + # next node + 
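For context, `plan` is the pipeline description carried by every forward request: a list of `(worker_url_or_id, layer_names)` pairs, with `step` indexing the current stage. The README shows a single-stage plan; the sketch below is a hypothetical two-stage split of the 7B model (worker addresses are placeholders, not part of the source):

```python
# Hypothetical two-stage plan: worker-a runs the embedding plus the first half of the
# transformer layers, worker-b runs the rest plus the final norm and output head.
plan = [
    ("http://worker-a:8080",
     ["llama-2-7b-chat-slice/tok_embeddings"]
     + [f"llama-2-7b-chat-slice/layers.{i}" for i in range(0, 16)]),
    ("http://worker-b:8080",
     [f"llama-2-7b-chat-slice/layers.{i}" for i in range(16, 32)]
     + ["llama-2-7b-chat-slice/norm", "llama-2-7b-chat-slice/output"]),
]
# Step 0 runs on worker-a, which posts the hidden states to worker-b (step 1);
# worker-b samples the next token and posts it back to plan[0][0] for the next round.
```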
self.send_forward( + plan[0][0], + data={ + "task_id": task_id, + "plan": plan, + "step": 0, + "round": round+1, + "payload": next_token.tolist(), + "max_total_len": max_total_len, + "temperature": temperature, + "top_p": top_p, + "task_manager_url": task_manager_url, + "signature": signature, + "timestamp": timestamp, + }) + else: + self.send_forward( + plan[0][0], + data={ + "task_id": task_id, + "plan": plan, + "step": 0, + "task_manager_url": task_manager_url, + "signature": signature, + "timestamp": timestamp, + }) + # update + if task_manager_url is not None: + self.new_task_update(task_manager_url, task_id, step, round, next_token.tolist()) + if all(self.task_eos_reached[task_id]): + self.cancel_task(task_id) + else: + # next node + self.send_forward( + plan[index+1][0], + data={ + "task_id": task_id, + "plan": plan, + "step": step+1, + "round": round, + "payload": h.tolist(), + "max_total_len": max_total_len, + "temperature": temperature, + "top_p": top_p, + "task_manager_url": task_manager_url, + "signature": signature, + "timestamp": timestamp, + }) + # update + # if task_manager_url is not None: + # send_request( + # f"{task_manager_url}/update_task", + # headers={"worker-id": self.worker_id, "api-token": self.api_token}, + # json={ + # "task_id": task_id, + # "plan_current_step": step, + # "plan_current_round": round, + # }, + # worker=self) + + def get_info(self, node_list, timeout): + gpu_mem_info = torch.cuda.mem_get_info() + latency_list = measure_latency(node_list, timeout) + return self.worker_nickname, gpu_mem_info, latency_list + + def send_heartbeat(self): + info_data = { + "loaded_layers": json.dumps(list(self.layers.keys())), + "perf_computation": [], + "perf_network": [] + } + + s = {} + for k, v in self.perf_computation: + if k not in s: + s[k] = [v, 1] + else: + s[k][0] += v + s[k][1] += 1 + for k, v in s.items(): + layers, input_shape = k + avg_latency = v[0]/v[1] + info_data["perf_computation"].append({"layers": layers, "input_shape": input_shape, "latency": avg_latency}) + s = {} + for k, v in self.perf_network: + if k not in s: + s[k] = [v, 1] + else: + s[k][0] += v + s[k][1] += 1 + for k, v in s.items(): + avg_latency = v[0]/v[1] + info_data["perf_network"].append({"to_worker_id": k, "latency": avg_latency}) + + if torch.cuda.is_available(): + memory = torch.cuda.mem_get_info() + info_data["gpu_remaining_memory"] = memory[0] + data = {"info_update": json.dumps(info_data)} + try: + r = requests.post(f"{self.controller_url}/worker_heartbeat", + json=data, + headers={"worker-id": self.worker_id, "api-token": self.api_token}) + res = json.loads(r.content) + self.tm_pubkeys = res["pubkeys"] + except: + pass + + def start_heartbeat_daemon(self): + def heartbeat_thread(): + while True: + self.send_heartbeat() + time.sleep(self.heartbeat_interval) + heartbeat_thread = threading.Thread(target=heartbeat_thread) + heartbeat_thread.daemon = True + heartbeat_thread.start() + + +def sample_top_p(probs, p): + """ + Perform top-p (nucleus) sampling on a probability distribution. + + Args: + probs (torch.Tensor): Probability distribution tensor. + p (float): Probability threshold for top-p sampling. + + Returns: + torch.Tensor: Sampled token indices. + + Note: + Top-p sampling selects the smallest set of tokens whose cumulative probability mass + exceeds the threshold p. The distribution is renormalized based on the selected tokens. 
+ + """ + probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) + probs_sum = torch.cumsum(probs_sort, dim=-1) + mask = probs_sum - probs_sort > p + probs_sort[mask] = 0.0 + probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True)) + next_token = torch.multinomial(probs_sort, num_samples=1) + next_token = torch.gather(probs_idx, -1, next_token) + return next_token diff --git a/results70b.json b/results70b.json new file mode 100644 index 0000000..0aff2a9 --- /dev/null +++ b/results70b.json @@ -0,0 +1,34 @@ +[ +{"tok_embeddings": 132.46827200055122, "layers.0": 392.12358091026545, "layers.1": 1.0147381108254194, "layers.2": 0.8739551994949579, "layers.3": 0.8340240456163883, "layers.4": 0.8178339339792728, "norm": 0.0918020959943533, "output": 0.31708599999547005}, +{"tok_embeddings": 0.24701422080397606, "layers.0": 12.8053049556911, "layers.1": 0.6413219962269068, "layers.2": 0.5597490817308426, "layers.3": 0.560190062969923, "layers.4": 0.5524598527699709, "norm": 0.0824520830065012, "output": 0.06840098649263382}, +{"tok_embeddings": 0.06035109981894493, "layers.0": 0.6257810164242983, "layers.1": 0.5724600050598383, "layers.2": 0.5363801028579473, "layers.3": 0.5557099357247353, "layers.4": 0.5436590872704983, "norm": 0.08357199840247631, "output": 0.05430099554359913}, +{"tok_embeddings": 0.05133100785315037, "layers.0": 0.5779811181128025, "layers.1": 0.6071100942790508, "layers.2": 0.5711899138987064, "layers.3": 0.5672900006175041, "layers.4": 0.5642001051455736, "norm": 0.0828220508992672, "output": 0.052310992032289505}, +{"tok_embeddings": 0.05512102507054806, "layers.0": 0.5960201378911734, "layers.1": 0.5376201588660479, "layers.2": 0.5307591054588556, "layers.3": 0.513899140059948, "layers.4": 0.5152788944542408, "norm": 0.0791219063103199, "output": 0.04984019324183464}, +{"tok_embeddings": 0.08338200859725475, "layers.0": 0.6141711492091417, "layers.1": 0.5517101380974054, "layers.2": 0.5396390333771706, "layers.3": 0.5587700288742781, "layers.4": 0.5183890461921692, "norm": 0.07874099537730217, "output": 0.06558094173669815}, +{"tok_embeddings": 0.07111113518476486, "layers.0": 0.6926530040800571, "layers.1": 0.5749000702053308, "layers.2": 0.5479687824845314, "layers.3": 0.5450299941003323, "layers.4": 0.568330055102706, "norm": 0.08326093666255474, "output": 0.0534809660166502}, +{"tok_embeddings": 0.05201087333261967, "layers.0": 0.600920058786869, "layers.1": 0.5644389893859625, "layers.2": 0.5344299133867025, "layers.3": 0.558760017156601, "layers.4": 0.5477489903569221, "norm": 0.08208188228309155, "output": 0.05519110709428787}, +{"tok_embeddings": 0.07023103535175323, "layers.0": 0.6054709665477276, "layers.1": 0.5414490588009357, "layers.2": 0.6071908865123987, "layers.3": 0.5563800223171711, "layers.4": 0.5196691490709782, "norm": 0.07816101424396038, "output": 0.050680944696068764}, +{"tok_embeddings": 0.06949109956622124, "layers.0": 0.6041810847818851, "layers.1": 0.5441089160740376, "layers.2": 0.5413498729467392, "layers.3": 0.5283788777887821, "layers.4": 0.523529015481472, "norm": 0.09996187873184681, "output": 0.051520997658371925}, +{"tok_embeddings": 0.06822100840508938, "layers.0": 0.6095210555940866, "layers.1": 0.5651300307363272, "layers.2": 0.5354089662432671, "layers.3": 0.5321300122886896, "layers.4": 0.5228291265666485, "norm": 0.0791910570114851, "output": 0.05111098289489746}, +{"tok_embeddings": 0.06980099715292454, "layers.0": 0.6150510162115097, "layers.1": 0.5318101029843092, "layers.2": 0.5520491395145655, "layers.3": 
0.5253399722278118, "layers.4": 0.5222889594733715, "norm": 0.07795100100338459, "output": 0.05043088458478451}, +{"tok_embeddings": 0.06825104355812073, "layers.0": 0.6278208456933498, "layers.1": 0.5434800405055285, "layers.2": 0.520539004355669, "layers.3": 0.5260088946670294, "layers.4": 0.5285800434648991, "norm": 0.07954100146889687, "output": 0.04992098547518253}, +{"tok_embeddings": 0.0663509126752615, "layers.0": 0.5940399132668972, "layers.1": 0.5645400378853083, "layers.2": 0.534560065716505, "layers.3": 0.5285190418362617, "layers.4": 0.5172088276594877, "norm": 0.07946114055812359, "output": 0.05083112046122551}, +{"tok_embeddings": 0.07229112088680267, "layers.0": 0.6048411596566439, "layers.1": 0.546569935977459, "layers.2": 0.5251590628176928, "layers.3": 0.518738990649581, "layers.4": 0.5105691961944103, "norm": 0.07849186658859253, "output": 0.05001993849873543}, +{"tok_embeddings": 0.08632196113467216, "layers.0": 8.228644030168653, "layers.1": 0.5737198516726494, "layers.2": 0.5500998813658953, "layers.3": 0.6075110286474228, "layers.4": 0.5626101046800613, "norm": 0.0840409193187952, "output": 0.05257106386125088}, +{"layers.0": 0.8622349705547094, "layers.1": 0.6534119602292776}, +{"layers.0": 0.5857399664819241, "layers.1": 0.5359489005059004}, +{"layers.0": 0.5605400074273348, "layers.1": 0.5521089769899845}, +{"layers.0": 0.6248410791158676, "layers.1": 0.5389989819377661}, +{"layers.0": 0.5698299501091242, "layers.1": 0.5465799476951361}, +{"layers.0": 0.5561099387705326, "layers.1": 0.5448088049888611}, +{"layers.0": 0.5750500131398439, "layers.1": 0.5575299728661776}, +{"layers.0": 0.5517899990081787, "layers.1": 0.5218391306698322}, +{"layers.0": 0.5623099859803915, "layers.1": 0.5315791349858046}, +{"layers.0": 0.5510689225047827, "layers.1": 0.5673600826412439}, +{"layers.0": 0.5614799447357655, "layers.1": 0.5163189489394426}, +{"layers.0": 0.544459093362093, "layers.1": 0.516268890351057}, +{"layers.0": 0.5456290673464537, "layers.1": 0.5126188043504953}, +{"layers.0": 0.5418600048869848, "layers.1": 0.5207490175962448}, +{"layers.0": 0.5410588346421719, "layers.1": 0.5282601341605186}, +{"layers.0": 0.5437799263745546, "layers.1": 0.5348790436983109}, +] diff --git a/results7b.json b/results7b.json new file mode 100644 index 0000000..f38d765 --- /dev/null +++ b/results7b.json @@ -0,0 +1,18 @@ +[ +{"tok_embeddings": 133.77531105652452, "layers.0": 402.2816689684987, "layers.1": 1.1048070155084133, "layers.2": 1.0726859327405691, "layers.3": 0.6777010858058929, "layers.4": 0.8688329253345728, "layers.5": 0.7522520609200001, "layers.6": 0.6586799863725901, "layers.7": 0.8490930777043104, "layers.8": 0.6512098480015993, "layers.9": 0.8182330057024956, "layers.10": 0.650809844955802, "layers.11": 0.6213991437107325, "layers.12": 0.8493340574204922, "layers.13": 0.6158389151096344, "layers.14": 0.8113631047308445, "layers.15": 0.6511998362839222, "layers.16": 0.6519099697470665, "layers.17": 0.8248728699982166, "layers.18": 0.6009689532220364, "layers.19": 0.7761919405311346, "layers.20": 0.6489201914519072, "layers.21": 0.5974390078336, "layers.22": 0.8338131010532379, "layers.23": 0.6428598426282406, "layers.24": 0.7889720145612955, "layers.25": 0.6134789437055588, "layers.26": 0.6719310767948627, "layers.27": 0.8147219195961952, "layers.28": 0.656649935990572, "layers.29": 0.786392018198967, "layers.30": 0.6577011663466692, "layers.31": 0.6629100535064936, "norm": 0.08744117803871632, "output": 0.25055394507944584}, +{"tok_embeddings": 0.25886413641273975, 
"layers.0": 14.126758091151714, "layers.1": 0.5567579064518213, "layers.2": 0.5120080895721912, "layers.3": 0.4951478913426399, "layers.4": 0.5226579960435629, "layers.5": 0.5217681173235178, "layers.6": 0.4922978114336729, "layers.7": 0.5028669256716967, "layers.8": 0.521278940141201, "layers.9": 0.5186379421502352, "layers.10": 0.5007470026612282, "layers.11": 0.48678810708224773, "layers.12": 0.5160279106348753, "layers.13": 0.5163580644875765, "layers.14": 0.5059880204498768, "layers.15": 0.48464699648320675, "layers.16": 0.4835869185626507, "layers.17": 0.49757794477045536, "layers.18": 0.487977871671319, "layers.19": 0.4847869277000427, "layers.20": 0.5013379268348217, "layers.21": 0.49430783838033676, "layers.22": 0.4873969592154026, "layers.23": 0.4929581191390753, "layers.24": 0.4918668419122696, "layers.25": 0.4806080833077431, "layers.26": 0.506256939843297, "layers.27": 0.4845680668950081, "layers.28": 0.49097719602286816, "layers.29": 0.4887881223112345, "layers.30": 0.4929478745907545, "layers.31": 0.49031712114810944, "norm": 0.08032121695578098, "output": 0.07132207974791527}, +{"tok_embeddings": 0.07808092050254345, "layers.0": 0.5829490255564451, "layers.1": 0.5110581405460835, "layers.2": 0.4991281311959028, "layers.3": 0.511018093675375, "layers.4": 0.4964270628988743, "layers.5": 0.49457792192697525, "layers.6": 0.4898079205304384, "layers.7": 0.4995970521122217, "layers.8": 0.4803980700671673, "layers.9": 0.4822670016437769, "layers.10": 0.5084581207484007, "layers.11": 0.5112180951982737, "layers.12": 0.5767191760241985, "layers.13": 0.5251381080597639, "layers.14": 0.5042569246143103, "layers.15": 0.5145880859345198, "layers.16": 0.4982580430805683, "layers.17": 0.5084879230707884, "layers.18": 0.52289804443717, "layers.19": 0.5135280080139637, "layers.20": 0.5090979393571615, "layers.21": 0.5032180342823267, "layers.22": 0.497286906465888, "layers.23": 0.49345800653100014, "layers.24": 0.5609290674328804, "layers.25": 0.5108281038701534, "layers.26": 0.5159680731594563, "layers.27": 0.5274179857224226, "layers.28": 0.49842684529721737, "layers.29": 0.49710716120898724, "layers.30": 0.5029779858887196, "layers.31": 0.4949269350618124, "norm": 0.08296198211610317, "output": 0.058281002566218376}, +{"tok_embeddings": 0.05782092921435833, "layers.0": 0.5641088355332613, "layers.1": 0.5305178929120302, "layers.2": 0.5112681537866592, "layers.3": 0.5179480649530888, "layers.4": 0.5279979668557644, "layers.5": 0.5121780559420586, "layers.6": 0.5040778778493404, "layers.7": 0.5284280050545931, "layers.8": 0.5461680702865124, "layers.9": 0.540419016033411, "layers.10": 0.514667946845293, "layers.11": 0.504316994920373, "layers.12": 0.5180980078876019, "layers.13": 0.5044881254434586, "layers.14": 0.5050678737461567, "layers.15": 0.5687391385436058, "layers.16": 0.5193180404603481, "layers.17": 0.5170081276446581, "layers.18": 0.5248480010777712, "layers.19": 0.5077568348497152, "layers.20": 0.5215879064053297, "layers.21": 0.5074669606983662, "layers.22": 0.5532889626920223, "layers.23": 0.5744590889662504, "layers.24": 0.5265881773084402, "layers.25": 0.5033279303461313, "layers.26": 0.5017069634050131, "layers.27": 0.49726804718375206, "layers.28": 0.5186179187148809, "layers.29": 0.5054478533565998, "layers.30": 0.4934880416840315, "layers.31": 0.4904370289295912, "norm": 0.08429191075265408, "output": 0.0541098415851593}, +{"tok_embeddings": 0.05355989560484886, "layers.0": 0.5391789600253105, "layers.1": 0.5103279836475849, "layers.2": 0.5088369362056255, "layers.3": 
0.5184579640626907, "layers.4": 0.5156078841537237, "layers.5": 0.49602799117565155, "layers.6": 0.5199678707867861, "layers.7": 0.5305579397827387, "layers.8": 0.4996981006115675, "layers.9": 0.5012580659240484, "layers.10": 0.4946871194988489, "layers.11": 0.4915781319141388, "layers.12": 0.5080080591142178, "layers.13": 0.4991181194782257, "layers.14": 0.49671693705022335, "layers.15": 0.5133680533617735, "layers.16": 0.5153880920261145, "layers.17": 0.5168979987502098, "layers.18": 0.49387780018150806, "layers.19": 0.4988268483430147, "layers.20": 0.4949779249727726, "layers.21": 0.5025679711252451, "layers.22": 0.5052578635513783, "layers.23": 0.5061570554971695, "layers.24": 0.49966806545853615, "layers.25": 0.5772488657385111, "layers.26": 0.5245979409664869, "layers.27": 0.5078378599137068, "layers.28": 0.5049179308116436, "layers.29": 0.5053370259702206, "layers.30": 0.5198880098760128, "layers.31": 0.5494190845638514, "norm": 0.08530099876224995, "output": 0.05374103784561157}, +{"tok_embeddings": 0.056680990383028984, "layers.0": 0.5709491670131683, "layers.1": 0.5235681310296059, "layers.2": 0.5080481059849262, "layers.3": 0.5038881208747625, "layers.4": 0.5027868319302797, "layers.5": 0.5833778996020555, "layers.6": 0.512078171595931, "layers.7": 0.5139179993420839, "layers.8": 0.501308124512434, "layers.9": 0.4995879717171192, "layers.10": 0.5271679256111383, "layers.11": 0.5004068370908499, "layers.12": 0.4959581419825554, "layers.13": 0.5332180298864841, "layers.14": 0.4974980838596821, "layers.15": 0.5099677946418524, "layers.16": 0.4887368995696306, "layers.17": 0.4921581130474806, "layers.18": 0.49385707825422287, "layers.19": 0.4932780284434557, "layers.20": 0.4933380987495184, "layers.21": 0.513377832248807, "layers.22": 0.5028869491070509, "layers.23": 0.49674813635647297, "layers.24": 0.5291379056870937, "layers.25": 0.5237578880041838, "layers.26": 0.4944480024278164, "layers.27": 0.4949078429490328, "layers.28": 0.5121978465467691, "layers.29": 0.514887971803546, "layers.30": 0.5157378036528826, "layers.31": 0.49460679292678833, "norm": 0.11614197865128517, "output": 0.06683100946247578}, +{"tok_embeddings": 0.06313109770417213, "layers.0": 0.6511597894132137, "layers.1": 0.5153280217200518, "layers.2": 0.5225581116974354, "layers.3": 0.565278809517622, "layers.4": 0.5046972073614597, "layers.5": 0.5095279775559902, "layers.6": 0.5027079023420811, "layers.7": 0.5522989667952061, "layers.8": 0.5077279638499022, "layers.9": 0.5062469281256199, "layers.10": 0.5356390029191971, "layers.11": 0.5339081399142742, "layers.12": 0.4892069846391678, "layers.13": 0.5117780528962612, "layers.14": 0.5008378066122532, "layers.15": 0.5009581800550222, "layers.16": 0.5027770530432463, "layers.17": 0.48570800572633743, "layers.18": 0.5030378233641386, "layers.19": 0.5013169720768929, "layers.20": 0.47839805483818054, "layers.21": 0.4872470162808895, "layers.22": 0.4778879228979349, "layers.23": 0.4957672208547592, "layers.24": 0.515857944265008, "layers.25": 0.48025790601968765, "layers.26": 0.48792711459100246, "layers.27": 0.4786879289895296, "layers.28": 0.48324698582291603, "layers.29": 0.5002780817449093, "layers.30": 0.5099878180772066, "layers.31": 0.4820569884032011, "norm": 0.0816909596323967, "output": 0.055750831961631775}, +{"tok_embeddings": 0.07680198177695274, "layers.0": 0.5802388768643141, "layers.1": 0.5302478093653917, "layers.2": 0.49710716120898724, "layers.3": 0.5099079571664333, "layers.4": 0.49420795403420925, "layers.5": 0.4847471136599779, "layers.6": 
0.4807279910892248, "layers.7": 0.4759770818054676, "layers.8": 0.49752811901271343, "layers.9": 0.49185706302523613, "layers.10": 0.48106792382895947, "layers.11": 0.48492709174752235, "layers.12": 0.5154281388968229, "layers.13": 0.5356292240321636, "layers.14": 0.5305279046297073, "layers.15": 0.5148481577634811, "layers.16": 0.4874670412391424, "layers.17": 0.4944282118231058, "layers.18": 0.511527992784977, "layers.19": 0.48093684017658234, "layers.20": 0.47951797023415565, "layers.21": 0.5234479904174805, "layers.22": 0.4818870220333338, "layers.23": 0.47854799777269363, "layers.24": 0.4814069252461195, "layers.25": 0.513328006491065, "layers.26": 0.4882768262177706, "layers.27": 0.4766280762851238, "layers.28": 0.47937710769474506, "layers.29": 0.47492701560258865, "layers.30": 0.48475805670022964, "layers.31": 0.49095693975687027, "norm": 0.07969187572598457, "output": 0.05201995372772217}, +{"tok_embeddings": 0.07244083099067211, "layers.0": 0.5808889400213957, "layers.1": 0.5090481135994196, "layers.2": 0.49917795695364475, "layers.3": 0.48919697292149067, "layers.4": 0.48438808880746365, "layers.5": 0.5048280581831932, "layers.6": 0.4804569762200117, "layers.7": 0.47842785716056824, "layers.8": 0.49058697186410427, "layers.9": 0.5849190056324005, "layers.10": 0.5139878485351801, "layers.11": 0.5031479522585869, "layers.12": 0.5052580963820219, "layers.13": 0.5065870936959982, "layers.14": 0.4996981006115675, "layers.15": 0.494958134368062, "layers.16": 0.511478167027235, "layers.17": 0.49136695452034473, "layers.18": 0.5129079800099134, "layers.19": 0.49522798508405685, "layers.20": 0.5035370122641325, "layers.21": 0.5501389969140291, "layers.22": 0.5098478868603706, "layers.23": 0.513467937707901, "layers.24": 0.51497807726264, "layers.25": 0.5162679590284824, "layers.26": 0.49596698954701424, "layers.27": 0.5163978785276413, "layers.28": 0.49095810391008854, "layers.29": 0.5098681431263685, "layers.30": 0.495067797601223, "layers.31": 0.5138481501489878, "norm": 0.0916910357773304, "output": 0.058311037719249725}, +{"tok_embeddings": 0.05343090742826462, "layers.0": 0.5651288665831089, "layers.1": 0.5044969730079174, "layers.2": 0.5041379481554031, "layers.3": 0.4995279014110565, "layers.4": 0.5114779341965914, "layers.5": 0.5629679653793573, "layers.6": 0.5789890419691801, "layers.7": 0.5088781472295523, "layers.8": 0.5013779737055302, "layers.9": 0.5251278635114431, "layers.10": 0.5228479858487844, "layers.11": 0.5010578315705061, "layers.12": 0.5001579411327839, "layers.13": 0.494837062433362, "layers.14": 0.5318887997418642, "layers.15": 0.49437698908150196, "layers.16": 0.521698035299778, "layers.17": 0.5142178852111101, "layers.18": 0.502958195284009, "layers.19": 0.4991469904780388, "layers.20": 0.5857879295945168, "layers.21": 0.5609379149973392, "layers.22": 0.5067079328000546, "layers.23": 0.48958812840282917, "layers.24": 0.48529705964028835, "layers.25": 0.48539694398641586, "layers.26": 0.47747790813446045, "layers.27": 0.49897702410817146, "layers.28": 0.492617953568697, "layers.29": 0.4887678660452366, "layers.30": 0.4831769037991762, "layers.31": 0.47621713019907475, "norm": 0.07929187268018723, "output": 0.05025998689234257}, +{"tok_embeddings": 0.07366109639406204, "layers.0": 0.586668960750103, "layers.1": 0.5091880448162556, "layers.2": 0.49747806042432785, "layers.3": 0.49355695955455303, "layers.4": 0.511248130351305, "layers.5": 0.4900780040770769, "layers.6": 0.4869371186941862, "layers.7": 0.5162379238754511, "layers.8": 0.47967792488634586, 
"layers.9": 0.505656935274601, "layers.10": 0.4829780664294958, "layers.11": 0.4858570173382759, "layers.12": 0.5322881042957306, "layers.13": 0.4997279029339552, "layers.14": 0.4815279971808195, "layers.15": 0.4845771472901106, "layers.16": 0.4802269395440817, "layers.17": 0.4778779111802578, "layers.18": 0.47939689829945564, "layers.19": 0.4776869900524616, "layers.20": 0.4753079265356064, "layers.21": 0.49345684237778187, "layers.22": 0.4845280200242996, "layers.23": 0.47683692537248135, "layers.24": 0.48457784578204155, "layers.25": 0.49689807929098606, "layers.26": 0.48936717212200165, "layers.27": 0.4957679193466902, "layers.28": 0.49699796363711357, "layers.29": 0.4983569961041212, "layers.30": 0.47920807264745235, "layers.31": 0.5091980565339327, "norm": 0.08000107482075691, "output": 0.050680944696068764}, +{"tok_embeddings": 0.07211090996861458, "layers.0": 0.5650490056723356, "layers.1": 0.5068678874522448, "layers.2": 0.49314694479107857, "layers.3": 0.5098080728203058, "layers.4": 0.48576807603240013, "layers.5": 0.4817370790988207, "layers.6": 0.4837580490857363, "layers.7": 0.4849969409406185, "layers.8": 0.5698790773749352, "layers.9": 0.513097969815135, "layers.10": 0.5015679635107517, "layers.11": 0.520787900313735, "layers.12": 0.5318480543792248, "layers.13": 0.5720490589737892, "layers.14": 0.5089179612696171, "layers.15": 0.516227912157774, "layers.16": 0.5020070821046829, "layers.17": 0.516087980940938, "layers.18": 0.4964780528098345, "layers.19": 0.5484779831022024, "layers.20": 0.5206081550568342, "layers.21": 0.49668806605041027, "layers.22": 0.4932170268148184, "layers.23": 0.4937378689646721, "layers.24": 0.49441796727478504, "layers.25": 0.49575697630643845, "layers.26": 0.4908179398626089, "layers.27": 0.5107279866933823, "layers.28": 0.49443705938756466, "layers.29": 0.49729784950613976, "layers.30": 0.5114679224789143, "layers.31": 0.5105878226459026, "norm": 0.0832010991871357, "output": 0.05387095734477043}, +{"tok_embeddings": 0.05552009679377079, "layers.0": 0.5515790544450283, "layers.1": 0.5244079511612654, "layers.2": 0.5384080577641726, "layers.3": 0.5013481713831425, "layers.4": 0.5199580918997526, "layers.5": 0.566649017855525, "layers.6": 0.5772490985691547, "layers.7": 0.5145978648215532, "layers.8": 0.514667946845293, "layers.9": 0.5241681355983019, "layers.10": 0.5071568302810192, "layers.11": 0.5403480026870966, "layers.12": 0.5017179064452648, "layers.13": 0.5002471152693033, "layers.14": 0.49736793152987957, "layers.15": 0.49911788664758205, "layers.16": 0.49409782513976097, "layers.17": 0.5232179537415504, "layers.18": 0.5014671478420496, "layers.19": 0.5310990381985903, "layers.20": 0.5518780089914799, "layers.21": 0.5052478518337011, "layers.22": 0.4980668891221285, "layers.23": 0.49820798449218273, "layers.24": 0.49391784705221653, "layers.25": 0.5102681461721659, "layers.26": 0.5211180541664362, "layers.27": 0.4971169400960207, "layers.28": 0.5000978708267212, "layers.29": 0.49242679961025715, "layers.30": 0.49287802539765835, "layers.31": 0.49430783838033676, "norm": 0.08192099630832672, "output": 0.05440087988972664}, +{"tok_embeddings": 0.057791126891970634, "layers.0": 0.591119984164834, "layers.1": 0.5163080058991909, "layers.2": 0.5168679635971785, "layers.3": 0.5120181012898684, "layers.4": 0.49940706230700016, "layers.5": 0.5001579411327839, "layers.6": 0.5006981082260609, "layers.7": 0.5136679392307997, "layers.8": 0.5065079312771559, "layers.9": 0.5006571300327778, "layers.10": 0.49648783169686794, "layers.11": 
0.5216279532760382, "layers.12": 0.5585479084402323, "layers.13": 0.5619281437247992, "layers.14": 0.4869180265814066, "layers.15": 0.5037169903516769, "layers.16": 0.5056979134678841, "layers.17": 0.4817380104213953, "layers.18": 0.4806669894605875, "layers.19": 0.4897171165794134, "layers.20": 0.4851480480283499, "layers.21": 0.47645694576203823, "layers.22": 0.47764810733497143, "layers.23": 0.49361702986061573, "layers.24": 0.4797480069100857, "layers.25": 0.4764171317219734, "layers.26": 0.5170679651200771, "layers.27": 0.4806679207831621, "layers.28": 0.4746268969029188, "layers.29": 0.4815168213099241, "layers.30": 0.4770581144839525, "layers.31": 0.5051279440522194, "norm": 0.0818711705505848, "output": 0.051890965551137924}, +{"tok_embeddings": 0.08022086694836617, "layers.0": 0.5796288605779409, "layers.1": 0.5070380866527557, "layers.2": 0.5120879504829645, "layers.3": 0.4891669377684593, "layers.4": 0.4904880188405514, "layers.5": 0.4935779143124819, "layers.6": 0.5088970065116882, "layers.7": 0.48564793542027473, "layers.8": 0.48619811423122883, "layers.9": 0.4887368995696306, "layers.10": 0.48026791773736477, "layers.11": 0.5062269046902657, "layers.12": 0.5086779128760099, "layers.13": 0.5158479325473309, "layers.14": 0.49506803043186665, "layers.15": 0.489488011226058, "layers.16": 0.4859070759266615, "layers.17": 0.482786912471056, "layers.18": 0.48920814879238605, "layers.19": 0.5033579654991627, "layers.20": 0.5634380504488945, "layers.21": 0.5125878378748894, "layers.22": 0.5251578986644745, "layers.23": 0.5078781396150589, "layers.24": 0.500258058309555, "layers.25": 0.5006580613553524, "layers.26": 0.4950570873916149, "layers.27": 0.4961180966347456, "layers.28": 0.4982168320566416, "layers.29": 0.5017479415982962, "layers.30": 0.566418981179595, "layers.31": 0.5019279196858406, "norm": 0.08341111242771149, "output": 0.05380110815167427}, +{"tok_embeddings": 0.06145099177956581, "layers.0": 0.6021491717547178, "layers.1": 0.5406790878623724, "layers.2": 0.5251881666481495, "layers.3": 0.5085079465061426, "layers.4": 0.6208589766174555, "layers.5": 0.5262380000203848, "layers.6": 0.494728097692132, "layers.7": 0.5054578650742769, "layers.8": 0.49643684178590775, "layers.9": 0.48234802670776844, "layers.10": 0.47958712093532085, "layers.11": 0.48292684368789196, "layers.12": 0.49779703840613365, "layers.13": 0.491937855258584, "layers.14": 0.48554688692092896, "layers.15": 0.47846813686192036, "layers.16": 0.47565693967044353, "layers.17": 0.49052806571125984, "layers.18": 0.48358715139329433, "layers.19": 0.4751079250127077, "layers.20": 0.4861371126025915, "layers.21": 0.49447803758084774, "layers.22": 0.4783070180565119, "layers.23": 0.4747470375150442, "layers.24": 0.49291690811514854, "layers.25": 0.4780469462275505, "layers.26": 0.4774180706590414, "layers.27": 0.49828714691102505, "layers.28": 0.47510815784335136, "layers.29": 0.4890069831162691, "layers.30": 0.47684810124337673, "layers.31": 0.49058697186410427, "norm": 0.07829093374311924, "output": 0.051341019570827484}, +] diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..8a32e3b --- /dev/null +++ b/setup.py @@ -0,0 +1,41 @@ +from setuptools import setup +import os + + +def read(rel_path: str) -> str: + here = os.path.abspath(os.path.dirname(__file__)) + with open(os.path.join(here, rel_path)) as fp: + return fp.read() + + +def get_version(rel_path: str) -> str: + for line in read(rel_path).splitlines(): + if line.startswith("__version__"): + delim = '"' if '"' in line else "'" + return 
line.split(delim)[1] + raise RuntimeError("Unable to find version string.") + + +setup( + name="fleece-worker", + version=get_version('fleece-worker/__init__.py'), + description="fleece-worker", + long_description=open('README.md').read(), + long_description_content_type='text/markdown', + author="stneng", + author_email="git@stneng.com", + url="https://github.com/CoLearn-Dev/fleece-worker", + packages=["fleece-worker"], + install_requires=[ + "numpy", + "torch", + "fire", + "sentencepiece", + "fastapi", + "uvicorn", + "requests", + "cryptography", + "fleece-network" + ], + python_requires=">=3.10", +) diff --git a/specs/fleece_layers.csv b/specs/fleece_layers.csv new file mode 100644 index 0000000..4391816 --- /dev/null +++ b/specs/fleece_layers.csv @@ -0,0 +1,9 @@ +Layer_name,From_model,Repetition,Order,Mem_model,Mem_inference_slope,Mem_inference_intercept +llama-2-7b-chat-slice/tok_embeddings,llama-2-7b-chat-slice,1,0,262144000,0,0 +llama-2-7b-chat-slice/layers,llama-2-7b-chat-slice,32,1,404750336,8388608,0 +llama-2-7b-chat-slice/norm,llama-2-7b-chat-slice,1,2,8866,0,0 +llama-2-7b-chat-slice/output,llama-2-7b-chat-slice,1,3,262144000,0,0 +llama-2-70b-chat-slice/tok_embeddings,llama-2-70b-chat-slice,1,0,524288000,0,0 +llama-2-70b-chat-slice/layers,llama-2-70b-chat-slice,80,1,1711276032,2097152,0 +llama-2-70b-chat-slice/norm,llama-2-70b-chat-slice,1,2,17058,0,0 +llama-2-70b-chat-slice/output,llama-2-70b-chat-slice,1,3,524288000,0,0 \ No newline at end of file
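
For reference, `send_heartbeat` in `fleece-worker/worker.py` above posts to `{controller_url}/worker_heartbeat` with `worker-id`/`api-token` headers and a single `info_update` field containing a JSON-encoded string. A sketch of the body shape follows; every concrete value is invented for illustration, and the exact contents of the `perf_computation`/`perf_network` entries depend on how those buffers are filled elsewhere in `worker.py`, outside this diff.

```
import json

# Illustrative heartbeat body matching the structure built in send_heartbeat();
# all concrete values below are made up for the example.
info_data = {
    "loaded_layers": json.dumps([
        "llama-2-7b-chat-slice/tok_embeddings",
        "llama-2-7b-chat-slice/layers.0",
    ]),
    "perf_computation": [
        # one entry per (layers, input_shape) key, latency averaged in send_heartbeat
        {"layers": "llama-2-7b-chat-slice/layers.0", "input_shape": "(1, 18)", "latency": 0.52},
    ],
    "perf_network": [
        # one entry per destination worker, latency averaged in send_heartbeat
        {"to_worker_id": "worker-abc", "latency": 12.5},
    ],
    # only present when CUDA is available; torch.cuda.mem_get_info()[0], in bytes
    "gpu_remaining_memory": 8 * 1024**3,
}
body = {"info_update": json.dumps(info_data)}
# The controller's response is expected to contain a "pubkeys" field,
# which the worker stores in self.tm_pubkeys.
```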
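
`sample_top_p` (added above) expects an already-normalized probability distribution, so callers typically apply a temperature-scaled softmax to the logits first. A minimal usage sketch, with the function copied verbatim so it is self-contained; the vocabulary size and sampling parameters are placeholders.

```
import torch

# sample_top_p copied from fleece-worker/worker.py so the sketch is self-contained.
def sample_top_p(probs, p):
    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    mask = probs_sum - probs_sort > p
    probs_sort[mask] = 0.0
    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
    next_token = torch.multinomial(probs_sort, num_samples=1)
    next_token = torch.gather(probs_idx, -1, next_token)
    return next_token

logits = torch.randn(2, 32000)                       # (bsz, vocab_size), placeholder values
temperature, top_p = 0.7, 0.9                        # placeholder sampling parameters
probs = torch.softmax(logits / temperature, dim=-1)  # normalize before nucleus sampling
next_token = sample_top_p(probs, top_p)              # (bsz, 1) sampled token ids
```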
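
Note that `results7b.json` and `results70b.json` end with a trailing comma before the closing `]`, so a strict JSON parser rejects them as committed. A small tolerant loader, assuming that trailing comma is the only deviation from valid JSON; the units of the recorded latencies are not stated in this diff.

```
import json
import re

def load_results(path):
    # Drop the trailing comma before the closing bracket, then parse normally.
    text = open(path).read()
    text = re.sub(r",\s*\]", "]", text)
    return json.loads(text)

results = load_results("results7b.json")
# Each entry maps a layer name (e.g. "layers.0", "norm", "output") to a measured latency.
print(len(results), results[0]["tok_embeddings"])
```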
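
The column names in `specs/fleece_layers.csv` suggest a per-layer memory model, but the diff does not define it. A hypothetical reading, under the explicit assumption that `Mem_model` is the static weight footprint in bytes and that inference memory grows linearly in some size variable `x` (e.g. cached sequence length) as `Mem_inference_slope * x + Mem_inference_intercept`:

```
import pandas as pd

# Assumption (not stated in this diff): per-layer memory is
#   Mem_model + Mem_inference_slope * x + Mem_inference_intercept
# where x is a size variable such as the cached sequence length.
df = pd.read_csv("specs/fleece_layers.csv")

def estimated_layer_bytes(row, x):
    return row["Mem_model"] + row["Mem_inference_slope"] * x + row["Mem_inference_intercept"]

for _, row in df[df["From_model"] == "llama-2-7b-chat-slice"].iterrows():
    per_layer = estimated_layer_bytes(row, x=512)
    print(row["Layer_name"], row["Repetition"], per_layer * row["Repetition"])
```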