diff --git a/llm/server/server/data/processor.py b/llm/server/server/data/processor.py index efa798fde7..b27ae1bf2c 100644 --- a/llm/server/server/data/processor.py +++ b/llm/server/server/data/processor.py @@ -338,13 +338,37 @@ def get_pad_id(self): return self.tokenizer.eos_token return self.tokenizer.pad_token_id + def pad_batch_data(self, insts, pad_id=0, return_seq_len=False, return_array=True, pad_style="right"): + """Pad the instances to the max sequence length in batch.""" + if len(insts) == 0: + padded_insts = np.array([[]], dtype=np.int64) if return_array else [[]] + if return_seq_len: + seq_len = np.array([], dtype=np.int64) if return_array else [] + return padded_insts, seq_len + return padded_insts + + max_len = max(map(len, insts)) + if pad_style == "left": + padded_insts = [[pad_id] * (max_len - len(inst)) + list(inst) for inst in insts] + else: + padded_insts = [list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts] + if return_array: + padded_insts = np.array(padded_insts, dtype=np.int64).reshape([-1, max_len]) + + if return_seq_len: + seq_len = [len(inst) for inst in insts] + if return_array: + seq_len = np.array(seq_len, dtype=np.int64).reshape(-1, 1) + return padded_insts, seq_len + return padded_insts + def update_stop_seq(self, request): """ Update stop sequences from request. """ - stop_seqs = [[2], [100273]] + stop_seqs = [] for seq in request.get("stop_sequences", []): - if seq != self._get_eos_token_id(): + if seq != self.tokenizer.eos_token_id: stop_seqs.append(self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(seq))) request["stop_seqs"], request["stop_seqs_len"] = self.pad_batch_data( stop_seqs, diff --git a/llm/server/server/engine/infer.py b/llm/server/server/engine/infer.py index 3e448b4d3c..ac006bf4ae 100644 --- a/llm/server/server/engine/infer.py +++ b/llm/server/server/engine/infer.py @@ -251,10 +251,10 @@ def init_inputs(self): self.share_inputs['free_list_len'] = paddle.full( shape=[1], fill_value=self.free_list_len, dtype="int32") - self.share_inputs['stop_seqs_len'] = paddle.full(shape=[max_stop_seqs_num,], + self.share_inputs['stop_seqs_len'] = paddle.full(shape=[self.max_stop_seqs_num,], fill_value=0, dtype="int32") - self.share_inputs['stop_seqs'] = paddle.full(shape=[max_stop_seqs_num, stop_seqs_max_len], + self.share_inputs['stop_seqs'] = paddle.full(shape=[self.max_stop_seqs_num, self.stop_seqs_max_len], fill_value=-1, dtype="int64") @@ -312,11 +312,11 @@ def dy_input_preprocess(self, tasks): if "stop_seqs_len" in task: stop_seqs_num = len(task["stop_seqs_len"]) - for i in range(stop_seqs_num, max_stop_seqs_num): + for i in range(stop_seqs_num, self.max_stop_seqs_num): task["stop_seqs_len"].append(0) - share_inputs['stop_seqs_len'][:] = np.array( + self.share_inputs['stop_seqs_len'][:] = np.array( task["stop_seqs_len"], dtype="int32") - share_inputs['stop_seqs'][:stop_seqs_num, :len(task['stop_seqs'][0])] = np.array( + self.share_inputs['stop_seqs'][:stop_seqs_num, :len(task['stop_seqs'][0])] = np.array( task["stop_seqs"], dtype="int64") def step_cuda(self, seq_lens_this_time): """