"""
@file   : run_cosent.py
@author : xiaolu
@email : [email protected]
@time : 2022-01-07
"""
import re
import os
import random
import torch
import numpy as np
from tqdm import tqdm
from config import set_args
from model import Model
from torch.utils.data import DataLoader
from utils import l2_normalize, compute_corrcoef
from transformers.models.bert import BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from data_helper import CustomDataset, collate_fn, pad_to_maxlen, load_data, load_test_data


def set_seed():
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(args.seed)


def get_sent_id_tensor(s_list):
    input_ids, attention_mask, token_type_ids = [], [], []
    max_len = max([len(_) + 2 for _ in s_list])  # rough: character count plus the two special tokens; not ideal, to be improved later
for s in s_list:
inputs = tokenizer.encode_plus(text=s, text_pair=None, add_special_tokens=True, return_token_type_ids=True)
input_ids.append(pad_to_maxlen(inputs['input_ids'], max_len=max_len))
attention_mask.append(pad_to_maxlen(inputs['attention_mask'], max_len=max_len))
token_type_ids.append(pad_to_maxlen(inputs['token_type_ids'], max_len=max_len))
all_input_ids = torch.tensor(input_ids, dtype=torch.long)
all_input_mask = torch.tensor(attention_mask, dtype=torch.long)
all_segment_ids = torch.tensor(token_type_ids, dtype=torch.long)
return all_input_ids, all_input_mask, all_segment_ids
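
# Shape sketch (illustrative only, assuming a standard Chinese BERT tokenizer): for a pair such as
# ['花呗怎么还款', '花呗如何还款'], get_sent_id_tensor returns three LongTensors of shape
# (2, max_len), where max_len covers the longer sentence plus the [CLS]/[SEP] specials;
# attention_mask is 1 on real tokens and 0 on padding, and token_type_ids is all zeros because
# each sentence is encoded on its own (text_pair=None).
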
def evaluate():
sent1, sent2, label = load_test_data(args.test_data)
all_a_vecs = []
all_b_vecs = []
all_labels = []
model.eval()
for s1, s2, lab in tqdm(zip(sent1, sent2, label)):
input_ids, input_mask, segment_ids = get_sent_id_tensor([s1, s2])
lab = torch.tensor([lab], dtype=torch.float)
if torch.cuda.is_available():
input_ids, input_mask, segment_ids = input_ids.cuda(), input_mask.cuda(), segment_ids.cuda()
lab = lab.cuda()
with torch.no_grad():
output = model(input_ids=input_ids, attention_mask=input_mask, encoder_type='fist-last-avg')
all_a_vecs.append(output[0].cpu().numpy())
all_b_vecs.append(output[1].cpu().numpy())
all_labels.extend(lab.cpu().numpy())
all_a_vecs = np.array(all_a_vecs)
all_b_vecs = np.array(all_b_vecs)
all_labels = np.array(all_labels)
a_vecs = l2_normalize(all_a_vecs)
b_vecs = l2_normalize(all_b_vecs)
sims = (a_vecs * b_vecs).sum(axis=1)
corrcoef = compute_corrcoef(all_labels, sims)
return corrcoef
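
# Reading of the evaluation above (compute_corrcoef is assumed to be the rank-correlation helper
# from utils.py, Spearman in the usual CoSENT evaluation): since a_vecs and b_vecs are
# L2-normalized, the row-wise product-sum is each pair's cosine similarity, and corrcoef measures
# how well those cosines rank against the gold labels, so higher is better.
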
def calc_loss(y_true, y_pred):
    # 1. Take the true labels: one label per sentence pair, e.g. tensor([1, 0, 1])
    y_true = y_true[::2]
    # 2. L2-normalize the sentence vectors, so the element-wise products below directly give cosine values
norms = (y_pred ** 2).sum(axis=1, keepdims=True) ** 0.5
# y_pred = y_pred / torch.clip(norms, 1e-8, torch.inf)
y_pred = y_pred / norms
    # 3. Dot the even-indexed (sentence A) and odd-indexed (sentence B) vectors: one cosine per pair, scaled by 20
    y_pred = torch.sum(y_pred[::2] * y_pred[1::2], dim=1) * 20
    # 4. Pairwise differences of the scaled cosines: entry (i, j) is cosine_i - cosine_j
    y_pred = y_pred[:, None] - y_pred[None, :]
    # Keep only the positions where label_i < label_j, i.e. pair i is a negative and pair j is a positive
    y_true = y_true[:, None] < y_true[None, :]
y_true = y_true.float()
y_pred = y_pred - (1 - y_true) * 1e12
y_pred = y_pred.view(-1)
if torch.cuda.is_available():
        y_pred = torch.cat((torch.tensor([0]).float().cuda(), y_pred), dim=0)  # prepend a 0: e^0 = 1 supplies the "+1" inside the log
    else:
        y_pred = torch.cat((torch.tensor([0]).float(), y_pred), dim=0)  # prepend a 0: e^0 = 1 supplies the "+1" inside the log
return torch.logsumexp(y_pred, dim=0)
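
# In formula form, calc_loss implements the CoSENT ranking loss with scale lambda = 20:
#     loss = log(1 + sum_{(i, j): label_i < label_j} exp(20 * (cos_i - cos_j)))
# i.e. the sum runs over every (negative pair i, positive pair j) combination, pushing each
# negative-pair cosine below every positive-pair cosine.
# Tiny worked example (by hand, not executed): with per-pair cosines [0.9, 0.2] and labels [1, 0],
# the only kept difference is 20 * (0.2 - 0.9) = -14, so loss = log(1 + e**-14) ≈ 8.3e-7;
# an already well-ordered batch contributes almost nothing.
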
if __name__ == '__main__':
args = set_args()
set_seed()
os.makedirs(args.output_dir, exist_ok=True)
tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_path)
    # Load the training set
train_sentence, train_label = load_data(args.train_data)
train_dataset = CustomDataset(sentence=train_sentence, label=train_label, tokenizer=tokenizer)
train_dataloader = DataLoader(dataset=train_dataset, shuffle=False, batch_size=args.train_batch_size,
collate_fn=collate_fn, num_workers=1)
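
    # Batch layout assumption (collate_fn is defined in data_helper.py and not shown here): each
    # batch is expected to interleave the two sentences of a pair as consecutive rows, so rows 0/1
    # are the first pair, rows 2/3 the second, with the pair label repeated for both rows.
    # calc_loss relies on exactly this layout via y_true[::2] and y_pred[::2] * y_pred[1::2].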
total_steps = len(train_dataloader) * args.num_train_epochs
num_train_optimization_steps = int(
len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
model = Model()
if torch.cuda.is_available():
model.cuda()
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=0.05 * total_steps,
num_training_steps=total_steps)
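
    # Scheduling note (as written above): warmup is 5% of total_steps, and total_steps counts
    # dataloader batches, while num_train_optimization_steps (printed below) counts optimizer
    # updates after gradient accumulation; the two only coincide when gradient_accumulation_steps == 1.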
print("***** Running training *****")
print(" Num examples = %d" % len(train_dataset))
print(" Batch size = %d" % args.train_batch_size)
print(" Num steps = %d" % num_train_optimization_steps)
for epoch in range(args.num_train_epochs):
model.train()
epoch_loss = 0
for step, batch in enumerate(train_dataloader):
input_ids, input_mask, segment_ids, label_ids = batch
if torch.cuda.is_available():
input_ids, input_mask, segment_ids = input_ids.cuda(), input_mask.cuda(), segment_ids.cuda()
label_ids = label_ids.cuda()
output = model(input_ids=input_ids, attention_mask=input_mask, encoder_type='fist-last-avg')
loss = calc_loss(label_ids, output)
loss.backward()
print("当前轮次:{}, 正在迭代:{}/{}, Loss:{:10f}".format(epoch, step, len(train_dataloader), loss)) # 在进度条前面定义一段文字
# nn.utils.clip_grad_norm_(model.parameters(), max_norm=20, norm_type=2)
epoch_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
optimizer.step()
scheduler.step()
optimizer.zero_grad()
corr = evaluate()
s = 'Epoch:{} | corr: {:10f}'.format(epoch, corr)
logs_path = os.path.join(args.output_dir, 'logs.txt')
with open(logs_path, 'a+') as f:
s += '\n'
f.write(s)
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
output_model_file = os.path.join(args.output_dir, "base_model_epoch_{}.bin".format(epoch))
torch.save(model_to_save.state_dict(), output_model_file)
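
# Reloading a saved checkpoint for inference (a minimal sketch; the epoch number and the
# argument-free Model() constructor are assumptions taken from this script, not a tested API):
#   model = Model()
#   state_dict = torch.load(os.path.join(args.output_dir, 'base_model_epoch_0.bin'), map_location='cpu')
#   model.load_state_dict(state_dict)
#   model.eval()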