-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathIHRNomDB_Rs.py
84 lines (67 loc) · 2.67 KB
/
IHRNomDB_Rs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
'''
Implementation of the R-score algorithm for the IHRNomDB dataset:
- R_s = N_distinct(s) × max_{i ∈ D}(N_i) + N_s
- Implemented by: Nguyen Duc Duy Anh (https://github.com/duyanh1909)
'''
from tqdm.notebook import tqdm
from collections import Counter
def print_intersection(val_data, train_data):
    """
    Print how much the character sets of two splits overlap.

    Two percentages are printed: first the share of distinct validation
    characters that also occur in the training data, then the share of
    distinct training characters that also occur in the validation data.
    Params:
        val_data: the validation strings.
        train_data: the training strings.
    """
    val_chars = set(''.join(val_data))
    train_chars = set(''.join(train_data))
    # The shared characters are the same set whichever side is the reference,
    # so one intersection serves both ratios.
    shared_count = len(val_chars & train_chars)
    val_covered_pct = shared_count / len(val_chars) * 100
    train_covered_pct = shared_count / len(train_chars) * 100
    print("Characters intersection train", val_covered_pct)
    print("Characters intersection val", train_covered_pct)
def frequence_in_D(dataset, char):
    """
    Return how often `char` occurs in the concatenated dataset (D).

    All strings are joined into one text before counting.
    Params:
        dataset (list): The strings making up the dataset.
        char (str): The character to look for.
    Returns:
        (int): Number of non-overlapping occurrences of `char`.
    """
    corpus = ''.join(dataset)
    occurrences = corpus.count(char)
    return occurrences
def max_N(dataset):
    """
    Find the largest distinct-character score over all sequences.

    The score of a sequence s is the sum, over every distinct character
    of s, of the number of times that character occurs in the dataset
    outside of s itself.
    Params:
        dataset (list): The character sequences (strings).
    Returns:
        (int): The maximum score, or 0 when the dataset is empty or no
            sequence shares a character with the rest of the dataset.
    """
    vocab = Counter(''.join(dataset))
    max_score = 0
    # Iterating the list directly lets the progress bar know the total.
    for s in tqdm(dataset):
        # Subtracting s's own occurrences from the global counts gives the
        # "rest of the dataset" counts without copying the whole vocabulary.
        score = sum(vocab[ch] - own for ch, own in Counter(s).items())
        if score > max_score:
            max_score = score
    return max_score
def calculate_r_scores(dataset):
    """
    Calculate the R-score of every sequence in the dataset.

    For each sequence s: r = n_distinct * max_score + n_total, where
    n_distinct sums, over the distinct characters of s, their occurrences
    in the other sequences; n_total is the same sum taken over every
    character position of s; and max_score is the value from max_N.
    Params:
        dataset (list): Items indexable as (key, sequence). item[1] is the
            character sequence (a string); item[0] is passed through
            unchanged.
    Returns:
        results (list): One [item[0], item[1], r] entry per input item,
            in input order.
    """
    sequences = [item[1] for item in dataset]
    max_score = max_N(sequences)
    vocab = Counter(''.join(sequences))
    results = []
    # Iterating the list directly lets the progress bar know the total.
    for item in tqdm(dataset):
        own_counts = Counter(item[1])
        # vocab[ch] - own is the count of ch outside this sequence.
        n_distinct = sum(vocab[ch] - own for ch, own in own_counts.items())
        # A character repeated `own` times contributes its outside count
        # once per position.
        n_total = sum((vocab[ch] - own) * own for ch, own in own_counts.items())
        r = n_distinct * max_score + n_total
        results.append([item[0], item[1], r])
    return results