Skip to content

Commit

Permalink
[itn] fix issue#237, digit + union("百", "千", "万") + digit + unit (#255)
Browse files Browse the repository at this point in the history
  • Loading branch information
weimeng23 authored Jun 26, 2024
1 parent 667cbe0 commit 91e51ca
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 5 deletions.
10 changes: 10 additions & 0 deletions itn/chinese/data/number/digit_zh.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
36 changes: 34 additions & 2 deletions itn/chinese/rules/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from tn.processor import Processor
from tn.utils import get_abs_path

from pynini import string_file, accep, cross
from pynini import string_file, accep, cross, union
from pynini.lib.pynutil import delete, insert, add_weight


Expand All @@ -36,6 +36,11 @@ def build_tagger(self):
get_abs_path('../itn/chinese/data/measure/units_zh.tsv'))
sign = string_file(
get_abs_path('../itn/chinese/data/number/sign.tsv')) # + -
digit = string_file(
get_abs_path('../itn/chinese/data/number/digit.tsv')) # 1 ~ 9
digit_zh = string_file(
get_abs_path('../itn/chinese/data/number/digit_zh.tsv')) # Chinese digits kept as Chinese characters, e.g. 一 => 一, 两 => 两
addzero = insert('0')
to = cross('到', '~') | cross('到百分之', '~')

units = add_weight(
Expand All @@ -55,8 +60,35 @@ def build_tagger(self):

# 十千米每小时 => 10km/h, 十一到一百千米每小时 => 11~100km/h
measure = number + (to + number).ques + units
tagger = insert('value: "') + (measure | percent) + insert('"')

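# XXX: special-case handling; this rule is built without consulting enable_standalone_number
# digit + union("百", "千", "万") + digit + unit,
# e.g. 一百一个 => 100 1个 (enable_0_to_9) or 100一个 (otherwise)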
unit_sp_case1 = [
'年',
'月',
'个月',
'周',
'天',
'位',
'次',
'个',
'顿',
]
if self.enable_0_to_9:
measure_sp = add_weight(
((digit + delete('百') + add_weight(addzero**2, 1.0)) |
(digit + delete('千') + add_weight(addzero**3, 1.0)) |
(digit + delete('万') + add_weight(addzero**4, 1.0))) +
insert(' ') + digit + union(*unit_sp_case1), -0.5)
else:
measure_sp = add_weight(
((digit + delete('百') + add_weight(addzero**2, 1.0)) |
(digit + delete('千') + add_weight(addzero**3, 1.0)) |
(digit + delete('万') + add_weight(addzero**4, 1.0))) +
digit_zh + union(*unit_sp_case1), -0.5)

tagger = insert('value: "') + (measure | measure_sp
| percent) + insert('"')
# 每小时十千米 => 10km/h, 每小时三十到三百一十一千米 => 30~311km/h
tagger |= (insert('denominator: "') + delete('每') + units +
insert('" numerator: "') + measure + insert('"'))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,6 @@
这是九十九九千 => 这是九十九九千
这是十二一千 => 这是十二一千
这是零百 => 这是零百
这是零千 => 这是零千
这是零千 => 这是零千
这是一百一个,一千两位,一万三天 => 这是100一个,1000两位,10000三天
这是九百九周,九千九月,九万九年 => 这是900九周,9000九月,90000九年
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@
这是九十九九千 => 这是九十九九千
这是十二一千 => 这是十二一千
这是零百 => 这是零百
这是零千 => 这是零千
这是零千 => 这是零千
这是一百一个,一千两位,一万三天 => 这是100 1个,1000 2位,10000 3天
这是九百九周,九千九月,九万九年 => 这是900 9周,9000 9月,90000 9年
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,6 @@
这是九十九九千 => 这是99 9000
这是十二一千 => 这是12 1000
这是零百 => 这是零百
这是零千 => 这是零千
这是零千 => 这是零千
这是一百一个,一千两位,一万三天 => 这是100一个,1000两位,10000三天
这是九百九周,九千九月,九万九年 => 这是900九周,9000九月,90000九年

0 comments on commit 91e51ca

Please sign in to comment.