Skip to content

Commit

Permalink
Merge pull request #17 from ahlec/preserve-spaces
Browse files Browse the repository at this point in the history
Retain ASCII space characters in original input string
  • Loading branch information
obynio authored Oct 4, 2022
2 parents abf28e0 + f158aa6 commit 6213c22
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 0 deletions.
12 changes: 12 additions & 0 deletions reading.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,16 @@
HTML_REPLACER = '▦'
NEWLINE_REPLACER = '▧'

# Unicode character substituted for every ASCII space (0x20) in the expression
# before it is passed to MeCab. MeCab separates kanji/reading nodes with ASCII
# spaces, so without this substitution we could not tell a node separator apart
# from a space character that was present in the original string.
# This applies only to the ASCII space (0x20) and not to any other whitespace
# character (e.g. the CJK ideographic space).
# The code point U+FFFF was chosen because it is unlikely ever to appear in
# ANY Anki card.
ASCII_SPACE_TOKEN = u"\U0000FFFF"

def htmlReplace(text):
pattern = r"(?:<[^<]+?>)"
matches = re.findall(pattern, text)
Expand Down Expand Up @@ -100,6 +110,7 @@ def ensureOpen(self):
def reading(self, expr, ignoreNumbers = True, useRubyTags = False):
self.ensureOpen()
matches, expr = escapeText(expr)
expr = expr.replace(" ", ASCII_SPACE_TOKEN)
self.mecab.stdin.write(expr.encode("utf-8", "ignore") + b'\n')
self.mecab.stdin.flush()
expr = self.mecab.stdout.readline().rstrip(b'\r\n').decode('utf-8', "ignore")
Expand Down Expand Up @@ -181,6 +192,7 @@ def reading(self, expr, ignoreNumbers = True, useRubyTags = False):
fin = ''.join(node.format(useRubyTags) for node in nodes)

# Finalize formatting
fin = fin.replace(ASCII_SPACE_TOKEN, ' ')
for match in matches:
fin = fin.replace(HTML_REPLACER, match, 1)

Expand Down
6 changes: 6 additions & 0 deletions test/test_reading.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,9 @@ def testKanaBetweenKanji(self):
self.assertEqual(reading.mecab.reading("書き込む"), "書[か]き込[こ]む")
self.assertEqual(reading.mecab.reading("走り抜く"), "走[はし]り抜[ぬ]く")
self.assertEqual(reading.mecab.reading("走り回る"), "走[はし]り回[まわ]る")

# Ensure that any ASCII space characters (0x20) present in the original
# string are preserved in the resulting string as well.
def testSpacesRetained(self):
    # Each pair is (input expression, expected reading output); the ASCII
    # spaces in the input must come back in the same positions.
    cases = (
        ("この文に 空白が あります", "この文[ぶん]に 空白[くうはく]が あります"),
        ("hello world", "hello world"),
    )
    # Checked in order; the first mismatch fails the test immediately.
    for expression, expected in cases:
        self.assertEqual(reading.mecab.reading(expression), expected)

0 comments on commit 6213c22

Please sign in to comment.