From e218c3eda77838fdea9f5a9823b0e7c775a811b9 Mon Sep 17 00:00:00 2001
From: ChingC
Date: Wed, 1 Jan 2025 21:13:30 +0800
Subject: [PATCH] fix: unicode mask converter

---
 include/parser5/parser5.hpp | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/include/parser5/parser5.hpp b/include/parser5/parser5.hpp
index 4961519..7ab7d77 100644
--- a/include/parser5/parser5.hpp
+++ b/include/parser5/parser5.hpp
@@ -314,34 +314,42 @@ inline bool parser5::unicode::isHexDigit(u8char ch)
 template
 inline uint64_t parser5::unicode::toUnicode(u8char ch)
 {
-    std::stack coded;
     if (ch == 0) {
         return ch;
     }
+
+    std::stack coded;
     while (ch > 0) {
         coded.push(ch & 0xff);
-        ch = ch >> 8;
+        ch >>= 8;
     }
+
     u8char charcode = 0;
     uint8_t t = coded.top();
     coded.pop();
     if (t < 128) {
         return t;
     }
-    uint8_t high_bit_mask = (1 << 6) - 1;
+
+    uint8_t high_bit_mask = 0b00111111;
     uint8_t high_bit_shift = 0;
     int total_bits = 0;
     const int other_bits = 6;
+
     while ((t & 0xC0) == 0xC0) {
         t <<= 1;
         t &= 0xff;
-        total_bits += 6;
+        total_bits += other_bits;
         high_bit_mask >>= 1;
         high_bit_shift++;
-        charcode <<= other_bits;
-        charcode |= coded.top() & ((1 << other_bits) - 1);
-        coded.pop();
+
+        if (!coded.empty()) {
+            charcode <<= other_bits;
+            charcode |= coded.top() & ((1 << other_bits) - 1);
+            coded.pop();
+        }
     }
+
     charcode |= static_cast((t >> high_bit_shift) & high_bit_mask) << total_bits;
     return charcode;
 }