Skip to content

Commit

Permalink
Merge pull request #101 from Soreepeong/fix/parser
Browse files Browse the repository at this point in the history
MacroString parser fixes (number, SeString-to-SeString, invalid sequence)
  • Loading branch information
NotAdam authored Nov 24, 2024
2 parents db7deb5 + a904505 commit ff613b8
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 30 deletions.
60 changes: 53 additions & 7 deletions src/Lumina.Tests/SeStringBuilderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -299,13 +299,13 @@ public void ComplicatedTest()
}

[Sheet( "Addon" )]
public readonly struct Addon( ExcelPage page, uint offset, uint row ) : IExcelRow<Addon>
public readonly struct Addon( ExcelPage page, uint offset, uint row ) : IExcelRow< Addon >
{
public uint RowId => row;

public ReadOnlySeString Text => page.ReadString( offset, offset );

static Addon IExcelRow<Addon>.Create( ExcelPage page, uint offset, uint row ) =>
static Addon IExcelRow< Addon >.Create( ExcelPage page, uint offset, uint row ) =>
new( page, offset, row );
}

Expand Down Expand Up @@ -458,7 +458,7 @@ public unsafe void SpanViewNullTerminationTest()
var span = test.GetViewAsSpan();
Assert.True( span.SequenceEqual( expected ) );
fixed( byte* p = span )
Assert.Equal( 0 , p[ span.Length ]);
Assert.Equal( 0, p[ span.Length ] );
}

[Fact]
Expand Down Expand Up @@ -605,14 +605,60 @@ public void FriendlyErrorMessage()
}
}

/// <summary>
/// Parses a macro string whose first tag is backslash-escaped, feeds the resulting SeString back into the parser with
/// <c>UtfEnumeratorFlags.Utf8SeString</c> set, and expects the outcome to equal parsing the same text without the escape.
/// </summary>
[Fact]
public void ParseNestedSeStringPayloadTest()
{
    // First pass: the backslash escapes the opening '<' of the first tag.
    var firstPass = ReadOnlySeString.FromMacroString( "ABC\\<italic(1)>DEF<italic(0)>" );

    // Second pass: the already-built SeString is the input this time.
    var secondPass = ReadOnlySeString.FromMacroString(
        firstPass,
        new() { CharEnumerationFlags = UtfEnumeratorFlags.Utf8SeString } );

    var expected = ReadOnlySeString.FromMacroString( "ABC<italic(1)>DEF<italic(0)>" );
    Assert.Equal( expected, secondPass );
}

/// <summary>
/// Checks how <c>ReadOnlySeString.FromMacroString</c> treats a byte that can never appear in UTF-8 (<c>0xFF</c>):
/// an <see cref="EncoderFallbackException"/> is expected under <c>UtfEnumeratorFlags.ThrowOnFirstError</c>, and the byte
/// is expected to be dropped from the output under <c>UtfEnumeratorFlags.IgnoreErrors</c>.
/// </summary>
[Fact]
public void ParseIgnoreInvalidUtf8SequenceTest()
{
    var invalidSequence = new byte[]
    {
        (byte) 'A',
        (byte) 'B',
        0xFF, // 0xFF is never valid in UTF-8 bytes
        (byte) 'C',
        (byte) 'D',
    };

    Assert.Throws< EncoderFallbackException >( () =>
        ReadOnlySeString.FromMacroString( invalidSequence, new() { CharEnumerationFlags = UtfEnumeratorFlags.ThrowOnFirstError } ) );

    // xUnit's Assert.Equal takes (expected, actual); keeping that order makes a failure report label the two sides correctly.
    Assert.Equal(
        "ABCD"u8.ToArray(),
        ReadOnlySeString.FromMacroString( invalidSequence, new() { CharEnumerationFlags = UtfEnumeratorFlags.IgnoreErrors } ).Data.ToArray() );
}

/// <summary>
/// Checks numeric literals inside a macro expression: each literal is wrapped in <c>&lt;italic(...)&gt;</c>, parsed, and the
/// unsigned integer read back from the first payload's expression is compared with the expected value.
/// </summary>
[Fact]
public void ParseNumberTest()
{
    // Parses "<italic({numberString})>" and asserts that its expression evaluates to `expected`.
    static void Test( string numberString, uint expected )
    {
        var e = ReadOnlySeString.FromMacroString( $"<italic({numberString})>" ).AsSpan().GetEnumerator();
        Assert.True( e.MoveNext() );
        Assert.Equal( MacroCode.Italic, e.Current.MacroCode );
        Assert.True( e.Current.TryGetExpression( out var expr ) );
        Assert.True( expr.TryGetUInt( out var parsed ) );

        // xUnit's Assert.Equal takes (expected, actual); the swapped order would mislabel the values on failure.
        Assert.Equal( expected, parsed );
    }

    // Leading zeros followed by a digit separator must still be read as decimal.
    Test( "0_00'0012'345", 12345 );
    // "0o" prefix: octal 5151 is 2665.
    Test( "0o000_5151", 2665 );
    // "0b" prefix: binary, with ' as the digit separator.
    Test( "0b0000'1111'0000'1111", 0x0F0F );
    // "0x" prefix: hexadecimal, with _ as the digit separator.
    Test( "0x1234_5678", 0x12345678 );
    // "0d" prefix: expected to be read as decimal, with a separator directly after the prefix.
    Test( "0d_5555", 5555 );
}

[RequiresGameInstallationFact]
public void AllSheetsTextColumnCodec()
{
var gameData = RequiresGameInstallationFact.CreateGameData();
var ssb = new SeStringBuilder();
foreach( var sheetName in gameData.Excel.SheetNames )
{
var header = gameData.GetFile<ExcelHeaderFile>( $"exd/{sheetName}.exh" );
var header = gameData.GetFile< ExcelHeaderFile >( $"exd/{sheetName}.exh" );
if( header?.Header.Variant == ExcelVariant.Subrows )
continue;
var languages = header?.Languages ?? [Language.None];
Expand All @@ -625,7 +671,7 @@ public void AllSheetsTextColumnCodec()
{
foreach( var columnOffset in stringColumns )
{
var test1 = row.ReadString(columnOffset);
var test1 = row.ReadString( columnOffset );
if( test1.Data.Span.IndexOf( "payload:"u8 ) != -1 )
throw new( $"Unsupported payload at {sheetName}#{row.RowId}; {test1}" );

Expand All @@ -647,14 +693,14 @@ public void AllSheetsTextColumnCodec()
}

[Sheet]
public readonly struct RawRow( ExcelPage page, uint offset, uint row ) : IExcelRow<RawRow>
public readonly struct RawRow( ExcelPage page, uint offset, uint row ) : IExcelRow< RawRow >
{
public uint RowId => row;

public ReadOnlySeString ReadString( ushort off ) =>
page.ReadString( off + offset, offset );

static RawRow IExcelRow<RawRow>.Create( ExcelPage page, uint offset, uint row ) =>
static RawRow IExcelRow< RawRow >.Create( ExcelPage page, uint offset, uint row ) =>
new( page, offset, row );
}
}
78 changes: 55 additions & 23 deletions src/Lumina/Text/Parse/MacroStringParser.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System;
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Text;
using Lumina.Text.Expressions;
Expand All @@ -9,9 +10,11 @@ namespace Lumina.Text.Parse;

internal readonly ref struct MacroStringParser
{
// Map from ascii code to supposed number.
// -1 = invalid, -2 = ignore.
// See the static constructor for initialization.
/// <summary>Map from ascii code to supposed number. See the static constructor for initialization.</summary>
/// <value><ul>
/// <li>-1: invalid</li>
/// <li>-2: ignore</li>
/// </ul></value>
private static readonly sbyte[] Digits;

private readonly ReadOnlySpan< byte > _macroString;
Expand All @@ -21,14 +24,16 @@ internal readonly ref struct MacroStringParser
/// <summary>Builds the <see cref="Digits"/> lookup table, which holds one entry for each of the 0x80 ASCII codes.</summary>
/// <remarks>Every entry starts as -1; <c>_</c> and <c>'</c> are then set to -2, and <c>0</c>-<c>9</c>, <c>A</c>-<c>F</c> and
/// <c>a</c>-<c>f</c> are set to the values 0 through 15.</remarks>
static MacroStringParser()
{
Digits = new sbyte[0x80];
// Start with every ASCII code marked as -1.
Digits.AsSpan().Fill(-1);
Digits['_'] = Digits['\''] = -2;
// '0'..'9' map to 0..9.
for (var i = '0'; i <= '9'; i++)
Digits[i] = (sbyte)(i - '0');
// 'A'..'F' and 'a'..'f' both map to 10..15, so either letter case yields the same value.
for (var i = 'A'; i <= 'F'; i++)
Digits[i] = (sbyte)(10 + (i - 'A'));
for (var i = 'a'; i <= 'f'; i++)
Digits[i] = (sbyte)(10 + (i - 'a'));
// NOTE(review): the statements below repeat the ones above, differing only in whitespace; they write the same values again.
Digits.AsSpan().Fill( -1 );
// Programming languages such as C# will ignore underscores(_) between digits, to let users write 0x0123_4567_89AB_CDEF for ease of reading.
// C++ will use single quotes(') instead of underscores.
Digits[ '_' ] = Digits[ '\'' ] = -2;
for( var i = '0'; i <= '9'; i++ )
Digits[ i ] = (sbyte) ( i - '0' );
for( var i = 'A'; i <= 'F'; i++ )
Digits[ i ] = (sbyte) ( 10 + ( i - 'A' ) );
for( var i = 'a'; i <= 'f'; i++ )
Digits[ i ] = (sbyte) ( 10 + ( i - 'a' ) );
}

internal MacroStringParser( ReadOnlySpan< byte > macroString, SeStringBuilder builder, MacroStringParseOptions parseOptions )
Expand All @@ -39,25 +44,44 @@ internal MacroStringParser( ReadOnlySpan< byte > macroString, SeStringBuilder bu
}

/// <summary>Parses the macro string.</summary>
/// <returns>The builder.</returns>
/// <param name="offset">Offset in <see cref="_macroString"/> to parse from.</param>
/// <param name="stopOnCharRequiringEscape">Whether to stop parsing when an unescaped character that requires escaping inside the string representation of
/// a SeString payload, such as <c>(</c> or <c>,</c>, is encountered. When <c>false</c>, such a character is appended as-is, so that it can be used without
/// escaping when the string being parsed is at the topmost level (not a part of a SeString string expression).</param>
/// <param name="extraTerminators">If any of the bytes in this span is encountered while parsing, it will be treated as the end of the current string being
/// parsed. Used to terminate parsing string SeString expressions, so that it can exclude <c>)</c> from the expression and stop when parsing
/// <c>&lt;string(asdf)&gt;</c>, instead of producing <c>asdf)</c> as the parsed string SeString expression and fail with invalid syntax.</param>
/// <returns>One past the final offset in <see cref="_macroString"/> that got parsed.</returns>
public int ParseMacroStringAndAppend( int offset, bool stopOnCharRequiringEscape, ReadOnlySpan< byte > extraTerminators )
{
var beginOffset = offset;
while( new UtfEnumerator( _macroString[ offset.. ], _parseOptions.CharEnumerationFlags ).TryPeekNext( out var s, out _ ) )
while( new UtfEnumerator( _macroString[ offset.. ], _parseOptions.CharEnumerationFlags ).TryPeekNext( out var c, out _ ) )
{
if( s.IsSeStringPayload )
Debug.Assert(
( _parseOptions.CharEnumerationFlags & UtfEnumeratorFlags.IgnoreErrors ) != 0 || c.ByteOffset == 0,
$"Offset of the first item retrieved UtfEnumerator should have been 0, unless {nameof( UtfEnumeratorFlags.IgnoreErrors )} is set." );

offset += c.ByteOffset;

if( c.IsSeStringPayload )
{
_builder.Append( new ReadOnlySeStringSpan( _macroString.Slice( offset + s.ByteOffset, s.ByteLength ) ) );
Debug.Assert( ( _parseOptions.CharEnumerationFlags & UtfEnumeratorFlags.Utf8SeString ) != 0,
$"SeString Payload should have not been yielded unless {nameof( UtfEnumeratorFlags.Utf8SeString )} is set." );

_builder.Append( new ReadOnlySeStringSpan( _macroString.Slice( offset, c.ByteLength ) ) );
offset += c.ByteLength;
continue;
}

switch( s.Value.UIntValue )
switch( c.Value.UIntValue )
{
case '\\':
// Backslashes will *always* produce the following character as-is.
// No special escape sequences such as \n and \t are defined for SeStrings.
offset += ParseMacroStringTextAndAppend( offset, extraTerminators );
break;

case <= byte.MaxValue when extraTerminators.Contains( (byte) s.Value.UIntValue ):
case <= byte.MaxValue when extraTerminators.Contains( (byte) c.Value.UIntValue ):
return offset - beginOffset;

case '<' when _parseOptions.ExceptionMode is MacroStringParseExceptionMode.Throw:
Expand All @@ -71,7 +95,7 @@ public int ParseMacroStringAndAppend( int offset, bool stopOnCharRequiringEscape
}
catch( MacroStringParseException e )
{
var byteLength = Math.Max( s.ByteLength, e.ByteOffset - offset );
var byteLength = Math.Max( c.ByteLength, e.ByteOffset - offset );
var sliceUntilError = _macroString.Slice( offset, byteLength );
_builder.Append( new UtfEnumerator( sliceUntilError, _parseOptions.CharEnumerationFlags ) );
if( _parseOptions.ExceptionMode == MacroStringParseExceptionMode.EmbedError )
Expand All @@ -81,13 +105,13 @@ public int ParseMacroStringAndAppend( int offset, bool stopOnCharRequiringEscape

break;

case <= byte.MaxValue when CharRequiresEscapeInSeString( s.Value.UIntValue ):
case <= byte.MaxValue when CharRequiresEscapeInSeString( c.Value.UIntValue ):
if( stopOnCharRequiringEscape )
return offset - beginOffset;

var v = unchecked( (byte) s.Value.UIntValue );
var v = unchecked( (byte) c.Value.UIntValue );
_builder.Append( MemoryMarshal.CreateReadOnlySpan( ref v, 1 ) );
offset += s.ByteLength;
offset += c.ByteLength;
break;

default:
Expand All @@ -104,6 +128,9 @@ private int ParseMacroStringTextAndAppend( int offset, ReadOnlySpan< byte > extr
var nextIsEscaped = false;
foreach( var c in new UtfEnumerator( _macroString[ offset.. ], _parseOptions.CharEnumerationFlags ) )
{
if( c.IsSeStringPayload )
return c.ByteOffset;

switch( c.Value.UIntValue )
{
case var _ when nextIsEscaped:
Expand Down Expand Up @@ -355,7 +382,12 @@ static bool TryParseInt( ReadOnlySpan< byte > data, out int result )
} while( !data.IsEmpty );

var maxPerDigit = 10u;
if( data.Length > 2 && data[ 0 ] == '0' )

// If the number string begins with 0 followed by non-decimal digits, try parsing it as non-decimal.
if( data.Length > 2
&& data[ 0 ] == '0'
&& data[ 1 ] is not ((byte) '_' or (byte) '\'')
&& data[ 1 ] is not (>= (byte) '0' and <= (byte) '9') )
{
maxPerDigit = (char) data[ 1 ] switch
{
Expand Down Expand Up @@ -460,7 +492,7 @@ private MacroCode ParseMacroCode( ref int offset )

macroCodeName = macroCodeName[ ..macroCodeNameLength ];

foreach( var n in MacroCodeExtensions.GetDefinedMacroCodes())
foreach( var n in MacroCodeExtensions.GetDefinedMacroCodes() )
{
if( macroCodeName.SequenceEqual( n.GetEncodeName() ) )
{
Expand Down

0 comments on commit ff613b8

Please sign in to comment.