Skip to content

Commit

Permalink
Use FrozenDictionary for language models (#5)
Browse files (browse the repository at this point in the history)
  • Loading branch information
russcam authored Nov 20, 2024
1 parent 44b3ec6 commit fda9413
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 49 deletions.
7 changes: 2 additions & 5 deletions src/Lingua/Internal/DictionaryExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@ namespace Lingua.Internal;
internal static class DictionaryExtensions
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void IncrementCounter<TKey>(this Dictionary<TKey, int> dictionary, TKey key, int increment = 1) where TKey : notnull
{
dictionary.TryGetValue(key, out var count);
dictionary[key] = count + increment;
}
public static void IncrementCounter<TKey>(this Dictionary<TKey, int> dictionary, TKey key, int increment = 1) where TKey : notnull =>
dictionary[key] = dictionary.TryGetValue(key, out var count) ? count + increment : increment;
}
5 changes: 3 additions & 2 deletions src/Lingua/Internal/LanguageModel.cs
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
using System.Buffers;
using System.Collections.Frozen;
using System.Runtime.CompilerServices;
using System.Text.Json;

namespace Lingua.Internal;

internal static class LanguageModel
{
public static Dictionary<string, double> FromJson(Stream stream)
public static FrozenDictionary<string, double> FromJson(Stream stream)
{
using var memoryStream = new MemoryStream();
stream.CopyTo(memoryStream);
Expand Down Expand Up @@ -48,7 +49,7 @@ public static Dictionary<string, double> FromJson(Stream stream)
}

frequencies.TrimExcess();
return frequencies;
return frequencies.ToFrozenDictionary();
}
}

Expand Down
26 changes: 15 additions & 11 deletions src/Lingua/LanguageDetector.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System.Collections.Concurrent;
using System.Collections.Frozen;
using System.IO.Compression;
using System.Text.RegularExpressions;
using Lingua.Internal;
Expand Down Expand Up @@ -73,11 +74,11 @@ public sealed partial class LanguageDetector
["Éé"] = [Catalan, Czech, French, Hungarian, Icelandic, Irish, Italian, Portuguese, Slovak, Spanish, Vietnamese, Yoruba],
};

internal static readonly ConcurrentDictionary<Language, Lazy<Dictionary<string, double>>> UnigramLanguageModels = new();
internal static readonly ConcurrentDictionary<Language, Lazy<Dictionary<string, double>>> BigramLanguageModels = new();
internal static readonly ConcurrentDictionary<Language, Lazy<Dictionary<string, double>>> TrigramLanguageModels = new();
internal static readonly ConcurrentDictionary<Language, Lazy<Dictionary<string, double>>> QuadrigramLanguageModels = new();
internal static readonly ConcurrentDictionary<Language, Lazy<Dictionary<string, double>>> FivegramLanguageModels = new();
internal static readonly ConcurrentDictionary<Language, Lazy<FrozenDictionary<string, double>>> UnigramLanguageModels = new();
internal static readonly ConcurrentDictionary<Language, Lazy<FrozenDictionary<string, double>>> BigramLanguageModels = new();
internal static readonly ConcurrentDictionary<Language, Lazy<FrozenDictionary<string, double>>> TrigramLanguageModels = new();
internal static readonly ConcurrentDictionary<Language, Lazy<FrozenDictionary<string, double>>> QuadrigramLanguageModels = new();
internal static readonly ConcurrentDictionary<Language, Lazy<FrozenDictionary<string, double>>> FivegramLanguageModels = new();

private readonly HashSet<Language> _languages;
private readonly double _minimumRelativeDistance;
Expand All @@ -87,6 +88,8 @@ public sealed partial class LanguageDetector

private static readonly int[] LowAccuracyRange = [3];
private static readonly int[] HighAccuracyRange = [1, 2, 3, 4, 5];
private static readonly Lazy<FrozenDictionary<string, double>> Empty = new(() =>
Enumerable.Empty<KeyValuePair<string, double>>().ToFrozenDictionary());

internal LanguageDetector(
HashSet<Language> languages,
Expand Down Expand Up @@ -366,11 +369,12 @@ internal static double LookupNgramProbability(Language language, ReadOnlySpan<ch
var lookup = model.GetAlternateLookup<ReadOnlySpan<char>>();
return lookup.TryGetValue(ngram, out var result) ? result : 0;
#else
return model.GetValueOrDefault(ngram.ToString(), 0);
// ReSharper disable once CanSimplifyDictionaryTryGetValueWithGetValueOrDefault - skip the null check
return model.TryGetValue(ngram.ToString(), out var result) ? result : 0;
#endif
}

private static Dictionary<string, double> LoadLanguageModel(Language language, int ngramLength)
private static FrozenDictionary<string, double> LoadLanguageModel(Language language, int ngramLength)
{
var languageModels = ngramLength switch
{
Expand Down Expand Up @@ -419,11 +423,11 @@ private void PreloadLanguageModels()
});
}

private static Dictionary<string, double> LoadLanguageModels(ConcurrentDictionary<Language, Lazy<Dictionary<string, double>>> languageModels, Language language, int ngramLength) =>
private static FrozenDictionary<string, double> LoadLanguageModels(ConcurrentDictionary<Language, Lazy<FrozenDictionary<string, double>>> languageModels, Language language, int ngramLength) =>
languageModels.GetOrAdd(language, static (l, nl) =>
new Lazy<Dictionary<string, double>>(() => ReadLanguageModel(l, nl)), ngramLength).Value;
new Lazy<FrozenDictionary<string, double>>(() => ReadLanguageModel(l, nl)), ngramLength).Value;

private static Dictionary<string, double> ReadLanguageModel(Language language, int ngramLength)
private static FrozenDictionary<string, double> ReadLanguageModel(Language language, int ngramLength)
{
var isoCode = language.IsoCode6391().ToString().ToLowerInvariant();
var nGramName = Ngram.GetNameByLength(ngramLength);
Expand All @@ -438,7 +442,7 @@ private static Dictionary<string, double> ReadLanguageModel(Language language, i
catch (FileNotFoundException)
{
// there may not be a model for a given ngram/language
return new Dictionary<string, double>();
return Empty.Value;
}
}

Expand Down
63 changes: 32 additions & 31 deletions tests/Lingua.Tests/LanguageDetectorTests.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
using System.Collections.Frozen;
using FluentAssertions;
using Lingua.Internal;
using Xunit;
Expand All @@ -7,7 +8,7 @@ namespace Lingua.Tests;

public class LanguageDetectorTests : IDisposable
{
private static readonly Dictionary<string, double> UnigramLanguageModelForEnglish = new()
private static readonly FrozenDictionary<string, double> UnigramLanguageModelForEnglish = new Dictionary<string, double>
{
["a"] = 0.01,
["l"] = 0.02,
Expand All @@ -16,9 +17,9 @@ public class LanguageDetectorTests : IDisposable
["r"] = 0.05,
// unknown unigram in model
["w"] = 0
};
}.ToFrozenDictionary();

private static readonly Dictionary<string, double> BigramLanguageModelForEnglish = new()
private static readonly FrozenDictionary<string, double> BigramLanguageModelForEnglish = new Dictionary<string, double>
{
["al"] = 0.11,
["lt"] = 0.12,
Expand All @@ -27,9 +28,9 @@ public class LanguageDetectorTests : IDisposable
// unknown bigrams in model
["aq"] = 0,
["wx"] = 0
};
}.ToFrozenDictionary();

private static readonly Dictionary<string, double> TrigramLanguageModelForEnglish = new()
private static readonly FrozenDictionary<string, double> TrigramLanguageModelForEnglish = new Dictionary<string, double>
{
["alt"] = 0.19,
["lte"] = 0.2,
Expand All @@ -38,25 +39,25 @@ public class LanguageDetectorTests : IDisposable
["aqu"] = 0,
["tez"] = 0,
["wxy"] = 0
};
}.ToFrozenDictionary();

private static readonly Dictionary<string, double> QuadrigramLanguageModelForEnglish = new()
private static readonly FrozenDictionary<string, double> QuadrigramLanguageModelForEnglish = new Dictionary<string, double>
{
["alte"] = 0.25,
["lter"] = 0.26,
// unknown quadrigrams in model
["aqua"] = 0,
["wxyz"] = 0
};
}.ToFrozenDictionary();

private static readonly Dictionary<string, double> FivegramLanguageModelForEnglish = new()
private static readonly FrozenDictionary<string, double> FivegramLanguageModelForEnglish = new Dictionary<string, double>
{
["alter"] = 0.29,
// unknown fivegrams in model
["aquas"] = 0
};
}.ToFrozenDictionary();

private static readonly Dictionary<string, double> UnigramLanguageModelForGerman = new()
private static readonly FrozenDictionary<string, double> UnigramLanguageModelForGerman = new Dictionary<string, double>
{
["a"] = 0.06,
["l"] = 0.07,
Expand All @@ -65,39 +66,39 @@ public class LanguageDetectorTests : IDisposable
["r"] = 0.1,
// unknown unigrams in model
["w"] = 0
};
}.ToFrozenDictionary();

private static readonly Dictionary<string, double> BigramLanguageModelForGerman = new()
private static readonly FrozenDictionary<string, double> BigramLanguageModelForGerman = new Dictionary<string, double>
{
["al"] = 0.15,
["lt"] = 0.16,
["te"] = 0.17,
["er"] = 0.18,
// unknown bigrams in model
["wx"] = 0
};
}.ToFrozenDictionary();

private static readonly Dictionary<string, double> TrigramLanguageModelForGerman = new()
private static readonly FrozenDictionary<string, double> TrigramLanguageModelForGerman = new Dictionary<string, double>
{
["alt"] = 0.22,
["lte"] = 0.23,
["ter"] = 0.24,
// unknown trigrams in model
["wxy"] = 0
};
}.ToFrozenDictionary();

private static readonly Dictionary<string, double> QuadrigramLanguageModelForGerman = new()
private static readonly FrozenDictionary<string, double> QuadrigramLanguageModelForGerman = new Dictionary<string, double>
{
["alte"] = 0.27,
["lter"] = 0.28,
// unknown quadrigrams in model
["wxyz"] = 0
};
}.ToFrozenDictionary();

private static readonly Dictionary<string, double> FivegramLanguageModelForGerman = new()
private static readonly FrozenDictionary<string, double> FivegramLanguageModelForGerman = new Dictionary<string, double>
{
["alter"] = 0.3
};
}.ToFrozenDictionary();

private readonly LanguageDetector _detectorForEnglishAndGerman = new(
[English, German],
Expand Down Expand Up @@ -748,7 +749,7 @@ public void UnknownLanguageReturnedWhenNoNgramProbabilitiesAvailable() =>
[Fact]
public void ZeroConfidenceValuesReturnedWhenNoNgramProbabilitiesAvailable() =>
_detectorForEnglishAndGerman.ComputeLanguageConfidenceValues("проарплап").Should().BeEquivalentTo(
new Dictionary<Language, double>()
new Dictionary<Language, double>
{
[English] = 0,
[German] = 0
Expand Down Expand Up @@ -888,20 +889,20 @@ private void AssertThatAllLanguageModelsAreLoaded()

private void AddLanguageModelsToDetector()
{
LanguageDetector.UnigramLanguageModels[English] = new Lazy<Dictionary<string, double>>(UnigramLanguageModelForEnglish);
LanguageDetector.UnigramLanguageModels[German] = new Lazy<Dictionary<string, double>>(UnigramLanguageModelForGerman);
LanguageDetector.UnigramLanguageModels[English] = new Lazy<FrozenDictionary<string, double>>(UnigramLanguageModelForEnglish);
LanguageDetector.UnigramLanguageModels[German] = new Lazy<FrozenDictionary<string, double>>(UnigramLanguageModelForGerman);

LanguageDetector.BigramLanguageModels[English] = new Lazy<Dictionary<string, double>>(BigramLanguageModelForEnglish);
LanguageDetector.BigramLanguageModels[German] = new Lazy<Dictionary<string, double>>(BigramLanguageModelForGerman);
LanguageDetector.BigramLanguageModels[English] = new Lazy<FrozenDictionary<string, double>>(BigramLanguageModelForEnglish);
LanguageDetector.BigramLanguageModels[German] = new Lazy<FrozenDictionary<string, double>>(BigramLanguageModelForGerman);

LanguageDetector.TrigramLanguageModels[English] = new Lazy<Dictionary<string, double>>(TrigramLanguageModelForEnglish);
LanguageDetector.TrigramLanguageModels[German] = new Lazy<Dictionary<string, double>>(TrigramLanguageModelForGerman);
LanguageDetector.TrigramLanguageModels[English] = new Lazy<FrozenDictionary<string, double>>(TrigramLanguageModelForEnglish);
LanguageDetector.TrigramLanguageModels[German] = new Lazy<FrozenDictionary<string, double>>(TrigramLanguageModelForGerman);

LanguageDetector.QuadrigramLanguageModels[English] = new Lazy<Dictionary<string, double>>(QuadrigramLanguageModelForEnglish);
LanguageDetector.QuadrigramLanguageModels[German] = new Lazy<Dictionary<string, double>>(QuadrigramLanguageModelForGerman);
LanguageDetector.QuadrigramLanguageModels[English] = new Lazy<FrozenDictionary<string, double>>(QuadrigramLanguageModelForEnglish);
LanguageDetector.QuadrigramLanguageModels[German] = new Lazy<FrozenDictionary<string, double>>(QuadrigramLanguageModelForGerman);

LanguageDetector.FivegramLanguageModels[English] = new Lazy<Dictionary<string, double>>(FivegramLanguageModelForEnglish);
LanguageDetector.FivegramLanguageModels[German] = new Lazy<Dictionary<string, double>>(FivegramLanguageModelForGerman);
LanguageDetector.FivegramLanguageModels[English] = new Lazy<FrozenDictionary<string, double>>(FivegramLanguageModelForEnglish);
LanguageDetector.FivegramLanguageModels[German] = new Lazy<FrozenDictionary<string, double>>(FivegramLanguageModelForGerman);
}

private void RemoveLanguageModelsFromDetector()
Expand Down

0 comments on commit fda9413

Please sign in to comment.