From 3b2daa4d916e99a7cb6afae5c0f99c09c06e8f1f Mon Sep 17 00:00:00 2001 From: Andrew Valencik Date: Tue, 13 Sep 2022 17:54:56 -0400 Subject: [PATCH] Add GermanAnalyzerBuilder --- .../textmogrify/lucene/AnalyzerBuilder.scala | 29 ++++++++++++ .../lucene/AnalyzerBuilderSuite.scala | 46 +++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/lucene/src/main/scala/textmogrify/lucene/AnalyzerBuilder.scala b/lucene/src/main/scala/textmogrify/lucene/AnalyzerBuilder.scala index 358b9a0..541ea18 100644 --- a/lucene/src/main/scala/textmogrify/lucene/AnalyzerBuilder.scala +++ b/lucene/src/main/scala/textmogrify/lucene/AnalyzerBuilder.scala @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.en.PorterStemFilter import org.apache.lucene.analysis.es.SpanishLightStemFilter import org.apache.lucene.analysis.fr.FrenchLightStemFilter import org.apache.lucene.analysis.it.ItalianLightStemFilter +import org.apache.lucene.analysis.de.GermanLightStemFilter import org.apache.lucene.analysis.LowerCaseFilter import org.apache.lucene.analysis.Analyzer import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter @@ -104,6 +105,8 @@ object AnalyzerBuilder { new EnglishAnalyzerBuilder(Config.empty, false) def french: FrenchAnalyzerBuilder = new FrenchAnalyzerBuilder(Config.empty, false) + def german: GermanAnalyzerBuilder = + new GermanAnalyzerBuilder(Config.empty, false) def italian: ItalianAnalyzerBuilder = new ItalianAnalyzerBuilder(Config.empty, false) def spanish: SpanishAnalyzerBuilder = @@ -233,3 +236,29 @@ final class ItalianAnalyzerBuilder private[lucene] ( def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer] = mkFromStandardTokenizer(config)(ts => if (self.stemmer) new ItalianLightStemFilter(ts) else ts) } + +final class GermanAnalyzerBuilder private[lucene] ( + config: Config, + stemmer: Boolean, +) extends AnalyzerBuilder(config) { self => + type Builder = GermanAnalyzerBuilder + + private def copy( + newConfig: Config, + stemmer: Boolean = self.stemmer, + ): 
GermanAnalyzerBuilder = + new GermanAnalyzerBuilder(newConfig, stemmer) + + def withConfig(newConfig: Config): GermanAnalyzerBuilder = + copy(newConfig = newConfig) + + /** Adds the GermanLight Stemmer to the end of the analyzer pipeline and enables lowercasing. + * Stemming reduces words like `gerne` and `Theken` to their stems `gern` and `thek`. + * NOTE: Lowercasing is forced as it is required for the Lucene GermanLightStemFilter. + */ + def withGermanLightStemmer: GermanAnalyzerBuilder = + copy(config.copy(lowerCase = true), stemmer = true) + + def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer] = + mkFromStandardTokenizer(config)(ts => if (self.stemmer) new GermanLightStemFilter(ts) else ts) +} diff --git a/lucene/src/test/scala/textmogrify/lucene/AnalyzerBuilderSuite.scala b/lucene/src/test/scala/textmogrify/lucene/AnalyzerBuilderSuite.scala index bbff719..07e78c1 100644 --- a/lucene/src/test/scala/textmogrify/lucene/AnalyzerBuilderSuite.scala +++ b/lucene/src/test/scala/textmogrify/lucene/AnalyzerBuilderSuite.scala @@ -235,3 +235,49 @@ class ItalianAnalyzerBuilderSuite extends CatsEffectSuite { } } + +class GermanAnalyzerBuilderSuite extends CatsEffectSuite { + + val jalapenos = "Ich mag Jalapeños" + val jumping = "Neeko springt gerne auf Theken" + + test("german analyzer default should tokenize without any transformations") { + val analyzer = AnalyzerBuilder.german + val actual = analyzer.tokenizer[IO].use(f => f(jalapenos)) + assertIO(actual, Vector("Ich", "mag", "Jalapeños")) + } + + test("german analyzer withLowerCasing should lowercase all letters") { + val analyzer = AnalyzerBuilder.german.withLowerCasing + val actual = analyzer.tokenizer[IO].use(f => f(jalapenos)) + assertIO(actual, Vector("ich", "mag", "jalapeños")) + } + + test("german analyzer withASCIIFolding should fold 'ñ' to 'n'") { + val analyzer = AnalyzerBuilder.german.withASCIIFolding + val actual = analyzer.tokenizer[IO].use(f => f(jalapenos)) + assertIO(actual, Vector("Ich", "mag", 
"Jalapenos")) + } + + test("german analyzer withStopWords should filter them out") { + val analyzer = AnalyzerBuilder.german.withStopWords(Set("Ich")) + val actual = analyzer.tokenizer[IO].use(f => f(jalapenos)) + assertIO(actual, Vector("mag", "Jalapeños")) + } + + test("german analyzer withGermanLightStemmer should lowercase and stem words") { + val analyzer = AnalyzerBuilder.german.withGermanLightStemmer + val actual = analyzer.tokenizer[IO].use(f => f(jumping)) + assertIO(actual, Vector("neeko", "springt", "gern", "auf", "thek")) + } + + test("german analyzer builder settings can be chained") { + val analyzer = AnalyzerBuilder.german.withGermanLightStemmer + .withStopWords(Set("auf")) + .withASCIIFolding + .withLowerCasing + val actual = analyzer.tokenizer[IO].use(f => f(jumping)) + assertIO(actual, Vector("neeko", "springt", "gern", "thek")) + } + +}