From a617ab47a80e69531e5df3bfdf6223b47ddda075 Mon Sep 17 00:00:00 2001 From: Antonis Galanis Date: Wed, 23 Oct 2019 01:14:41 +0300 Subject: [PATCH 1/3] =?UTF-8?q?Mapped=20c=20to=20greek=20letter=20=CF=88?= =?UTF-8?q?=20in=20the=20greeklish=20generator?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../org/elasticsearch/index/analysis/GreeklishGenerator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/elasticsearch/index/analysis/GreeklishGenerator.java b/src/main/java/org/elasticsearch/index/analysis/GreeklishGenerator.java index d9da3a1..2e0f9c9 100644 --- a/src/main/java/org/elasticsearch/index/analysis/GreeklishGenerator.java +++ b/src/main/java/org/elasticsearch/index/analysis/GreeklishGenerator.java @@ -69,7 +69,7 @@ public class GreeklishGenerator { { "ι", "i" }, { "κ", "k" }, { "λ", "l" }, { "μ", "m" }, { "ν", "n" }, { "ξ", "ks", "x" }, { "ο", "o" }, { "π", "p" }, { "ρ", "r" }, { "σ", "s" }, { "τ", "t" }, { "υ", "y", "u", "i" }, - { "φ", "f", "ph" }, { "χ", "x", "h", "ch" }, { "ψ", "ps" }, + { "φ", "f", "ph" }, { "χ", "x", "h", "ch" }, { "ψ", "c", "ps" }, { "ω", "w", "o", "v" } }; /** From 1d3f8316e72bb214e2f065a5b8fd127f892705db Mon Sep 17 00:00:00 2001 From: Antonis Galanis Date: Thu, 24 Oct 2019 12:35:25 +0300 Subject: [PATCH 2/3] Added setting for special mapping --- .../index/analysis/GreeklishConverter.java | 4 ++-- .../index/analysis/GreeklishGenerator.java | 18 ++++++++++++++++-- .../index/analysis/GreeklishTokenFilter.java | 4 ++-- .../analysis/GreeklishTokenFilterFactory.java | 4 +++- 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/elasticsearch/index/analysis/GreeklishConverter.java b/src/main/java/org/elasticsearch/index/analysis/GreeklishConverter.java index fd81828..594ab37 100644 --- a/src/main/java/org/elasticsearch/index/analysis/GreeklishConverter.java +++ b/src/main/java/org/elasticsearch/index/analysis/GreeklishConverter.java @@ -59,7 +59,7 @@ public class GreeklishConverter { private final boolean generateGreekVariants; // Constructor - public GreeklishConverter(int maxExpansions, boolean generateGreekVariants) { + public GreeklishConverter(int maxExpansions, boolean generateGreekVariants, boolean useSpecialMapping) { // Initialize greekWords list this.greekWords = new ArrayList(); @@ -68,7 +68,7 @@ public GreeklishConverter(int maxExpansions, boolean generateGreekVariants) { this.reverseStemmer = new GreekReverseStemmer(); // Initialize greeklish generator - this.greeklishGenerator = new GreeklishGenerator(maxExpansions); + this.greeklishGenerator = new GreeklishGenerator(maxExpansions, useSpecialMapping); // Initialize setting for generating greek variants this.generateGreekVariants = generateGreekVariants; diff --git a/src/main/java/org/elasticsearch/index/analysis/GreeklishGenerator.java b/src/main/java/org/elasticsearch/index/analysis/GreeklishGenerator.java index 2e0f9c9..cf84335 100644 --- a/src/main/java/org/elasticsearch/index/analysis/GreeklishGenerator.java +++ b/src/main/java/org/elasticsearch/index/analysis/GreeklishGenerator.java @@ -69,9 +69,16 @@ public class GreeklishGenerator { { "ι", "i" }, { "κ", "k" }, { "λ", "l" }, { "μ", "m" }, { "ν", "n" }, { "ξ", "ks", "x" }, { "ο", "o" }, { "π", "p" }, { "ρ", "r" }, { "σ", "s" }, { "τ", "t" }, { "υ", "y", "u", "i" }, - { "φ", "f", "ph" }, { "χ", "x", "h", "ch" }, { "ψ", "c", "ps" }, + { "φ", "f", "ph" }, { "χ", "x", "h", "ch" }, { "ψ", "ps" }, { "ω", "w", "o", "v" } }; + /** + * The possible string conversions for special cases. + */ + private static final String[][] specialConvertStrings = new String[][] { + { "ψ", "c", "ps"} + }; + /** * The maximum greeklish expansions per greek token. */ @@ -102,7 +109,7 @@ public class GreeklishGenerator { private String initialToken; // Constructor - public GreeklishGenerator(int maxExpansions) { + public GreeklishGenerator(int maxExpansions, boolean useSpecialMapping) { this.maxExpansions = maxExpansions; @@ -120,6 +127,13 @@ public GreeklishGenerator(int maxExpansions) { conversions.put(convertString[0].charAt(0), Arrays.copyOfRange(convertString, 1, convertString.length)); } + + if(useSpecialMapping) { + for (String[] convertString : specialConvertStrings) { + conversions.put(convertString[0].charAt(0), + Arrays.copyOfRange(convertString, 1, convertString.length)); + } + } } /** diff --git a/src/main/java/org/elasticsearch/index/analysis/GreeklishTokenFilter.java b/src/main/java/org/elasticsearch/index/analysis/GreeklishTokenFilter.java index a105a4a..ec82ebf 100644 --- a/src/main/java/org/elasticsearch/index/analysis/GreeklishTokenFilter.java +++ b/src/main/java/org/elasticsearch/index/analysis/GreeklishTokenFilter.java @@ -49,9 +49,9 @@ public class GreeklishTokenFilter extends TokenFilter { private GreeklishConverter greeklishConverter; // Constructor - public GreeklishTokenFilter(TokenStream tokenStream, int maxExpansions, boolean generateGreekVariants) { + public GreeklishTokenFilter(TokenStream tokenStream, int maxExpansions, boolean generateGreekVariants, boolean useSpecialMapping) { super(tokenStream); - this.greeklishConverter = new GreeklishConverter(maxExpansions, generateGreekVariants); + this.greeklishConverter = new GreeklishConverter(maxExpansions, generateGreekVariants, useSpecialMapping); } @Override diff --git a/src/main/java/org/elasticsearch/index/analysis/GreeklishTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/GreeklishTokenFilterFactory.java index e62472a..c0bb354 100644 --- a/src/main/java/org/elasticsearch/index/analysis/GreeklishTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/GreeklishTokenFilterFactory.java @@ -11,6 +11,7 @@ public class GreeklishTokenFilterFactory extends AbstractTokenFilterFactory { private final int maxExpansions; private final boolean generateGreekVariants; + private final boolean useSpecialMapping; @Inject public GreeklishTokenFilterFactory(IndexSettings indexSettings, @@ -21,11 +22,12 @@ public GreeklishTokenFilterFactory(IndexSettings indexSettings, super(indexSettings, name, settings); this.maxExpansions = settings.getAsInt("max_expansions", 20); this.generateGreekVariants = settings.getAsBoolean("greek_variants", true); + this.useSpecialMapping = settings.getAsBoolean("use_special_mapping", false); } @Override public TokenStream create(TokenStream tokenStream) { - return new GreeklishTokenFilter(tokenStream, maxExpansions, generateGreekVariants); + return new GreeklishTokenFilter(tokenStream, maxExpansions, generateGreekVariants, useSpecialMapping); } } From 41b132031e48a29c1e58bdceb2eae4e66ea8a95f Mon Sep 17 00:00:00 2001 From: Antonis Galanis Date: Thu, 24 Oct 2019 12:35:40 +0300 Subject: [PATCH 3/3] Updated tests for special mapping --- .../analysis/GreeklishConverterTest.java | 49 +++++++++++++++++-- .../analysis/GreeklishGeneratorTest.java | 48 +++++++++++++++++- 2 files changed, 92 insertions(+), 5 deletions(-) diff --git a/src/test/java/org/elasticsearch/index/analysis/GreeklishConverterTest.java b/src/test/java/org/elasticsearch/index/analysis/GreeklishConverterTest.java index 93336bb..71aa2f4 100644 --- a/src/test/java/org/elasticsearch/index/analysis/GreeklishConverterTest.java +++ b/src/test/java/org/elasticsearch/index/analysis/GreeklishConverterTest.java @@ -14,6 +14,9 @@ public class GreeklishConverterTest { private static final boolean GENERATE_GREEK_VARIANTS = true; + private static final boolean USE_SPECIAL_MAPPING_ON = true; + private static final boolean USE_SPECIAL_MAPPING_OFF = false; + private GreeklishConverter converter; /** @@ -23,6 +26,12 @@ public class GreeklishConverterTest { private static final String[] greekWords = { "αυτοκινητο", "ομπρελα", "ξεσκεπαστοσ"}; + /** + * a sample of greek words to generate their greeklish + * counterparts. + */ + private static final String[] greekWordsSpecial = { "ωιψηυ"}; + /** * the greeklish counterparts that should be generated from the * greek words. @@ -35,6 +44,19 @@ public class GreeklishConverterTest { {"kseskepastos", "xeskepastos", "kseskepastou", "xeskepastwn", "kseskepastoi"} }; + /** + * the greeklish counterparts that should be generated from the + * greek words. + */ + private static final String[][] generatedGreeklishWordsSpecial = { + { + "oichu", "wichi", "wichu", "vipsiy", + "oipsiy", "wipsiy", "viciy", "oiciy", + "wiciy", "vipshy", "oipshy", "wipshy", + "vichy", "oichy", "wichy" + } + }; + /** * these words should not be processed by the converter. */ @@ -47,7 +69,7 @@ public class GreeklishConverterTest { @BeforeClass public void setUp() { - this.converter = new GreeklishConverter(MAX_EXPANSIONS, GENERATE_GREEK_VARIANTS); + this.converter = new GreeklishConverter(MAX_EXPANSIONS, GENERATE_GREEK_VARIANTS, USE_SPECIAL_MAPPING_OFF); } @BeforeMethod @@ -85,11 +107,32 @@ public void testGreekTokenConversionForValidWords() { } } + @Test + public void testGreekTokenConversionForValidWordsSpecial() { + int newMaxExpansions = 20; + converter = new GreeklishConverter(newMaxExpansions, GENERATE_GREEK_VARIANTS, USE_SPECIAL_MAPPING_ON); + for (int i = 0; i < greekWordsSpecial.length; i++) { + greeklishWords = converter.convert(greekWordsSpecial[i].toCharArray(), + greekWordsSpecial[i].length()); + + populateConvertedStringsList(); + + Assert.assertFalse(greeklishWords.isEmpty(), + "Greeklish words should be generated"); + + for (String greeklishWord : generatedGreeklishWordsSpecial[i]) { + Assert.assertTrue(convertedGreeklishStrings + .contains(greeklishWord), + "It should contain greeklish word: " + greeklishWord); + } + } + } + @Test public void testMaxGreeklishExpansions() { int newMaxExpansions = 2; boolean generateGreekVariants = false; - converter = new GreeklishConverter(newMaxExpansions, generateGreekVariants); + converter = new GreeklishConverter(newMaxExpansions, generateGreekVariants, USE_SPECIAL_MAPPING_OFF); greeklishWords = converter.convert(greekWords[0].toCharArray(), greekWords[0].length()); @@ -114,7 +157,7 @@ public void testMaxGreeklishExpansions() { public void testGreekVariantsGeneration() { int newMaxExpansions = 1; boolean generateGreekVariants = false; - converter = new GreeklishConverter(newMaxExpansions, generateGreekVariants); + converter = new GreeklishConverter(newMaxExpansions, generateGreekVariants, USE_SPECIAL_MAPPING_OFF); greeklishWords = converter.convert(greekWords[0].toCharArray(), greekWords[0].length()); diff --git a/src/test/java/org/elasticsearch/index/analysis/GreeklishGeneratorTest.java b/src/test/java/org/elasticsearch/index/analysis/GreeklishGeneratorTest.java index f644ca6..750a298 100644 --- a/src/test/java/org/elasticsearch/index/analysis/GreeklishGeneratorTest.java +++ b/src/test/java/org/elasticsearch/index/analysis/GreeklishGeneratorTest.java @@ -11,6 +11,8 @@ public class GreeklishGeneratorTest { private static final int MAX_EXPANSIONS = 10; + private static final boolean USE_SPECIAL_MAPPING_ON = true; + private static final boolean USE_SPECIAL_MAPPING_OFF = false; /** * a sample of greek words to generate their greeklish @@ -19,6 +21,12 @@ public class GreeklishGeneratorTest { private static final String[] greekWords = { "αυτοκινητο", "ομπρελα", "ξεσκεπαστοσ", }; + /** + * a special sample of greek words to generate their greeklish + * counterparts. + */ + private static final String[] greekWordsSpecial = { "ωιψηυ" }; + /** * the greeklish counterparts that should be generated from the * greek words. @@ -29,6 +37,17 @@ public class GreeklishGeneratorTest { "omprela", "obrela", "kseskepastos", "xeskepastos" }; + /** + * the special greeklish counterparts that should be generated from the + * greek words. + */ + private static final String[] generatedGreeklishWordsSpecial = { + "oichu", "wichi", "wichu", "vipsiy", + "oipsiy", "wipsiy", "viciy", "oiciy", + "wiciy", "vipshy", "oipshy", "wipshy", + "vichy", "oichy", "wichy" + }; + private GreeklishGenerator generator; private List inputGreekList = new ArrayList(); @@ -39,7 +58,7 @@ public class GreeklishGeneratorTest { @BeforeClass public void populateInputGreekList() { - this.generator = new GreeklishGenerator(MAX_EXPANSIONS); + this.generator = new GreeklishGenerator(MAX_EXPANSIONS, USE_SPECIAL_MAPPING_OFF); for (String word : greekWords) { inputGreekList.add(word); @@ -72,7 +91,7 @@ public void testGreekTokenConversionForValidWords() { @Test public void testMaxGreeklishWordGenerations() { int newMaxExpansions = 2; - generator = new GreeklishGenerator(newMaxExpansions); + generator = new GreeklishGenerator(newMaxExpansions, USE_SPECIAL_MAPPING_OFF); greeklishWords = generator.generateGreeklishWords(inputGreekList); @@ -82,6 +101,31 @@ public void testMaxGreeklishWordGenerations() { } + @Test + public void testGreekTokenConversionForValidWordsSpecial() { + inputGreekList.clear(); + for (String word : greekWordsSpecial) { + inputGreekList.add(word); + } + + for (int i = 0; i < greekWordsSpecial.length; i++) { + int newMaxExpansions = 20; + generator = new GreeklishGenerator(newMaxExpansions, USE_SPECIAL_MAPPING_ON); + greeklishWords = generator.generateGreeklishWords(inputGreekList); + + populateConvertedStringsList(); + + Assert.assertFalse(greeklishWords.isEmpty(), + "Greeklish words should be generated"); + for (String greeklishWord : generatedGreeklishWordsSpecial) { + Assert.assertTrue( + convertedGreeklishStrings.contains(greeklishWord), + "It should contain the greeklish word: " + + greeklishWord); + } + } + } + private final void populateConvertedStringsList() { for (StringBuilder word : greeklishWords) { convertedGreeklishStrings.add(word.toString());