[FOray-commit] SF.net SVN: foray:[12045] trunk/foray/foray-orthography/src

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 12045
          http://sourceforge.net/p/foray/code/12045
Author:   victormote
Date:     2021-11-12 12:17:39 +0000 (Fri, 12 Nov 2021)
Log Message:
-----------
Rename OrthographyConfig4a to Orthography4a, for consistency and clarity.

Modified Paths:
--------------
    trunk/foray/foray-orthography/src/main/java/org/foray/orthography/HyphenationConsumer4a.java
    trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyServer4a.java
    trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/ConfigParser.java
    trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java
    trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/WordChecker.java
    trunk/foray/foray-orthography/src/test/java/org/foray/orthography/HyphenationConsumer4aTests.java

Added Paths:
-----------
    trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Orthography4a.java

Removed Paths:
-------------
    trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyConfig4a.java

Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/HyphenationConsumer4a.java
===================================================================

--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/HyphenationConsumer4a.java	2021-11-12 12:04:06 UTC (rev 12044)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/HyphenationConsumer4a.java	2021-11-12 12:17:39 UTC (rev 12045)
@@ -73,7 +73,7 @@
             final ParaConfig paraConfig) {
         final ParaBranch4a wordSequence = new ParaBranch4a(paraConfig);
         final CharSequence sequence = characters.subSequence(startIndex, startIndex + length);
-        final OrthographyConfig4a orthographyConfig = this.server.getOrthography(paraConfig.getWritingSystem());
+        final Orthography4a orthographyConfig = this.server.getOrthography(paraConfig.getWritingSystem());
         if (orthographyConfig == null) {
             this.logger.error("Orthography not configured for: " + paraConfig.getWritingSystem());
         }

Copied: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Orthography4a.java (from rev 12044, trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyConfig4a.java)
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Orthography4a.java	                        (rev 0)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Orthography4a.java	2021-11-12 12:17:39 UTC (rev 12045)
@@ -0,0 +1,402 @@
+/*
+ * Copyright 2019 The FOray Project.
+ *      http://www.foray.org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * This work is in part derived from the following work(s), used with the
+ * permission of the licensor:
+ *      Apache FOP, licensed by the Apache Software Foundation
+ *
+ */
+
+/*
+ * $LastChangedRevision$
+ * $LastChangedDate$
+ * $LastChangedBy$
+ */
+
+package org.foray.orthography;
+
+import org.foray.common.primitive.BooleanUtils;
+import org.foray.common.primitive.CharSequenceUtils;
+import org.foray.orthography.wrapper.CapitalizedWord;
+import org.foray.orthography.wrapper.ExactWord;
+import org.foray.orthography.wrapper.UppercaseWord;
+
+import org.axsl.orthography.Dictionary;
+import org.axsl.orthography.Orthography;
+import org.axsl.orthography.PartOfSpeech;
+import org.axsl.orthography.Word;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * The resources (dictionaries, word wrappers, hyphenation patterns, etc.) that should be used by a given orthography.
+ */
+public class Orthography4a implements Orthography {
+
+    /** The list of ids to lists of regex patterns, which, for this orthography, signal a valid word if matched. */
+    private List<String> matchRuleListIds = new ArrayList<String>();
+
+    /** The list of ids to lists of derivative rules, which, for this orthography, can compute derivative words. */
+    private List<String> derivativeRuleListIds = new ArrayList<String>();
+
+    /** The dictionary for this orthography.*/
+    private DictionaryResource dictionaryResource;
+
+    /** The hyphenation patterns for this orthography. */
+    private HyphenationPatternsResource hyphenationPatternsResource;
+
+    /** The list of word wrapper factories for this orthography. */
+    private List<WordWrapperFactory<?>> wordWrapperFactories = new ArrayList<WordWrapperFactory<?>>();
+
+    /** The word breaker for this orthography. */
+    private WordBreaker wordBreaker;
+
+    /** The parent orthography server. */
+    private OrthographyServer4a server;
+
+    /* TODO: Following orthography-specific config needs to be moved to XML or subclass. */
+    /** Character delimiting a compound word. NB: This variable may be orthography specific, and may therefore need to
+     * be moved to the orthography configuration. However, we have found no evidence yet for that need. */
+    private char compoundWordMarker = '-';
+
+    /** Regex pattern used to break compound words into their components. */
+    private Pattern compoundWordBreaker = Pattern.compile(Character.toString(compoundWordMarker));
+
+    /**
+     * Constructor.
+     * @param server The parent orthography server.
+     */
+    public Orthography4a(final OrthographyServer4a server) {
+        this.server = server;
+    }
+
+    /**
+     * Returns the list of match rule Ids.
+     * @return The list of match rule Ids.
+     */
+    public List<String> getMatchRuleListIds() {
+        return this.matchRuleListIds;
+    }
+
+    /**
+     * Adds a match rule list Id to this configuration.
+     * @param matchRuleListId The new match rule list Id.
+     */
+    public void registerMatchRuleListId(final String matchRuleListId) {
+        if (this.matchRuleListIds.contains(matchRuleListId)) {
+            throw new IllegalArgumentException(
+                    "Match Rule List already configured for this orthography: " + matchRuleListId);
+        }
+        this.matchRuleListIds.add(matchRuleListId);
+    }
+
+    /**
+     * Returns the list of derivative rule Ids.
+     * @return The list of derivative rule Ids.
+     */
+    public List<String> getDerivativeRuleListIds() {
+        return this.derivativeRuleListIds;
+    }
+
+    /**
+     * Adds a derivative rule list Id to this configuration.
+     * @param derivativeRuleListId The new derivative rule list Id.
+     */
+    public void registerDerivativeRuleListId(final String derivativeRuleListId) {
+        if (this.derivativeRuleListIds.contains(derivativeRuleListId)) {
+            throw new IllegalArgumentException(
+                    "Derivative Rule List already configured for this orthography: " + derivativeRuleListId);
+        }
+        this.derivativeRuleListIds.add(derivativeRuleListId);
+    }
+
+    /**
+     * Returns the dictionary resource.
+     * @return The dictionary resource.
+     */
+    public DictionaryResource getDictionaryResource() {
+        return this.dictionaryResource;
+    }
+
+    /**
+     * Sets the dictionary resource.
+     * @param dictionaryResource The dictionaryResource to set.
+     */
+    public void setDictionaryResource(final DictionaryResource dictionaryResource) {
+        this.dictionaryResource = dictionaryResource;
+    }
+
+    /**
+     * Returns the hyphenation patterns resource.
+     * @return The hyphenation patterns resource
+     */
+    public HyphenationPatternsResource getHyphenationPatternsResource() {
+        return this.hyphenationPatternsResource;
+    }
+
+    /**
+     * Sets the hyphenation patterns resource.
+     * @param hyphenationPatternsResource The hyphenation patterns resource to set.
+     */
+    public void setHyphenationPatternsResource(final HyphenationPatternsResource hyphenationPatternsResource) {
+        this.hyphenationPatternsResource = hyphenationPatternsResource;
+    }
+
+    /**
+     * Returns the list of word wrapper factories.
+     * @return The list of word wrapper factories.
+     */
+    public List<WordWrapperFactory<?>> getWordWrapperFactories() {
+        return this.wordWrapperFactories;
+    }
+
+    /**
+     * Sets the list of word wrapper factories.
+     * @param wordWrapperFactories The word wrapper factories to set.
+     */
+    public void setWordWrapperFactories(final List<WordWrapperFactory<?>> wordWrapperFactories) {
+        this.wordWrapperFactories = wordWrapperFactories;
+    }
+
+    /**
+     * Returns the word breaker.
+     * @return The word breaker.
+     */
+    public WordBreaker getWordBreaker() {
+        return this.wordBreaker;
+    }
+
+    /**
+     * Sets the word breaker.
+     * @param wordBreaker The word breaker to set.
+     */
+    public void setWordBreaker(final WordBreaker wordBreaker) {
+        this.wordBreaker = wordBreaker;
+    }
+
+    /**
+     * Returns the dictionary.
+     * @return The dictionary, or null if one is not configured or cannot be obtained.
+     */
+    public SegmentDictionary getDictionary() {
+        if (this.dictionaryResource == null) {
+            return null;
+        } else {
+            return this.dictionaryResource.getResource();
+        }
+    }
+
+    /**
+     * Returns the hyphenation patterns.
+     * @return The hyphenation patterns, or null if no hyphenation patterns resource is configured.
+     */
+    public PatternTree getHyphenationPatterns() {
+        if (this.hyphenationPatternsResource == null) {
+            return null;
+        } else {
+            return this.hyphenationPatternsResource.getResource();
+        }
+    }
+
+    /**
+     * Searches the configured word wrapper factories for a match that would create a word derived from a dictionary
+     * word.
+     * @param chars The word to test.
+     * @return A word wrapper if {@code chars} matches a word wrapper factory, or null if not.
+     */
+    public WordWrapper findDerivatives(final CharSequence chars) {
+        /* TODO: For now, this returns the first item that matches. This may need to be expanded to allow nested wrapped
+         * words. */
+        WordWrapper word = null;
+        final Dictionary dictionary = getDictionary();
+        for (int index = 0; index < this.wordWrapperFactories.size(); index ++) {
+            final WordWrapperFactory<?> factory = this.wordWrapperFactories.get(index);
+            word = factory.makeInstance(chars, dictionary);
+            if (word != null) {
+                return word;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Indicates whether a given word is found in the match rules for this orthography, i.e. rules looking for
+     * non-dictionary items such as numbers, currency, etc.
+     * @param wordChars The word to be tested.
+     * @return True if and only if {@code wordChars} matches at least one match rule for this orthography.
+     */
+    public boolean foundInMatchRules(final CharSequence wordChars) {
+        for (int idIndex = 0; idIndex < getMatchRuleListIds().size(); idIndex ++) {
+            final String ruleListId = matchRuleListIds.get(idIndex);
+            final List<Pattern> validWordPatterns = server.getMatchRules(ruleListId);
+            for (int index = 0; index < validWordPatterns.size(); index ++) {
+                final Pattern pattern = validWordPatterns.get(index);
+                final Matcher matcher = pattern.matcher(wordChars);
+                if (matcher.matches()) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    @Override
+    public Word getWord(final CharSequence wordChars, final PartOfSpeech pos,
+            final List<Dictionary> adhocDictionaries) {
+        // TODO Auto-generated method stub
+        return null;
+    }
+
+    @Override
+    public boolean isValidWord(final CharSequence wordChars, final PartOfSpeech pos,
+            final List<Dictionary> adhocDictionaries) {
+        if (wordChars.length() < 1) {
+            return false;
+        }
+
+        /* 1. Check exact matches in adhoc dictionaries. */
+        if (adhocDictionaries != null) {
+            for (int index = 0; index < adhocDictionaries.size(); index ++) {
+                final Dictionary adhocDictionary = adhocDictionaries.get(index);
+                if (adhocDictionary.getWord(wordChars, 0) != null) {
+                    return true;
+                }
+            }
+        }
+
+        /* 2. Check exact matches in standard dictionaries for the orthography. */
+        final Dictionary orthoDictionary = getDictionary();
+        if (orthoDictionary != null
+                && orthoDictionary.getWord(wordChars, 0) != null) {
+            return true;
+        }
+
+        /* 3. Check the match rules. */
+        if (foundInMatchRules(wordChars)) {
+            return true;
+        }
+
+        /* 4. Check for compound word. */
+        if (CharSequenceUtils.contains(wordChars, '-')) {
+            final String[] components = this.compoundWordBreaker.split(wordChars);
+            final boolean[] componentsValid = new boolean[components.length];
+            for (int index = 0; index < components.length; index ++) {
+                componentsValid[index] = isValidWord(components[index], pos, adhocDictionaries);
+            }
+            if (BooleanUtils.allTrue(componentsValid)) {
+                return true;
+            }
+        }
+
+        /* 5. Check derivative matches in adhoc dictionaries. */
+        if (adhocDictionaries != null) {
+            for (int dictIndex = 0; dictIndex < adhocDictionaries.size(); dictIndex ++) {
+                final Dictionary adhocDictionary = adhocDictionaries.get(dictIndex);
+                if (isDerivativeFound(adhocDictionary, wordChars)) {
+                    return true;
+                }
+            }
+        }
+
+        /* 6. Check derivative matches in standard dictionaries for the orthography. */
+        if (orthoDictionary != null) {
+            if (isDerivativeFound(orthoDictionary, wordChars)) {
+                return true;
+            }
+        }
+
+        /* Not found in any dictionary. */
+        /* If the first character is uppercase, convert to lowercase and try again. Discussion: For English at least, we
+         * do not want the opposite effect, i.e. to have words starting with lowercase get their first char converted
+         * to uppercase. If the word is in the dictionary as a proper noun, we should treat a failure to capitalize it
+         * as a spelling error. Also, we do not want to generally convert the entire word to lowercase, as capital
+         * letters in the middle of the word should normally be treated as a spelling error. For exceptions to this
+         * last rule, users should enter the oddly-capitalized word into a dictionary in that form.
+         * TODO: This capability should be included in the orthography configuration instead of being hard-coded
+         * here. */
+        if (Character.isUpperCase(wordChars.charAt(0))) {
+            final StringBuilder builder = new StringBuilder(wordChars);
+            builder.setCharAt(0, Character.toLowerCase(wordChars.charAt(0)));
+            return isValidWord(builder, pos, adhocDictionaries);
+        }
+
+        return false;
+    }
+
+    private boolean isDerivativeFound(final Dictionary dictionary, final CharSequence wordChars) {
+        for (int listIndex = 0; listIndex < this.derivativeRuleListIds.size(); listIndex ++) {
+            final String ruleListKey = this.derivativeRuleListIds.get(listIndex);
+            final List<DerivativePattern> patternList = this.server.getDerivativePatterns(ruleListKey);
+            for (int patternIndex = 0; patternIndex < patternList.size(); patternIndex ++) {
+                final DerivativePattern pattern = patternList.get(patternIndex);
+                if (pattern.findFirstApplicableRule(wordChars, dictionary) != null) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    @Override
+    public Word hyphenate(final CharSequence word, final int offset, final int length) {
+        /* The character sequence containing the characters in the word that we are looking for. */
+        final CharSequence chars = word.subSequence(offset, offset + length);
+        Word hyphenatedWord = null;
+
+        /* Look in the dictionary first, as it should be more accurate. */
+        final SegmentDictionary dictionary = getDictionary();
+        if (dictionary != null) {
+            hyphenatedWord = dictionary.getWord(chars.toString().toLowerCase(), 0);
+            if (hyphenatedWord == null) {
+                hyphenatedWord = findDerivatives(chars);
+            }
+        }
+
+
+        if (hyphenatedWord == null) {
+            /* The word was not found in the dictionary. Try the hyphenation patterns. */
+            final PatternTree patternTree = getHyphenationPatterns();
+            if (patternTree == null) {
+                return null;
+            }
+            hyphenatedWord = patternTree.hyphenate(chars, 0, length);
+        }
+
+        if (hyphenatedWord == null) {
+            return null;
+        }
+
+        final boolean capitalized = CharSequenceUtils.equalToCapitalized(hyphenatedWord.getNormalizedContent(), chars);
+        if (capitalized) {
+            return new CapitalizedWord(hyphenatedWord);
+        }
+        final boolean uppercase = CharSequenceUtils.equalToUppercase(hyphenatedWord.getNormalizedContent(), chars);
+        if (uppercase) {
+            return new UppercaseWord(hyphenatedWord);
+        }
+
+        if (CharSequenceUtils.hasAnyUppercase(chars)) {
+            /* There is unexpected capitalization. */
+            return new ExactWord(hyphenatedWord, chars.toString());
+        }
+
+        return hyphenatedWord;
+    }
+
+}

Deleted: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyConfig4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyConfig4a.java	2021-11-12 12:04:06 UTC (rev 12044)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyConfig4a.java	2021-11-12 12:17:39 UTC (rev 12045)
@@ -1,402 +0,0 @@
-/*
- * Copyright 2019 The FOray Project.
- *      http://www.foray.org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * This work is in part derived from the following work(s), used with the
- * permission of the licensor:
- *      Apache FOP, licensed by the Apache Software Foundation
- *
- */
-
-/*
- * $LastChangedRevision$
- * $LastChangedDate$
- * $LastChangedBy$
- */
-
-package org.foray.orthography;
-
-import org.foray.common.primitive.BooleanUtils;
-import org.foray.common.primitive.CharSequenceUtils;
-import org.foray.orthography.wrapper.CapitalizedWord;
-import org.foray.orthography.wrapper.ExactWord;
-import org.foray.orthography.wrapper.UppercaseWord;
-
-import org.axsl.orthography.Dictionary;
-import org.axsl.orthography.Orthography;
-import org.axsl.orthography.PartOfSpeech;
-import org.axsl.orthography.Word;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * The resources (dictionaries, word wrappers, hyphenation patterns, etc.) that should be used by a given orthography.
- */
-public class OrthographyConfig4a implements Orthography {
-
-    /** The list of ids to lists of regex patterns, which, for this orthography, signal a valid word if matched. */
-    private List<String> matchRuleListIds = new ArrayList<String>();
-
-    /** The list of ids to lists of derivative rules, which, for this orthography, can compute derivative words. */
-    private List<String> derivativeRuleListIds = new ArrayList<String>();
-
-    /** The dictionary for this orthography.*/
-    private DictionaryResource dictionaryResource;
-
-    /** The hyphenation patterns for this orthography. */
-    private HyphenationPatternsResource hyphenationPatternsResource;
-
-    /** The list of word wrapper factories for this orthography. */
-    private List<WordWrapperFactory<?>> wordWrapperFactories = new ArrayList<WordWrapperFactory<?>>();
-
-    /** The word breaker for this orthography. */
-    private WordBreaker wordBreaker;
-
-    /** The parent hyphenation server. */
-    private OrthographyServer4a server;
-
-    /* TODO: Following orthography-specific config needs to be moved to XML or subclass. */
-    /** Character delimiting a compound word. NB: This variable may be orthography specific, and may therefore need to
-     * be moved to the orthography configuration. However, we have found no evidence yet for that need. */
-    private char compoundWordMarker = '-';
-
-    /** Regex pattern used to break compound words into their components. */
-    private Pattern compoundWordBreaker = Pattern.compile(Character.toString(compoundWordMarker));
-
-    /**
-     * Constructor.
-     * @param server The parent hyphenation server.
-     */
-    public OrthographyConfig4a(final OrthographyServer4a server) {
-        this.server = server;
-    }
-
-    /**
-     * Returns the list of match rule Ids.
-     * @return The list of match rule Ids.
-     */
-    public List<String> getMatchRuleListIds() {
-        return this.matchRuleListIds;
-    }
-
-    /**
-     * Adds a match rule list Id to this configuration.
-     * @param matchRuleListId The new match rule list Id.
-     */
-    public void registerMatchRuleListId(final String matchRuleListId) {
-        if (this.matchRuleListIds.contains(matchRuleListId)) {
-            throw new IllegalArgumentException(
-                    "Match Rule List already configured for this orthography: " + matchRuleListId);
-        }
-        this.matchRuleListIds.add(matchRuleListId);
-    }
-
-    /**
-     * Returns the list of derivative rule Ids.
-     * @return The list of derivative rule Ids.
-     */
-    public List<String> getDerivativeRuleListIds() {
-        return this.derivativeRuleListIds;
-    }
-
-    /**
-     * Adds a derivative rule list Id to this configuration.
-     * @param derivativeRuleListId The new derivative rule list Id.
-     */
-    public void registerDerivativeRuleListId(final String derivativeRuleListId) {
-        if (this.derivativeRuleListIds.contains(derivativeRuleListId)) {
-            throw new IllegalArgumentException(
-                    "Derivative Rule List already configured for this orthography: " + derivativeRuleListId);
-        }
-        this.derivativeRuleListIds.add(derivativeRuleListId);
-    }
-
-    /**
-     * Returns the dictionary resource.
-     * @return The dictionary resource.
-     */
-    public DictionaryResource getDictionaryResource() {
-        return this.dictionaryResource;
-    }
-
-    /**
-     * Sets the dictionary resource.
-     * @param dictionaryResource The dictionaryResource to set.
-     */
-    public void setDictionaryResource(final DictionaryResource dictionaryResource) {
-        this.dictionaryResource = dictionaryResource;
-    }
-
-    /**
-     * Returns the hyphenation patterns resource.
-     * @return The hyphenation patterns resource
-     */
-    public HyphenationPatternsResource getHyphenationPatternsResource() {
-        return this.hyphenationPatternsResource;
-    }
-
-    /**
-     * Sets the hyphenation patterns resource.
-     * @param hyphenationPatternsResource The hyphenation patterns resource to set.
-     */
-    public void setHyphenationPatternsResource(final HyphenationPatternsResource hyphenationPatternsResource) {
-        this.hyphenationPatternsResource = hyphenationPatternsResource;
-    }
-
-    /**
-     * Returns the list of word wrapper factories.
-     * @return The list of word wrapper factories.
-     */
-    public List<WordWrapperFactory<?>> getWordWrapperFactories() {
-        return this.wordWrapperFactories;
-    }
-
-    /**
-     * Sets the list of word wrapper factories.
-     * @param wordWrapperFactories The word wrapper factories to set.
-     */
-    public void setWordWrapperFactories(final List<WordWrapperFactory<?>> wordWrapperFactories) {
-        this.wordWrapperFactories = wordWrapperFactories;
-    }
-
-    /**
-     * Returns the word breaker.
-     * @return The word breaker.
-     */
-    public WordBreaker getWordBreaker() {
-        return this.wordBreaker;
-    }
-
-    /**
-     * Sets the word breaker.
-     * @param wordBreaker The word breaker to set.
-     */
-    public void setWordBreaker(final WordBreaker wordBreaker) {
-        this.wordBreaker = wordBreaker;
-    }
-
-    /**
-     * Returns the dictionary.
-     * @return The dictionary, or null if one is not configured or cannot be obtained.
-     */
-    public SegmentDictionary getDictionary() {
-        if (this.dictionaryResource == null) {
-            return null;
-        } else {
-            return this.dictionaryResource.getResource();
-        }
-    }
-
-    /**
-     * Returns the hyphenation patterns.
-     * @return The hyphenation patterns.
-     */
-    public PatternTree getHyphenationPatterns() {
-        if (this.hyphenationPatternsResource == null) {
-            return null;
-        } else {
-            return this.hyphenationPatternsResource.getResource();
-        }
-    }
-
-    /**
-     * Searches the configured word wrapper factories for a match that would create a word derived from a dictionary
-     * word.
-     * @param chars The word to test.
-     * @return A word wrapper if {@code chars} matches a word wrapper factory, or null if not.
-     */
-    public WordWrapper findDerivatives(final CharSequence chars) {
-        /* TODO: For now, this returns the first item that matches. This may need to be expanded to allow nested wrapped
-         * words. */
-        WordWrapper word = null;
-        final Dictionary dictionary = getDictionary();
-        for (int index = 0; index < this.wordWrapperFactories.size(); index ++) {
-            final WordWrapperFactory<?> factory = this.wordWrapperFactories.get(index);
-            word = factory.makeInstance(chars, dictionary);
-            if (word != null) {
-                return word;
-            }
-        }
-        return null;
-    }
-
-    /**
-     * Indicates whether a given word is found in the match rules for this orthography, i.e. rules looking for
-     * non-dictionary items such as numbers, currency, etc.
-     * @param wordChars The word to be tested.
-     * @return True if and only if {@code word} matches at least one match rule for this orthography.
-     */
-    public boolean foundInMatchRules(final CharSequence wordChars) {
-        for (int idIndex = 0; idIndex < getMatchRuleListIds().size(); idIndex ++) {
-            final String ruleListId = matchRuleListIds.get(idIndex);
-            final List<Pattern> validWordPatterns = server.getMatchRules(ruleListId);
-            for (int index = 0; index < validWordPatterns.size(); index ++) {
-                final Pattern pattern = validWordPatterns.get(index);
-                final Matcher matcher = pattern.matcher(wordChars);
-                if (matcher.matches()) {
-                    return true;
-                }
-            }
-        }
-        return false;
-    }
-
-    @Override
-    public Word getWord(final CharSequence wordChars, final PartOfSpeech pos,
-            final List<Dictionary> adhocDictionaries) {
-        // TODO Auto-generated method stub
-        return null;
-    }
-
-    @Override
-    public boolean isValidWord(final CharSequence wordChars, final PartOfSpeech pos,
-            final List<Dictionary> adhocDictionaries) {
-        if (wordChars.length() < 1) {
-            return false;
-        }
-
-        /* 1. Check exact matches in adhoc dictionaries. */
-        if (adhocDictionaries != null) {
-            for (int index = 0; index < adhocDictionaries.size(); index ++) {
-                final Dictionary adhocDictionary = adhocDictionaries.get(index);
-                if (adhocDictionary.getWord(wordChars, 0) != null) {
-                    return true;
-                }
-            }
-        }
-
-        /* 2. Check exact matches in standard dictionaries for the orthography. */
-        final Dictionary orthoDictionary = getDictionary();
-        if (orthoDictionary != null
-                && orthoDictionary.getWord(wordChars, 0) != null) {
-            return true;
-        }
-
-        /* 3. Check the match rules. */
-        if (foundInMatchRules(wordChars)) {
-            return true;
-        }
-
-        /* 4. Check for compound word. */
-        if (CharSequenceUtils.contains(wordChars, '-')) {
-            final String[] components = this.compoundWordBreaker.split(wordChars);
-            final boolean[] componentsValid = new boolean[components.length];
-            for (int index = 0; index < components.length; index ++) {
-                componentsValid[index] = isValidWord(components[index], pos, adhocDictionaries);
-            }
-            if (BooleanUtils.allTrue(componentsValid)) {
-                return true;
-            }
-        }
-
-        /* 5. Check derivative matches in adhoc dictionaries. */
-        if (adhocDictionaries != null) {
-            for (int dictIndex = 0; dictIndex < adhocDictionaries.size(); dictIndex ++) {
-                final Dictionary adhocDictionary = adhocDictionaries.get(dictIndex);
-                if (isDerivativeFound(adhocDictionary, wordChars)) {
-                    return true;
-                }
-            }
-        }
-
-        /* 6. Check derivative matches in standard dictionaries for the orthography. */
-        if (orthoDictionary != null) {
-            if (isDerivativeFound(orthoDictionary, wordChars)) {
-                return true;
-            }
-        }
-
-        /* Not found in any dictionary. */
-        /* If the first character is uppercase, convert to lowercase and try again. Discussion: For English at least, we
-         * do not want the opposite effect, i.e. to convert words starting with lowercase have the first char converted
-         * to uppercase. If the word is in the dictionary as a proper noun, we should treat a failure to capitalize it
-         * as a spelling error. Also, we do not want to generally convert the entire word to lowercase, as capital
-         * letters in the middle of the word should normally be treated as a spelling error. For exceptions to this
-         * last rule, users should enter the oddly-capitalized word into a dictionary in that form.
-         * TODO: This capability should be included in the orthography configuration instead of being hard-coded
-         * here. */
-        if (Character.isUpperCase(wordChars.charAt(0))) {
-            final StringBuilder builder = new StringBuilder(wordChars);
-            builder.setCharAt(0, Character.toLowerCase(wordChars.charAt(0)));
-            return isValidWord(builder, pos, adhocDictionaries);
-        }
-
-        return false;
-    }
-
-    private boolean isDerivativeFound(final Dictionary dictionary, final CharSequence wordChars) {
-        for (int listIndex = 0; listIndex < this.derivativeRuleListIds.size(); listIndex ++) {
-            final String ruleListKey = this.derivativeRuleListIds.get(listIndex);
-            final List<DerivativePattern> patternList = this.server.getDerivativePatterns(ruleListKey);
-            for (int patternIndex = 0; patternIndex < patternList.size(); patternIndex ++) {
-                final DerivativePattern pattern = patternList.get(patternIndex);
-                if (pattern.findFirstApplicableRule(wordChars, dictionary) != null) {
-                    return true;
-                }
-            }
-        }
-        return false;
-    }
-
-    @Override
-    public Word hyphenate(final CharSequence word, final int offset, final int length) {
-        /* The character sequence containing the characters in the word that we are looking for. */
-        final CharSequence chars = word.subSequence(offset, offset + length);
-        Word hyphenatedWord = null;
-
-        /* Look in the dictionary first, as it should be more accurate. */
-        final SegmentDictionary dictionary = getDictionary();
-        if (dictionary != null) {
-            hyphenatedWord = dictionary.getWord(chars.toString().toLowerCase(), 0);
-            if (hyphenatedWord == null) {
-                hyphenatedWord = findDerivatives(chars);
-            }
-        }
-
-
-        if (hyphenatedWord == null) {
-            /* The word was not found in the dictionary. Try the hyphenation patterns. */
-            final PatternTree patternTree = getHyphenationPatterns();
-            if (patternTree == null) {
-                return null;
-            }
-            hyphenatedWord = patternTree.hyphenate(chars, 0, length);
-        }
-
-        if (hyphenatedWord == null) {
-            return null;
-        }
-
-        final boolean capitalized = CharSequenceUtils.equalToCapitalized(hyphenatedWord.getNormalizedContent(), chars);
-        if (capitalized) {
-            return new CapitalizedWord(hyphenatedWord);
-        }
-        final boolean uppercase = CharSequenceUtils.equalToUppercase(hyphenatedWord.getNormalizedContent(), chars);
-        if (uppercase) {
-            return new UppercaseWord(hyphenatedWord);
-        }
-
-        if (CharSequenceUtils.hasAnyUppercase(chars)) {
-            /* There is unexpected capitalization. */
-            return new ExactWord(hyphenatedWord, chars.toString());
-        }
-
-        return hyphenatedWord;
-    }
-
-}

Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyServer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyServer4a.java	2021-11-12 12:04:06 UTC (rev 12044)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyServer4a.java	2021-11-12 12:17:39 UTC (rev 12045)
@@ -77,9 +77,9 @@
     /** An EntityResolver to be used by XML parsers (for handling DTD catalogs, etc.). */
     private EntityResolver entityResolver = null;
 
-    /** Map of orthographies and their configurations to be used by this server. */
-    private Map<WritingSystem, OrthographyConfig4a> orthographyConfigurations =
-            new HashMap<WritingSystem, OrthographyConfig4a>();
+    /** Map of writing systems and their orthographies. */
+    private Map<WritingSystem, Orthography4a> orthographyMap =
+            new HashMap<WritingSystem, Orthography4a>();
 
     /** The map of match rule lists, keyed by id. */
     private Map<String, List<Pattern>> matchRuleLists = new HashMap<String, List<Pattern>>();
@@ -283,16 +283,16 @@
 
     /**
      * Registers a configuration for a given orthography.
-     * @param orthography The orthography for which the configuration should be registered.
-     * @param config The configuration for {@code orthography}.
+     * @param writingSystem The writing system for which the orthography should be registered.
+     * @param orthography The orthography to be used for {@code writingSystem}.
      */
-    public void registerOrthographyConfig(final WritingSystem orthography, final OrthographyConfig4a config) {
-        this.orthographyConfigurations.put(orthography, config);
+    public void registerOrthography(final WritingSystem writingSystem, final Orthography4a orthography) {
+        this.orthographyMap.put(writingSystem, orthography);
     }
 
     @Override
-    public OrthographyConfig4a getOrthography(final WritingSystem orthography) {
-        return this.orthographyConfigurations.get(orthography);
+    public Orthography4a getOrthography(final WritingSystem writingSystem) {
+        return this.orthographyMap.get(writingSystem);
     }
 
     /**

Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/ConfigParser.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/ConfigParser.java	2021-11-12 12:04:06 UTC (rev 12044)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/ConfigParser.java	2021-11-12 12:17:39 UTC (rev 12045)
@@ -41,7 +41,7 @@
 import org.foray.orthography.DerivativeRule;
 import org.foray.orthography.DictionaryResource;
 import org.foray.orthography.HyphenationPatternsResource;
-import org.foray.orthography.OrthographyConfig4a;
+import org.foray.orthography.Orthography4a;
 import org.foray.orthography.OrthographyServer4a;
 import org.foray.orthography.PosUtils;
 import org.foray.orthography.WordBreaker;
@@ -134,7 +134,7 @@
     private StringBuilder textAccumulator = new StringBuilder();
 
     /** Stateful variable tracking the current orthography configuration. */
-    private transient OrthographyConfig4a currentOrthographyConfig;
+    private transient Orthography4a currentOrthographyConfig;
 
 //    /** The map of match rule lists, keyed by id. */
 //    private Map<String, List<Pattern>> matchRuleLists = new HashMap<String, List<Pattern>>();
@@ -400,7 +400,7 @@
             return;
         }
         case "configuration": {
-            this.currentOrthographyConfig = new OrthographyConfig4a(this.hyphenationServer);
+            this.currentOrthographyConfig = new Orthography4a(this.hyphenationServer);
             return;
         }
         case "orthography": {
@@ -497,7 +497,7 @@
             this.logger.error("Unable to find script for: {}_{}_{}", languageString, countryString, scriptString);
             this.logger.error(getContextMessage());
         }
-        this.hyphenationServer.registerOrthographyConfig(orthography, this.currentOrthographyConfig);
+        this.hyphenationServer.registerOrthography(orthography, this.currentOrthographyConfig);
     }
 
     /**

Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java	2021-11-12 12:04:06 UTC (rev 12044)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java	2021-11-12 12:17:39 UTC (rev 12045)
@@ -35,7 +35,7 @@
 import org.foray.common.primitive.ObjectUtils;
 import org.foray.common.primitive.StringUtils;
 import org.foray.common.primitive.XmlUtils;
-import org.foray.orthography.OrthographyConfig4a;
+import org.foray.orthography.Orthography4a;
 import org.foray.orthography.OrthographyServer4a;
 import org.foray.orthography.OrthographyServerConfig;
 import org.foray.orthography.SegmentDictionary;
@@ -99,7 +99,7 @@
         private WritingSystem writingSystem;
 
         /** The orthography configuration for this element, only if {@link #writingSystem} is not null. */
-        private OrthographyConfig4a orthographyConfig;
+        private Orthography4a orthographyConfig;
 
         /**
          * Checks whether a set of element descriptor items match this instance.
@@ -160,7 +160,7 @@
     private Stack<Element> elementStack = new Stack<Element>();
 
     /** The current orthography configuration. */
-    private OrthographyConfig4a currentOrthographyConfig;
+    private Orthography4a currentOrthographyConfig;
 
     /** The logger. */
     private Logger logger = LoggerFactory.getLogger(SpellChecker.class);
@@ -367,7 +367,7 @@
                         countryString, script.getAlphaCode());
                 this.output.println(message + locationString());
             } else {
-                final OrthographyConfig4a config = this.server.getOrthography(element.writingSystem);
+                final Orthography4a config = this.server.getOrthography(element.writingSystem);
                 if (config == null) {
                     final String message = String.format(
                             "Unconfigured orthography. Language: %1$s, Country: %2$s, Script: %3$s ",

Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/WordChecker.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/WordChecker.java	2021-11-12 12:04:06 UTC (rev 12044)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/WordChecker.java	2021-11-12 12:17:39 UTC (rev 12045)
@@ -29,7 +29,7 @@
 package org.foray.orthography.util;
 
 import org.foray.common.i18n.WritingSystem4a;
-import org.foray.orthography.OrthographyConfig4a;
+import org.foray.orthography.Orthography4a;
 import org.foray.orthography.OrthographyServer4a;
 import org.foray.orthography.OrthographyServerConfig;
 import org.foray.orthography.SegmentDictionary;
@@ -85,7 +85,7 @@
 //    private Logger logger = LoggerFactory.getLogger(WordChecker.class);
 
     /** The current orthography configuration. */
-    private OrthographyConfig4a currentOrthographyConfig;
+    private Orthography4a currentOrthographyConfig;
 
     /** The Hyphenation server. */
     private OrthographyServer4a server;

Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/HyphenationConsumer4aTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/HyphenationConsumer4aTests.java	2021-11-12 12:04:06 UTC (rev 12044)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/HyphenationConsumer4aTests.java	2021-11-12 12:17:39 UTC (rev 12045)
@@ -142,7 +142,7 @@
         if (! testWord.toLowerCase().equals("hyphenation")) {
             throw new OrthographyException("Test of \"hyphenation\" has invalid input.");
         }
-        final OrthographyConfig4a orthography = server.getOrthography(WritingSystem4a.USA);
+        final Orthography4a orthography = server.getOrthography(WritingSystem4a.USA);
         final Word hyphenation = orthography.hyphenate(testWord, 0, testWord.length());
         Assert.assertNotNull(hyphenation);
         Assert.assertEquals(3, hyphenation.getQtyHyphenationPoints());
@@ -183,7 +183,7 @@
         if (! testWord.toLowerCase().equals("obligatory")) {
             throw new OrthographyException("Test of \"obligatory\" has invalid input.");
         }
-        final OrthographyConfig4a orthography = server.getOrthography(WritingSystem4a.USA);
+        final Orthography4a orthography = server.getOrthography(WritingSystem4a.USA);
         final Word hyphenation = orthography.hyphenate(testWord, 0, testWord.length());
         Assert.assertNotNull(hyphenation);
         Assert.assertEquals(4, hyphenation.getQtyHyphenationPoints());
@@ -202,7 +202,7 @@
     @Test
     public void testEnInvalidCharacter() throws OrthographyException {
         final String testWord = "table8";
-        final OrthographyConfig4a orthography = server.getOrthography(WritingSystem4a.USA);
+        final Orthography4a orthography = server.getOrthography(WritingSystem4a.USA);
         final Word hyphenation = orthography.hyphenate(testWord, 0, testWord.length());
         Assert.assertNull(hyphenation);
     }
@@ -215,7 +215,7 @@
     @Test
     public void testTimes() throws OrthographyException {
         final String testWord = "times";
-        final OrthographyConfig4a orthography = server.getOrthography(WritingSystem4a.USA);
+        final Orthography4a orthography = server.getOrthography(WritingSystem4a.USA);
         final Word hyphenation = orthography.hyphenate(testWord, 0, testWord.length());
         Assert.assertNotNull(hyphenation);
         Assert.assertEquals("times", hyphenation.toString());

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





[FOray-commit] SF.net SVN: foray:[12045] trunk/foray/foray-orthography/src

Modular XSL-FO Implementation for Java.

[FOray-commit] SF.net SVN: foray:[12045] trunk/foray/foray-orthography/src