[FOray-commit] SF.net SVN: foray:[12045] trunk/foray/foray-orthography/src
Modular XSL-FO Implementation for Java.
Status: Alpha
Brought to you by:
victormote
|
From: <vic...@us...> - 2021-11-12 12:17:42
|
Revision: 12045
http://sourceforge.net/p/foray/code/12045
Author: victormote
Date: 2021-11-12 12:17:39 +0000 (Fri, 12 Nov 2021)
Log Message:
-----------
Rename OrthographyConfig4a to Orthography4a, for consistency and clarity.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/HyphenationConsumer4a.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyServer4a.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/ConfigParser.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/WordChecker.java
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/HyphenationConsumer4aTests.java
Added Paths:
-----------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Orthography4a.java
Removed Paths:
-------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyConfig4a.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/HyphenationConsumer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/HyphenationConsumer4a.java 2021-11-12 12:04:06 UTC (rev 12044)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/HyphenationConsumer4a.java 2021-11-12 12:17:39 UTC (rev 12045)
@@ -73,7 +73,7 @@
final ParaConfig paraConfig) {
final ParaBranch4a wordSequence = new ParaBranch4a(paraConfig);
final CharSequence sequence = characters.subSequence(startIndex, startIndex + length);
- final OrthographyConfig4a orthographyConfig = this.server.getOrthography(paraConfig.getWritingSystem());
+ final Orthography4a orthographyConfig = this.server.getOrthography(paraConfig.getWritingSystem());
if (orthographyConfig == null) {
this.logger.error("Orthography not configured for: " + paraConfig.getWritingSystem());
}
Copied: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Orthography4a.java (from rev 12044, trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyConfig4a.java)
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Orthography4a.java (rev 0)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Orthography4a.java 2021-11-12 12:17:39 UTC (rev 12045)
@@ -0,0 +1,402 @@
+/*
+ * Copyright 2019 The FOray Project.
+ * http://www.foray.org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * This work is in part derived from the following work(s), used with the
+ * permission of the licensor:
+ * Apache FOP, licensed by the Apache Software Foundation
+ *
+ */
+
+/*
+ * $LastChangedRevision$
+ * $LastChangedDate$
+ * $LastChangedBy$
+ */
+
+package org.foray.orthography;
+
+import org.foray.common.primitive.BooleanUtils;
+import org.foray.common.primitive.CharSequenceUtils;
+import org.foray.orthography.wrapper.CapitalizedWord;
+import org.foray.orthography.wrapper.ExactWord;
+import org.foray.orthography.wrapper.UppercaseWord;
+
+import org.axsl.orthography.Dictionary;
+import org.axsl.orthography.Orthography;
+import org.axsl.orthography.PartOfSpeech;
+import org.axsl.orthography.Word;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * The resources (dictionaries, word wrappers, hyphenation patterns, etc.) that should be used by a given orthography.
+ */
+public class Orthography4a implements Orthography {
+
+ /** The list of ids to lists of regex patterns, which, for this orthography, signal a valid word if matched. */
+ private List<String> matchRuleListIds = new ArrayList<String>();
+
+ /** The list of ids to lists of derivative rules, which, for this orthography, can compute derivative words. */
+ private List<String> derivativeRuleListIds = new ArrayList<String>();
+
+ /** The dictionary for this orthography.*/
+ private DictionaryResource dictionaryResource;
+
+ /** The hyphenation patterns for this orthography. */
+ private HyphenationPatternsResource hyphenationPatternsResource;
+
+ /** The list of word wrapper factories for this orthography. */
+ private List<WordWrapperFactory<?>> wordWrapperFactories = new ArrayList<WordWrapperFactory<?>>();
+
+ /** The word breaker for this orthography. */
+ private WordBreaker wordBreaker;
+
+ /** The parent hyphenation server. */
+ private OrthographyServer4a server;
+
+ /* TODO: Following orthography-specific config needs to be moved to XML or subclass. */
+ /** Character delimiting a compound word. NB: This variable may be orthography specific, and may therefore need to
+ * be moved to the orthography configuration. However, we have found no evidence yet for that need. */
+ private char compoundWordMarker = '-';
+
+ /** Regex pattern used to break compound words into their components. */
+ private Pattern compoundWordBreaker = Pattern.compile(Character.toString(compoundWordMarker));
+
+ /**
+ * Constructor.
+ * @param server The parent hyphenation server.
+ */
+ public Orthography4a(final OrthographyServer4a server) {
+ this.server = server;
+ }
+
+ /**
+ * Returns the list of match rule Ids.
+ * @return The list of match rule Ids.
+ */
+ public List<String> getMatchRuleListIds() {
+ return this.matchRuleListIds;
+ }
+
+ /**
+ * Adds a match rule list Id to this configuration.
+ * @param matchRuleListId The new match rule list Id.
+ */
+ public void registerMatchRuleListId(final String matchRuleListId) {
+ if (this.matchRuleListIds.contains(matchRuleListId)) {
+ throw new IllegalArgumentException(
+ "Match Rule List already configured for this orthography: " + matchRuleListId);
+ }
+ this.matchRuleListIds.add(matchRuleListId);
+ }
+
+ /**
+ * Returns the list of derivative rule Ids.
+ * @return The list of derivative rule Ids.
+ */
+ public List<String> getDerivativeRuleListIds() {
+ return this.derivativeRuleListIds;
+ }
+
+ /**
+ * Adds a derivative rule list Id to this configuration.
+ * @param derivativeRuleListId The new derivative rule list Id.
+ */
+ public void registerDerivativeRuleListId(final String derivativeRuleListId) {
+ if (this.derivativeRuleListIds.contains(derivativeRuleListId)) {
+ throw new IllegalArgumentException(
+ "Derivative Rule List already configured for this orthography: " + derivativeRuleListId);
+ }
+ this.derivativeRuleListIds.add(derivativeRuleListId);
+ }
+
+ /**
+ * Returns the dictionary resource.
+ * @return The dictionary resource.
+ */
+ public DictionaryResource getDictionaryResource() {
+ return this.dictionaryResource;
+ }
+
+ /**
+ * Sets the dictionary resource.
+ * @param dictionaryResource The dictionaryResource to set.
+ */
+ public void setDictionaryResource(final DictionaryResource dictionaryResource) {
+ this.dictionaryResource = dictionaryResource;
+ }
+
+ /**
+ * Returns the hyphenation patterns resource.
+ * @return The hyphenation patterns resource
+ */
+ public HyphenationPatternsResource getHyphenationPatternsResource() {
+ return this.hyphenationPatternsResource;
+ }
+
+ /**
+ * Sets the hyphenation patterns resource.
+ * @param hyphenationPatternsResource The hyphenation patterns resource to set.
+ */
+ public void setHyphenationPatternsResource(final HyphenationPatternsResource hyphenationPatternsResource) {
+ this.hyphenationPatternsResource = hyphenationPatternsResource;
+ }
+
+ /**
+ * Returns the list of word wrapper factories.
+ * @return The list of word wrapper factories.
+ */
+ public List<WordWrapperFactory<?>> getWordWrapperFactories() {
+ return this.wordWrapperFactories;
+ }
+
+ /**
+ * Sets the list of word wrapper factories.
+ * @param wordWrapperFactories The word wrapper factories to set.
+ */
+ public void setWordWrapperFactories(final List<WordWrapperFactory<?>> wordWrapperFactories) {
+ this.wordWrapperFactories = wordWrapperFactories;
+ }
+
+ /**
+ * Returns the word breaker.
+ * @return The word breaker.
+ */
+ public WordBreaker getWordBreaker() {
+ return this.wordBreaker;
+ }
+
+ /**
+ * Sets the word breaker.
+ * @param wordBreaker The word breaker to set.
+ */
+ public void setWordBreaker(final WordBreaker wordBreaker) {
+ this.wordBreaker = wordBreaker;
+ }
+
+ /**
+ * Returns the dictionary.
+ * @return The dictionary, or null if one is not configured or cannot be obtained.
+ */
+ public SegmentDictionary getDictionary() {
+ if (this.dictionaryResource == null) {
+ return null;
+ } else {
+ return this.dictionaryResource.getResource();
+ }
+ }
+
+ /**
+ * Returns the hyphenation patterns.
+ * @return The hyphenation patterns.
+ */
+ public PatternTree getHyphenationPatterns() {
+ if (this.hyphenationPatternsResource == null) {
+ return null;
+ } else {
+ return this.hyphenationPatternsResource.getResource();
+ }
+ }
+
+ /**
+ * Searches the configured word wrapper factories for a match that would create a word derived from a dictionary
+ * word.
+ * @param chars The word to test.
+ * @return A word wrapper if {@code chars} matches a word wrapper factory, or null if not.
+ */
+ public WordWrapper findDerivatives(final CharSequence chars) {
+ /* TODO: For now, this returns the first item that matches. This may need to be expanded to allow nested wrapped
+ * words. */
+ WordWrapper word = null;
+ final Dictionary dictionary = getDictionary();
+ for (int index = 0; index < this.wordWrapperFactories.size(); index ++) {
+ final WordWrapperFactory<?> factory = this.wordWrapperFactories.get(index);
+ word = factory.makeInstance(chars, dictionary);
+ if (word != null) {
+ return word;
+ }
+ }
+ return null;
+ }
+
+ /**
+ * Indicates whether a given word is found in the match rules for this orthography, i.e. rules looking for
+ * non-dictionary items such as numbers, currency, etc.
+ * @param wordChars The word to be tested.
+ * @return True if and only if {@code word} matches at least one match rule for this orthography.
+ */
+ public boolean foundInMatchRules(final CharSequence wordChars) {
+ for (int idIndex = 0; idIndex < getMatchRuleListIds().size(); idIndex ++) {
+ final String ruleListId = matchRuleListIds.get(idIndex);
+ final List<Pattern> validWordPatterns = server.getMatchRules(ruleListId);
+ for (int index = 0; index < validWordPatterns.size(); index ++) {
+ final Pattern pattern = validWordPatterns.get(index);
+ final Matcher matcher = pattern.matcher(wordChars);
+ if (matcher.matches()) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ @Override
+ public Word getWord(final CharSequence wordChars, final PartOfSpeech pos,
+ final List<Dictionary> adhocDictionaries) {
+ // TODO Auto-generated method stub
+ return null;
+ }
+
+ @Override
+ public boolean isValidWord(final CharSequence wordChars, final PartOfSpeech pos,
+ final List<Dictionary> adhocDictionaries) {
+ if (wordChars.length() < 1) {
+ return false;
+ }
+
+ /* 1. Check exact matches in adhoc dictionaries. */
+ if (adhocDictionaries != null) {
+ for (int index = 0; index < adhocDictionaries.size(); index ++) {
+ final Dictionary adhocDictionary = adhocDictionaries.get(index);
+ if (adhocDictionary.getWord(wordChars, 0) != null) {
+ return true;
+ }
+ }
+ }
+
+ /* 2. Check exact matches in standard dictionaries for the orthography. */
+ final Dictionary orthoDictionary = getDictionary();
+ if (orthoDictionary != null
+ && orthoDictionary.getWord(wordChars, 0) != null) {
+ return true;
+ }
+
+ /* 3. Check the match rules. */
+ if (foundInMatchRules(wordChars)) {
+ return true;
+ }
+
+ /* 4. Check for compound word. */
+ if (CharSequenceUtils.contains(wordChars, '-')) {
+ final String[] components = this.compoundWordBreaker.split(wordChars);
+ final boolean[] componentsValid = new boolean[components.length];
+ for (int index = 0; index < components.length; index ++) {
+ componentsValid[index] = isValidWord(components[index], pos, adhocDictionaries);
+ }
+ if (BooleanUtils.allTrue(componentsValid)) {
+ return true;
+ }
+ }
+
+ /* 5. Check derivative matches in adhoc dictionaries. */
+ if (adhocDictionaries != null) {
+ for (int dictIndex = 0; dictIndex < adhocDictionaries.size(); dictIndex ++) {
+ final Dictionary adhocDictionary = adhocDictionaries.get(dictIndex);
+ if (isDerivativeFound(adhocDictionary, wordChars)) {
+ return true;
+ }
+ }
+ }
+
+ /* 6. Check derivative matches in standard dictionaries for the orthography. */
+ if (orthoDictionary != null) {
+ if (isDerivativeFound(orthoDictionary, wordChars)) {
+ return true;
+ }
+ }
+
+ /* Not found in any dictionary. */
+ /* If the first character is uppercase, convert to lowercase and try again. Discussion: For English at least, we
+ * do not want the opposite effect, i.e. to convert words starting with lowercase have the first char converted
+ * to uppercase. If the word is in the dictionary as a proper noun, we should treat a failure to capitalize it
+ * as a spelling error. Also, we do not want to generally convert the entire word to lowercase, as capital
+ * letters in the middle of the word should normally be treated as a spelling error. For exceptions to this
+ * last rule, users should enter the oddly-capitalized word into a dictionary in that form.
+ * TODO: This capability should be included in the orthography configuration instead of being hard-coded
+ * here. */
+ if (Character.isUpperCase(wordChars.charAt(0))) {
+ final StringBuilder builder = new StringBuilder(wordChars);
+ builder.setCharAt(0, Character.toLowerCase(wordChars.charAt(0)));
+ return isValidWord(builder, pos, adhocDictionaries);
+ }
+
+ return false;
+ }
+
+ private boolean isDerivativeFound(final Dictionary dictionary, final CharSequence wordChars) {
+ for (int listIndex = 0; listIndex < this.derivativeRuleListIds.size(); listIndex ++) {
+ final String ruleListKey = this.derivativeRuleListIds.get(listIndex);
+ final List<DerivativePattern> patternList = this.server.getDerivativePatterns(ruleListKey);
+ for (int patternIndex = 0; patternIndex < patternList.size(); patternIndex ++) {
+ final DerivativePattern pattern = patternList.get(patternIndex);
+ if (pattern.findFirstApplicableRule(wordChars, dictionary) != null) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ @Override
+ public Word hyphenate(final CharSequence word, final int offset, final int length) {
+ /* The character sequence containing the characters in the word that we are looking for. */
+ final CharSequence chars = word.subSequence(offset, offset + length);
+ Word hyphenatedWord = null;
+
+ /* Look in the dictionary first, as it should be more accurate. */
+ final SegmentDictionary dictionary = getDictionary();
+ if (dictionary != null) {
+ hyphenatedWord = dictionary.getWord(chars.toString().toLowerCase(), 0);
+ if (hyphenatedWord == null) {
+ hyphenatedWord = findDerivatives(chars);
+ }
+ }
+
+
+ if (hyphenatedWord == null) {
+ /* The word was not found in the dictionary. Try the hyphenation patterns. */
+ final PatternTree patternTree = getHyphenationPatterns();
+ if (patternTree == null) {
+ return null;
+ }
+ hyphenatedWord = patternTree.hyphenate(chars, 0, length);
+ }
+
+ if (hyphenatedWord == null) {
+ return null;
+ }
+
+ final boolean capitalized = CharSequenceUtils.equalToCapitalized(hyphenatedWord.getNormalizedContent(), chars);
+ if (capitalized) {
+ return new CapitalizedWord(hyphenatedWord);
+ }
+ final boolean uppercase = CharSequenceUtils.equalToUppercase(hyphenatedWord.getNormalizedContent(), chars);
+ if (uppercase) {
+ return new UppercaseWord(hyphenatedWord);
+ }
+
+ if (CharSequenceUtils.hasAnyUppercase(chars)) {
+ /* There is unexpected capitalization. */
+ return new ExactWord(hyphenatedWord, chars.toString());
+ }
+
+ return hyphenatedWord;
+ }
+
+}
Deleted: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyConfig4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyConfig4a.java 2021-11-12 12:04:06 UTC (rev 12044)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyConfig4a.java 2021-11-12 12:17:39 UTC (rev 12045)
@@ -1,402 +0,0 @@
-/*
- * Copyright 2019 The FOray Project.
- * http://www.foray.org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * This work is in part derived from the following work(s), used with the
- * permission of the licensor:
- * Apache FOP, licensed by the Apache Software Foundation
- *
- */
-
-/*
- * $LastChangedRevision$
- * $LastChangedDate$
- * $LastChangedBy$
- */
-
-package org.foray.orthography;
-
-import org.foray.common.primitive.BooleanUtils;
-import org.foray.common.primitive.CharSequenceUtils;
-import org.foray.orthography.wrapper.CapitalizedWord;
-import org.foray.orthography.wrapper.ExactWord;
-import org.foray.orthography.wrapper.UppercaseWord;
-
-import org.axsl.orthography.Dictionary;
-import org.axsl.orthography.Orthography;
-import org.axsl.orthography.PartOfSpeech;
-import org.axsl.orthography.Word;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * The resources (dictionaries, word wrappers, hyphenation patterns, etc.) that should be used by a given orthography.
- */
-public class OrthographyConfig4a implements Orthography {
-
- /** The list of ids to lists of regex patterns, which, for this orthography, signal a valid word if matched. */
- private List<String> matchRuleListIds = new ArrayList<String>();
-
- /** The list of ids to lists of derivative rules, which, for this orthography, can compute derivative words. */
- private List<String> derivativeRuleListIds = new ArrayList<String>();
-
- /** The dictionary for this orthography.*/
- private DictionaryResource dictionaryResource;
-
- /** The hyphenation patterns for this orthography. */
- private HyphenationPatternsResource hyphenationPatternsResource;
-
- /** The list of word wrapper factories for this orthography. */
- private List<WordWrapperFactory<?>> wordWrapperFactories = new ArrayList<WordWrapperFactory<?>>();
-
- /** The word breaker for this orthography. */
- private WordBreaker wordBreaker;
-
- /** The parent hyphenation server. */
- private OrthographyServer4a server;
-
- /* TODO: Following orthography-specific config needs to be moved to XML or subclass. */
- /** Character delimiting a compound word. NB: This variable may be orthography specific, and may therefore need to
- * be moved to the orthography configuration. However, we have found no evidence yet for that need. */
- private char compoundWordMarker = '-';
-
- /** Regex pattern used to break compound words into their components. */
- private Pattern compoundWordBreaker = Pattern.compile(Character.toString(compoundWordMarker));
-
- /**
- * Constructor.
- * @param server The parent hyphenation server.
- */
- public OrthographyConfig4a(final OrthographyServer4a server) {
- this.server = server;
- }
-
- /**
- * Returns the list of match rule Ids.
- * @return The list of match rule Ids.
- */
- public List<String> getMatchRuleListIds() {
- return this.matchRuleListIds;
- }
-
- /**
- * Adds a match rule list Id to this configuration.
- * @param matchRuleListId The new match rule list Id.
- */
- public void registerMatchRuleListId(final String matchRuleListId) {
- if (this.matchRuleListIds.contains(matchRuleListId)) {
- throw new IllegalArgumentException(
- "Match Rule List already configured for this orthography: " + matchRuleListId);
- }
- this.matchRuleListIds.add(matchRuleListId);
- }
-
- /**
- * Returns the list of derivative rule Ids.
- * @return The list of derivative rule Ids.
- */
- public List<String> getDerivativeRuleListIds() {
- return this.derivativeRuleListIds;
- }
-
- /**
- * Adds a derivative rule list Id to this configuration.
- * @param derivativeRuleListId The new derivative rule list Id.
- */
- public void registerDerivativeRuleListId(final String derivativeRuleListId) {
- if (this.derivativeRuleListIds.contains(derivativeRuleListId)) {
- throw new IllegalArgumentException(
- "Derivative Rule List already configured for this orthography: " + derivativeRuleListId);
- }
- this.derivativeRuleListIds.add(derivativeRuleListId);
- }
-
- /**
- * Returns the dictionary resource.
- * @return The dictionary resource.
- */
- public DictionaryResource getDictionaryResource() {
- return this.dictionaryResource;
- }
-
- /**
- * Sets the dictionary resource.
- * @param dictionaryResource The dictionaryResource to set.
- */
- public void setDictionaryResource(final DictionaryResource dictionaryResource) {
- this.dictionaryResource = dictionaryResource;
- }
-
- /**
- * Returns the hyphenation patterns resource.
- * @return The hyphenation patterns resource
- */
- public HyphenationPatternsResource getHyphenationPatternsResource() {
- return this.hyphenationPatternsResource;
- }
-
- /**
- * Sets the hyphenation patterns resource.
- * @param hyphenationPatternsResource The hyphenation patterns resource to set.
- */
- public void setHyphenationPatternsResource(final HyphenationPatternsResource hyphenationPatternsResource) {
- this.hyphenationPatternsResource = hyphenationPatternsResource;
- }
-
- /**
- * Returns the list of word wrapper factories.
- * @return The list of word wrapper factories.
- */
- public List<WordWrapperFactory<?>> getWordWrapperFactories() {
- return this.wordWrapperFactories;
- }
-
- /**
- * Sets the list of word wrapper factories.
- * @param wordWrapperFactories The word wrapper factories to set.
- */
- public void setWordWrapperFactories(final List<WordWrapperFactory<?>> wordWrapperFactories) {
- this.wordWrapperFactories = wordWrapperFactories;
- }
-
- /**
- * Returns the word breaker.
- * @return The word breaker.
- */
- public WordBreaker getWordBreaker() {
- return this.wordBreaker;
- }
-
- /**
- * Sets the word breaker.
- * @param wordBreaker The word breaker to set.
- */
- public void setWordBreaker(final WordBreaker wordBreaker) {
- this.wordBreaker = wordBreaker;
- }
-
- /**
- * Returns the dictionary.
- * @return The dictionary, or null if one is not configured or cannot be obtained.
- */
- public SegmentDictionary getDictionary() {
- if (this.dictionaryResource == null) {
- return null;
- } else {
- return this.dictionaryResource.getResource();
- }
- }
-
- /**
- * Returns the hyphenation patterns.
- * @return The hyphenation patterns.
- */
- public PatternTree getHyphenationPatterns() {
- if (this.hyphenationPatternsResource == null) {
- return null;
- } else {
- return this.hyphenationPatternsResource.getResource();
- }
- }
-
- /**
- * Searches the configured word wrapper factories for a match that would create a word derived from a dictionary
- * word.
- * @param chars The word to test.
- * @return A word wrapper if {@code chars} matches a word wrapper factory, or null if not.
- */
- public WordWrapper findDerivatives(final CharSequence chars) {
- /* TODO: For now, this returns the first item that matches. This may need to be expanded to allow nested wrapped
- * words. */
- WordWrapper word = null;
- final Dictionary dictionary = getDictionary();
- for (int index = 0; index < this.wordWrapperFactories.size(); index ++) {
- final WordWrapperFactory<?> factory = this.wordWrapperFactories.get(index);
- word = factory.makeInstance(chars, dictionary);
- if (word != null) {
- return word;
- }
- }
- return null;
- }
-
- /**
- * Indicates whether a given word is found in the match rules for this orthography, i.e. rules looking for
- * non-dictionary items such as numbers, currency, etc.
- * @param wordChars The word to be tested.
- * @return True if and only if {@code word} matches at least one match rule for this orthography.
- */
- public boolean foundInMatchRules(final CharSequence wordChars) {
- for (int idIndex = 0; idIndex < getMatchRuleListIds().size(); idIndex ++) {
- final String ruleListId = matchRuleListIds.get(idIndex);
- final List<Pattern> validWordPatterns = server.getMatchRules(ruleListId);
- for (int index = 0; index < validWordPatterns.size(); index ++) {
- final Pattern pattern = validWordPatterns.get(index);
- final Matcher matcher = pattern.matcher(wordChars);
- if (matcher.matches()) {
- return true;
- }
- }
- }
- return false;
- }
-
- @Override
- public Word getWord(final CharSequence wordChars, final PartOfSpeech pos,
- final List<Dictionary> adhocDictionaries) {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public boolean isValidWord(final CharSequence wordChars, final PartOfSpeech pos,
- final List<Dictionary> adhocDictionaries) {
- if (wordChars.length() < 1) {
- return false;
- }
-
- /* 1. Check exact matches in adhoc dictionaries. */
- if (adhocDictionaries != null) {
- for (int index = 0; index < adhocDictionaries.size(); index ++) {
- final Dictionary adhocDictionary = adhocDictionaries.get(index);
- if (adhocDictionary.getWord(wordChars, 0) != null) {
- return true;
- }
- }
- }
-
- /* 2. Check exact matches in standard dictionaries for the orthography. */
- final Dictionary orthoDictionary = getDictionary();
- if (orthoDictionary != null
- && orthoDictionary.getWord(wordChars, 0) != null) {
- return true;
- }
-
- /* 3. Check the match rules. */
- if (foundInMatchRules(wordChars)) {
- return true;
- }
-
- /* 4. Check for compound word. */
- if (CharSequenceUtils.contains(wordChars, '-')) {
- final String[] components = this.compoundWordBreaker.split(wordChars);
- final boolean[] componentsValid = new boolean[components.length];
- for (int index = 0; index < components.length; index ++) {
- componentsValid[index] = isValidWord(components[index], pos, adhocDictionaries);
- }
- if (BooleanUtils.allTrue(componentsValid)) {
- return true;
- }
- }
-
- /* 5. Check derivative matches in adhoc dictionaries. */
- if (adhocDictionaries != null) {
- for (int dictIndex = 0; dictIndex < adhocDictionaries.size(); dictIndex ++) {
- final Dictionary adhocDictionary = adhocDictionaries.get(dictIndex);
- if (isDerivativeFound(adhocDictionary, wordChars)) {
- return true;
- }
- }
- }
-
- /* 6. Check derivative matches in standard dictionaries for the orthography. */
- if (orthoDictionary != null) {
- if (isDerivativeFound(orthoDictionary, wordChars)) {
- return true;
- }
- }
-
- /* Not found in any dictionary. */
- /* If the first character is uppercase, convert to lowercase and try again. Discussion: For English at least, we
- * do not want the opposite effect, i.e. to convert words starting with lowercase have the first char converted
- * to uppercase. If the word is in the dictionary as a proper noun, we should treat a failure to capitalize it
- * as a spelling error. Also, we do not want to generally convert the entire word to lowercase, as capital
- * letters in the middle of the word should normally be treated as a spelling error. For exceptions to this
- * last rule, users should enter the oddly-capitalized word into a dictionary in that form.
- * TODO: This capability should be included in the orthography configuration instead of being hard-coded
- * here. */
- if (Character.isUpperCase(wordChars.charAt(0))) {
- final StringBuilder builder = new StringBuilder(wordChars);
- builder.setCharAt(0, Character.toLowerCase(wordChars.charAt(0)));
- return isValidWord(builder, pos, adhocDictionaries);
- }
-
- return false;
- }
-
- private boolean isDerivativeFound(final Dictionary dictionary, final CharSequence wordChars) {
- for (int listIndex = 0; listIndex < this.derivativeRuleListIds.size(); listIndex ++) {
- final String ruleListKey = this.derivativeRuleListIds.get(listIndex);
- final List<DerivativePattern> patternList = this.server.getDerivativePatterns(ruleListKey);
- for (int patternIndex = 0; patternIndex < patternList.size(); patternIndex ++) {
- final DerivativePattern pattern = patternList.get(patternIndex);
- if (pattern.findFirstApplicableRule(wordChars, dictionary) != null) {
- return true;
- }
- }
- }
- return false;
- }
-
- @Override
- public Word hyphenate(final CharSequence word, final int offset, final int length) {
- /* The character sequence containing the characters in the word that we are looking for. */
- final CharSequence chars = word.subSequence(offset, offset + length);
- Word hyphenatedWord = null;
-
- /* Look in the dictionary first, as it should be more accurate. */
- final SegmentDictionary dictionary = getDictionary();
- if (dictionary != null) {
- hyphenatedWord = dictionary.getWord(chars.toString().toLowerCase(), 0);
- if (hyphenatedWord == null) {
- hyphenatedWord = findDerivatives(chars);
- }
- }
-
-
- if (hyphenatedWord == null) {
- /* The word was not found in the dictionary. Try the hyphenation patterns. */
- final PatternTree patternTree = getHyphenationPatterns();
- if (patternTree == null) {
- return null;
- }
- hyphenatedWord = patternTree.hyphenate(chars, 0, length);
- }
-
- if (hyphenatedWord == null) {
- return null;
- }
-
- final boolean capitalized = CharSequenceUtils.equalToCapitalized(hyphenatedWord.getNormalizedContent(), chars);
- if (capitalized) {
- return new CapitalizedWord(hyphenatedWord);
- }
- final boolean uppercase = CharSequenceUtils.equalToUppercase(hyphenatedWord.getNormalizedContent(), chars);
- if (uppercase) {
- return new UppercaseWord(hyphenatedWord);
- }
-
- if (CharSequenceUtils.hasAnyUppercase(chars)) {
- /* There is unexpected capitalization. */
- return new ExactWord(hyphenatedWord, chars.toString());
- }
-
- return hyphenatedWord;
- }
-
-}
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyServer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyServer4a.java 2021-11-12 12:04:06 UTC (rev 12044)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyServer4a.java 2021-11-12 12:17:39 UTC (rev 12045)
@@ -77,9 +77,9 @@
/** An EntityResolver to be used by XML parsers (for handling DTD catalogs, etc.). */
private EntityResolver entityResolver = null;
- /** Map of orthographies and their configurations to be used by this server. */
- private Map<WritingSystem, OrthographyConfig4a> orthographyConfigurations =
- new HashMap<WritingSystem, OrthographyConfig4a>();
+ /** Map of writing systems and their orthographies. */
+ private Map<WritingSystem, Orthography4a> orthographyMap =
+ new HashMap<WritingSystem, Orthography4a>();
/** The map of match rule lists, keyed by id. */
private Map<String, List<Pattern>> matchRuleLists = new HashMap<String, List<Pattern>>();
@@ -283,16 +283,16 @@
/**
* Registers a configuration for a given orthography.
- * @param orthography The orthography for which the configuration should be registered.
- * @param config The configuration for {@code orthography}.
+ * @param writingSystem The writing system for which the orthography should be registered.
+ * @param orthography The orthography for {@code writingSystem}.
*/
- public void registerOrthographyConfig(final WritingSystem orthography, final OrthographyConfig4a config) {
- this.orthographyConfigurations.put(orthography, config);
+ public void registerOrthography(final WritingSystem writingSystem, final Orthography4a orthography) {
+ this.orthographyMap.put(writingSystem, orthography);
}
@Override
- public OrthographyConfig4a getOrthography(final WritingSystem orthography) {
- return this.orthographyConfigurations.get(orthography);
+ public Orthography4a getOrthography(final WritingSystem writingSystem) {
+ return this.orthographyMap.get(writingSystem);
}
/**
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/ConfigParser.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/ConfigParser.java 2021-11-12 12:04:06 UTC (rev 12044)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/ConfigParser.java 2021-11-12 12:17:39 UTC (rev 12045)
@@ -41,7 +41,7 @@
import org.foray.orthography.DerivativeRule;
import org.foray.orthography.DictionaryResource;
import org.foray.orthography.HyphenationPatternsResource;
-import org.foray.orthography.OrthographyConfig4a;
+import org.foray.orthography.Orthography4a;
import org.foray.orthography.OrthographyServer4a;
import org.foray.orthography.PosUtils;
import org.foray.orthography.WordBreaker;
@@ -134,7 +134,7 @@
private StringBuilder textAccumulator = new StringBuilder();
/** Stateful variable tracking the current orthography configuration. */
- private transient OrthographyConfig4a currentOrthographyConfig;
+ private transient Orthography4a currentOrthographyConfig;
// /** The map of match rule lists, keyed by id. */
// private Map<String, List<Pattern>> matchRuleLists = new HashMap<String, List<Pattern>>();
@@ -400,7 +400,7 @@
return;
}
case "configuration": {
- this.currentOrthographyConfig = new OrthographyConfig4a(this.hyphenationServer);
+ this.currentOrthographyConfig = new Orthography4a(this.hyphenationServer);
return;
}
case "orthography": {
@@ -497,7 +497,7 @@
this.logger.error("Unable to find script for: {}_{}_{}", languageString, countryString, scriptString);
this.logger.error(getContextMessage());
}
- this.hyphenationServer.registerOrthographyConfig(orthography, this.currentOrthographyConfig);
+ this.hyphenationServer.registerOrthography(orthography, this.currentOrthographyConfig);
}
/**
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java 2021-11-12 12:04:06 UTC (rev 12044)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java 2021-11-12 12:17:39 UTC (rev 12045)
@@ -35,7 +35,7 @@
import org.foray.common.primitive.ObjectUtils;
import org.foray.common.primitive.StringUtils;
import org.foray.common.primitive.XmlUtils;
-import org.foray.orthography.OrthographyConfig4a;
+import org.foray.orthography.Orthography4a;
import org.foray.orthography.OrthographyServer4a;
import org.foray.orthography.OrthographyServerConfig;
import org.foray.orthography.SegmentDictionary;
@@ -99,7 +99,7 @@
private WritingSystem writingSystem;
/** The orthography configuration for this element, only if {@link #writingSystem} is not null. */
- private OrthographyConfig4a orthographyConfig;
+ private Orthography4a orthographyConfig;
/**
* Checks whether a set of element descriptor items match this instance.
@@ -160,7 +160,7 @@
private Stack<Element> elementStack = new Stack<Element>();
/** The current orthography configuration. */
- private OrthographyConfig4a currentOrthographyConfig;
+ private Orthography4a currentOrthographyConfig;
/** The logger. */
private Logger logger = LoggerFactory.getLogger(SpellChecker.class);
@@ -367,7 +367,7 @@
countryString, script.getAlphaCode());
this.output.println(message + locationString());
} else {
- final OrthographyConfig4a config = this.server.getOrthography(element.writingSystem);
+ final Orthography4a config = this.server.getOrthography(element.writingSystem);
if (config == null) {
final String message = String.format(
"Unconfigured orthography. Language: %1$s, Country: %2$s, Script: %3$s ",
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/WordChecker.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/WordChecker.java 2021-11-12 12:04:06 UTC (rev 12044)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/WordChecker.java 2021-11-12 12:17:39 UTC (rev 12045)
@@ -29,7 +29,7 @@
package org.foray.orthography.util;
import org.foray.common.i18n.WritingSystem4a;
-import org.foray.orthography.OrthographyConfig4a;
+import org.foray.orthography.Orthography4a;
import org.foray.orthography.OrthographyServer4a;
import org.foray.orthography.OrthographyServerConfig;
import org.foray.orthography.SegmentDictionary;
@@ -85,7 +85,7 @@
// private Logger logger = LoggerFactory.getLogger(WordChecker.class);
/** The current orthography configuration. */
- private OrthographyConfig4a currentOrthographyConfig;
+ private Orthography4a currentOrthographyConfig;
/** The Hyphenation server. */
private OrthographyServer4a server;
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/HyphenationConsumer4aTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/HyphenationConsumer4aTests.java 2021-11-12 12:04:06 UTC (rev 12044)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/HyphenationConsumer4aTests.java 2021-11-12 12:17:39 UTC (rev 12045)
@@ -142,7 +142,7 @@
if (! testWord.toLowerCase().equals("hyphenation")) {
throw new OrthographyException("Test of \"hyphenation\" has invalid input.");
}
- final OrthographyConfig4a orthography = server.getOrthography(WritingSystem4a.USA);
+ final Orthography4a orthography = server.getOrthography(WritingSystem4a.USA);
final Word hyphenation = orthography.hyphenate(testWord, 0, testWord.length());
Assert.assertNotNull(hyphenation);
Assert.assertEquals(3, hyphenation.getQtyHyphenationPoints());
@@ -183,7 +183,7 @@
if (! testWord.toLowerCase().equals("obligatory")) {
throw new OrthographyException("Test of \"obligatory\" has invalid input.");
}
- final OrthographyConfig4a orthography = server.getOrthography(WritingSystem4a.USA);
+ final Orthography4a orthography = server.getOrthography(WritingSystem4a.USA);
final Word hyphenation = orthography.hyphenate(testWord, 0, testWord.length());
Assert.assertNotNull(hyphenation);
Assert.assertEquals(4, hyphenation.getQtyHyphenationPoints());
@@ -202,7 +202,7 @@
@Test
public void testEnInvalidCharacter() throws OrthographyException {
final String testWord = "table8";
- final OrthographyConfig4a orthography = server.getOrthography(WritingSystem4a.USA);
+ final Orthography4a orthography = server.getOrthography(WritingSystem4a.USA);
final Word hyphenation = orthography.hyphenate(testWord, 0, testWord.length());
Assert.assertNull(hyphenation);
}
@@ -215,7 +215,7 @@
@Test
public void testTimes() throws OrthographyException {
final String testWord = "times";
- final OrthographyConfig4a orthography = server.getOrthography(WritingSystem4a.USA);
+ final Orthography4a orthography = server.getOrthography(WritingSystem4a.USA);
final Word hyphenation = orthography.hyphenate(testWord, 0, testWord.length());
Assert.assertNotNull(hyphenation);
Assert.assertEquals("times", hyphenation.toString());
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|