foray-commit Mailing List for FOray (Page 24)
Modular XSL-FO Implementation for Java.
Status: Alpha
Brought to you by:
victormote
You can subscribe to this list here.
| 2006 | Jan | Feb | Mar (139) | Apr (98) | May (250) | Jun (394) | Jul (84) | Aug (13) | Sep (420) | Oct (186) | Nov (1) | Dec (3) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2007 | Jan (108) | Feb (202) | Mar (291) | Apr (247) | May (374) | Jun (227) | Jul (231) | Aug (60) | Sep (31) | Oct (45) | Nov (18) | Dec |
| 2008 | Jan (38) | Feb (71) | Mar (142) | Apr | May (59) | Jun (6) | Jul (10) | Aug | Sep | Oct | Nov | Dec |
| 2009 | Jan (12) | Feb (4) | Mar (88) | Apr (121) | May (17) | Jun (30) | Jul | Aug (5) | Sep | Oct (1) | Nov | Dec |
| 2010 | Jan (11) | Feb (76) | Mar (11) | Apr | May (11) | Jun | Jul | Aug (44) | Sep (14) | Oct (7) | Nov | Dec |
| 2011 | Jan | Feb | Mar | Apr | May (9) | Jun | Jul | Aug | Sep | Oct (10) | Nov | Dec |
| 2012 | Jan | Feb | Mar | Apr | May | Jun (3) | Jul (4) | Aug | Sep | Oct | Nov | Dec |
| 2016 | Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec (168) |
| 2017 | Jan (77) | Feb (11) | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec |
| 2018 | Jan | Feb | Mar (1) | Apr (6) | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec |
| 2019 | Jan | Feb (88) | Mar (118) | Apr (1) | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec |
| 2020 | Jan | Feb | Mar | Apr | May (6) | Jun | Jul | Aug | Sep | Oct | Nov | Dec (141) |
| 2021 | Jan (170) | Feb (20) | Mar | Apr | May | Jun | Jul (1) | Aug | Sep | Oct (62) | Nov (189) | Dec (162) |
| 2022 | Jan (201) | Feb (118) | Mar (8) | Apr | May (2) | Jun (47) | Jul (19) | Aug (14) | Sep (3) | Oct | Nov (28) | Dec (235) |
| 2023 | Jan (112) | Feb (23) | Mar (2) | Apr (2) | May | Jun (1) | Jul | Aug (70) | Sep (92) | Oct (20) | Nov (1) | Dec (1) |
| 2024 | Jan | Feb | Mar (1) | Apr (1) | May (14) | Jun (11) | Jul (1) | Aug | Sep | Oct | Nov | Dec |
| 2025 | Jan (10) | Feb (29) | Mar | Apr (162) | May (245) | Jun (83) | Jul | Aug (1) | Sep | Oct | Nov | Dec |
|
From: <vic...@us...> - 2023-10-02 23:19:36
|
Revision: 13301
http://sourceforge.net/p/foray/code/13301
Author: victormote
Date: 2023-10-02 23:19:34 +0000 (Mon, 02 Oct 2023)
Log Message:
-----------
Minor dependency cleanup.
Modified Paths:
--------------
trunk/foray/foray-orthography/build.gradle
Modified: trunk/foray/foray-orthography/build.gradle
===================================================================
--- trunk/foray/foray-orthography/build.gradle 2023-10-02 22:55:54 UTC (rev 13300)
+++ trunk/foray/foray-orthography/build.gradle 2023-10-02 23:19:34 UTC (rev 13301)
@@ -11,17 +11,16 @@
api (group: 'commons-io', name: 'commons-io', version: versions.commonsIo)
implementation (group: 'com.ibm.icu', name: 'icu4j', version: versions.icu4j)
- api (group: 'org.axsl', name: 'axsl-constants', version: versions.axsl)
- api (group: 'org.axsl', name: 'axsl-primitive', version: versions.axsl)
+ implementation (group: 'org.axsl', name: 'axsl-constants', version: versions.axsl)
+ api (group: 'org.axsl', name: 'axsl-fotree', version: versions.axsl)
api (group: 'org.axsl', name: 'axsl-i18n', version: versions.axsl)
+ api (group: 'org.axsl', name: 'axsl-kp-model', version: versions.axsl)
api (group: 'org.axsl', name: 'axsl-orthography', version: versions.axsl)
- api (group: 'org.axsl', name: 'axsl-fotree', version: versions.axsl)
- api (group: 'org.axsl', name: 'axsl-kp-model', version: versions.axsl)
- implementation (group: 'org.axsl', name: 'axsl-constants', version: versions.axsl)
+ api (group: 'org.axsl', name: 'axsl-primitive', version: versions.axsl)
api (group: 'org.axsl', name: 'axsl-value', version: versions.axsl)
+ api (project(':foray-common'))
api (project(':foray-primitive'))
- api (project(':foray-common'))
api (project(':foray-xml'))
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-10-02 22:55:58
|
Revision: 13300
http://sourceforge.net/p/foray/code/13300
Author: victormote
Date: 2023-10-02 22:55:54 +0000 (Mon, 02 Oct 2023)
Log Message:
-----------
Conform to aXSL changes: Move the text token flow interfaces from axsl-orthography to axsl-fotree.
Modified Paths:
--------------
trunk/foray/foray-areatree/src/main/java/org/foray/area/AbstractAncestralInlineArea.java
trunk/foray/foray-areatree/src/main/java/org/foray/area/LineArea4a.java
trunk/foray/foray-areatree/src/main/java/org/foray/area/TextAreaWords.java
trunk/foray/foray-content/src/main/java/org/foray/content/TextTokensContent4a.java
trunk/foray/foray-fotree/src/main/java/org/foray/fotree/fo/obj/FoTextWords4a.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/MutableTokenFlowLocation.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/TokenFlow4a.java
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/TokenFlow4aTests.java
trunk/foray/foray-pioneer/src/main/java/org/foray/pioneer/FoTextWordsPnr.java
Modified: trunk/foray/foray-areatree/src/main/java/org/foray/area/AbstractAncestralInlineArea.java
===================================================================
--- trunk/foray/foray-areatree/src/main/java/org/foray/area/AbstractAncestralInlineArea.java 2023-10-02 20:42:41 UTC (rev 13299)
+++ trunk/foray/foray-areatree/src/main/java/org/foray/area/AbstractAncestralInlineArea.java 2023-10-02 22:55:54 UTC (rev 13300)
@@ -46,8 +46,8 @@
import org.axsl.fotree.fo.PageNumberCitationLast;
import org.axsl.fotree.fo.RetrieveMarker;
import org.axsl.fotree.fo.ScalingValueCitation;
+import org.axsl.fotree.text.TextTokenFlowLocation;
import org.axsl.galley.GlyphAreaSequenceG5;
-import org.axsl.orthography.TextTokenFlowLocation;
import java.math.BigDecimal;
import java.util.ArrayList;
Modified: trunk/foray/foray-areatree/src/main/java/org/foray/area/LineArea4a.java
===================================================================
--- trunk/foray/foray-areatree/src/main/java/org/foray/area/LineArea4a.java 2023-10-02 20:42:41 UTC (rev 13299)
+++ trunk/foray/foray-areatree/src/main/java/org/foray/area/LineArea4a.java 2023-10-02 22:55:54 UTC (rev 13300)
@@ -53,10 +53,10 @@
import org.axsl.fotree.fo.RetrieveMarker;
import org.axsl.fotree.fo.ScalingValueCitation;
import org.axsl.fotree.role.NormalBlockAreaGenerator;
+import org.axsl.fotree.text.TextTokenFlowLocation;
import org.axsl.galley.GlyphAreaSequenceG5;
import org.axsl.galley.render.GalleyVisitor;
import org.axsl.galley.render.GalleyVisitorException;
-import org.axsl.orthography.TextTokenFlowLocation;
import org.axsl.value.Conditionality;
import org.axsl.value.LineStackingStrategy;
import org.axsl.value.RelativeAxis;
Modified: trunk/foray/foray-areatree/src/main/java/org/foray/area/TextAreaWords.java
===================================================================
--- trunk/foray/foray-areatree/src/main/java/org/foray/area/TextAreaWords.java 2023-10-02 20:42:41 UTC (rev 13299)
+++ trunk/foray/foray-areatree/src/main/java/org/foray/area/TextAreaWords.java 2023-10-02 22:55:54 UTC (rev 13300)
@@ -32,7 +32,7 @@
import org.axsl.fotree.fo.FoTextWords;
import org.axsl.fotree.text.FoTextTokenFlow;
-import org.axsl.orthography.TextTokenFlowLocation;
+import org.axsl.fotree.text.TextTokenFlowLocation;
import org.axsl.value.group.TextModifiers;
import java.math.BigDecimal;
Modified: trunk/foray/foray-content/src/main/java/org/foray/content/TextTokensContent4a.java
===================================================================
--- trunk/foray/foray-content/src/main/java/org/foray/content/TextTokensContent4a.java 2023-10-02 20:42:41 UTC (rev 13299)
+++ trunk/foray/foray-content/src/main/java/org/foray/content/TextTokensContent4a.java 2023-10-02 22:55:54 UTC (rev 13300)
@@ -36,12 +36,12 @@
import org.axsl.fotree.FoContext;
import org.axsl.fotree.fo.FoTextWords;
import org.axsl.fotree.text.FoTextTokenFlow;
+import org.axsl.fotree.text.TextTokenFlowLocation;
import org.axsl.kp.KpContext;
import org.axsl.kp.KpLeaf;
import org.axsl.kp.KpLeafIterator;
import org.axsl.kp.KpNode;
import org.axsl.orthography.Orthography;
-import org.axsl.orthography.TextTokenFlowLocation;
/**
* Content wrapper for {@link FoTextWords}.
Modified: trunk/foray/foray-fotree/src/main/java/org/foray/fotree/fo/obj/FoTextWords4a.java
===================================================================
--- trunk/foray/foray-fotree/src/main/java/org/foray/fotree/fo/obj/FoTextWords4a.java 2023-10-02 20:42:41 UTC (rev 13299)
+++ trunk/foray/foray-fotree/src/main/java/org/foray/fotree/fo/obj/FoTextWords4a.java 2023-10-02 22:55:54 UTC (rev 13300)
@@ -37,6 +37,7 @@
import org.axsl.fotree.text.FoOrthographyServer;
import org.axsl.fotree.text.FoTextToken;
import org.axsl.fotree.text.FoTextTokenFlow;
+import org.axsl.fotree.text.FoWhitespace;
import org.axsl.kp.KpBranch;
import org.axsl.kp.KpContext;
import org.axsl.kp.KpLeaf;
@@ -43,7 +44,6 @@
import org.axsl.kp.KpLeafIterator;
import org.axsl.kp.KpNode;
import org.axsl.orthography.OrthographyException;
-import org.axsl.orthography.Whitespace;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -91,7 +91,7 @@
public boolean isAllWhiteSpace() {
for (int index = 0; index < this.tokenFlow.qtyTokens(); index ++) {
final FoTextToken token = this.tokenFlow.tokenAt(index);
- if (! (token instanceof Whitespace)) {
+ if (! (token instanceof FoWhitespace)) {
return false;
}
}
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/MutableTokenFlowLocation.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/MutableTokenFlowLocation.java 2023-10-02 20:42:41 UTC (rev 13299)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/MutableTokenFlowLocation.java 2023-10-02 22:55:54 UTC (rev 13300)
@@ -30,8 +30,7 @@
import org.foray.common.ComparablePlus;
-import org.axsl.orthography.TextTokenFlow;
-import org.axsl.orthography.TextTokenFlowLocation;
+import org.axsl.fotree.text.TextTokenFlowLocation;
/**
* A mutable version of {@link TextTokenFlowLocation}, useful when iterating locations.
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/TokenFlow4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/TokenFlow4a.java 2023-10-02 20:42:41 UTC (rev 13299)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/TokenFlow4a.java 2023-10-02 22:55:54 UTC (rev 13300)
@@ -31,13 +31,13 @@
import org.axsl.constants.PrimitiveConstants;
import org.axsl.fotree.text.FoTextToken;
import org.axsl.fotree.text.FoTextTokenFlow;
+import org.axsl.fotree.text.FoWhitespace;
import org.axsl.fotree.text.FoWord;
import org.axsl.fotree.text.FoWordSegment;
+import org.axsl.fotree.text.TextTokenFlowLocation;
import org.axsl.kp.KpContext;
import org.axsl.kp.KpLeafIterator;
import org.axsl.kp.KpNode;
-import org.axsl.orthography.TextTokenFlowLocation;
-import org.axsl.orthography.Whitespace;
import org.axsl.value.WhiteSpaceTreatment;
import org.axsl.value.group.TextModifiers;
@@ -255,7 +255,7 @@
*/
private int relevantWhitespaceBefore(final int tokenIndex) {
final FoTextToken token = tokenAt(tokenIndex);
- if (! (token instanceof Whitespace)) {
+ if (! (token instanceof FoWhitespace)) {
return -1;
}
if (tokenIndex == 0) {
@@ -262,8 +262,8 @@
return -1;
}
final FoTextToken previousToken = tokenAt(tokenIndex - 1);
- if (previousToken instanceof Whitespace) {
- final Whitespace whitespace = (Whitespace) previousToken;
+ if (previousToken instanceof FoWhitespace) {
+ final FoWhitespace whitespace = (FoWhitespace) previousToken;
return whitespace.charAt(whitespace.length() - 1);
}
/* The previous token is not whitespace. */
@@ -281,7 +281,7 @@
*/
private int relevantWhitespaceAfter(final int tokenIndex) {
final FoTextToken token = tokenAt(tokenIndex);
- if (! (token instanceof Whitespace)) {
+ if (! (token instanceof FoWhitespace)) {
return -1;
}
if (tokenIndex == qtyTokens() - 1) {
@@ -288,8 +288,8 @@
return -1;
}
final FoTextToken nextToken = tokenAt(tokenIndex + 1);
- if (nextToken instanceof Whitespace) {
- final Whitespace whitespace = (Whitespace) nextToken;
+ if (nextToken instanceof FoWhitespace) {
+ final FoWhitespace whitespace = (FoWhitespace) nextToken;
return whitespace.charAt(0);
}
/* The next token is not whitespace. */
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/TokenFlow4aTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/TokenFlow4aTests.java 2023-10-02 20:42:41 UTC (rev 13299)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/TokenFlow4aTests.java 2023-10-02 22:55:54 UTC (rev 13300)
@@ -28,7 +28,7 @@
package org.foray.orthography;
-import org.axsl.orthography.TextTokenFlowLocation;
+import org.axsl.fotree.text.TextTokenFlowLocation;
import org.axsl.unicode.block.U0000_Basic_Latin;
import org.axsl.value.LinefeedTreatment;
import org.axsl.value.TextTransform;
Modified: trunk/foray/foray-pioneer/src/main/java/org/foray/pioneer/FoTextWordsPnr.java
===================================================================
--- trunk/foray/foray-pioneer/src/main/java/org/foray/pioneer/FoTextWordsPnr.java 2023-10-02 20:42:41 UTC (rev 13299)
+++ trunk/foray/foray-pioneer/src/main/java/org/foray/pioneer/FoTextWordsPnr.java 2023-10-02 22:55:54 UTC (rev 13300)
@@ -36,11 +36,11 @@
import org.axsl.area.NormalBlockArea;
import org.axsl.content.TextTokensContent;
import org.axsl.fotree.fo.FoTextWords;
+import org.axsl.fotree.text.TextTokenFlowLocation;
import org.axsl.kp.KpLeafIterator;
import org.axsl.kp.KpResult;
import org.axsl.kp.KpUserAgent;
import org.axsl.linebreak.LineBreaker;
-import org.axsl.orthography.TextTokenFlowLocation;
/**
* Pioneer Layout for {@link FoTextWords}.
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-10-02 20:42:45
|
Revision: 13299
http://sourceforge.net/p/foray/code/13299
Author: victormote
Date: 2023-10-02 20:42:41 +0000 (Mon, 02 Oct 2023)
Log Message:
-----------
Conform to aXSL change: Consolidate tokenization concepts into the Lexer.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyServer4a.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyServer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyServer4a.java 2023-10-02 20:03:21 UTC (rev 13298)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/OrthographyServer4a.java 2023-10-02 20:42:41 UTC (rev 13299)
@@ -388,10 +388,7 @@
}
}
- /**
- * Returns the text tokenizer.
- * @return The text tokenizer.
- */
+ @Override
public Lexer4a getLexer() {
if (this.lexer == null) {
this.lexer = new LexerJavaBreakIterator(this);
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-10-02 20:03:24
|
Revision: 13298
http://sourceforge.net/p/foray/code/13298
Author: victormote
Date: 2023-10-02 20:03:21 +0000 (Mon, 02 Oct 2023)
Log Message:
-----------
Conform to aXSL changes: Remove org.axsl.orthography.optional package, moving its interfaces up a level.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/DerivativePattern.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/LexerIcu4jBreakIterator.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/LexerJavaBreakIterator.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Orthography4a.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/SegmentDictionary.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/SimpleDictionary.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/WordWrapperFactory.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/WordChecker.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPast1WordFactory.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPlural1WordFactory.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPlural2WordFactory.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPossessive1WordFactory.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPossessive2WordFactory.java
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPast1WordFactoryTests.java
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPlural1WordFactoryTests.java
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPlural2WordFactoryTests.java
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPossessive1WordFactoryTests.java
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPossessive2WordFactoryTests.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/DerivativePattern.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/DerivativePattern.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/DerivativePattern.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -28,8 +28,8 @@
package org.foray.orthography;
+import org.axsl.orthography.Dictionary;
import org.axsl.orthography.Word;
-import org.axsl.orthography.optional.Dictionary;
import java.util.List;
import java.util.regex.Pattern;
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -34,7 +34,7 @@
import org.foray.primitive.StringUtils;
import org.axsl.i18n.WritingSystem;
-import org.axsl.orthography.optional.Lexer;
+import org.axsl.orthography.Lexer;
import org.axsl.primitive.sequence.IntSequence;
import org.axsl.unicode.block.U0000_Basic_Latin;
import org.axsl.unicode.block.U0080_Latin_1_Supplement;
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/LexerIcu4jBreakIterator.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/LexerIcu4jBreakIterator.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/LexerIcu4jBreakIterator.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -32,7 +32,7 @@
import org.foray.primitive.sequence.IntArrayBuilder;
import org.axsl.i18n.WritingSystem;
-import org.axsl.orthography.optional.Lexer;
+import org.axsl.orthography.Lexer;
import org.axsl.primitive.sequence.IntSequence;
import com.ibm.icu.text.BreakIterator;
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/LexerJavaBreakIterator.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/LexerJavaBreakIterator.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/LexerJavaBreakIterator.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -31,7 +31,7 @@
import org.foray.primitive.sequence.IntArrayBuilder;
import org.axsl.i18n.WritingSystem;
-import org.axsl.orthography.optional.Lexer;
+import org.axsl.orthography.Lexer;
import org.axsl.primitive.sequence.IntSequence;
import java.text.BreakIterator;
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Orthography4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Orthography4a.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Orthography4a.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -37,11 +37,11 @@
import org.foray.primitive.CharacterUtils;
import org.axsl.fotree.text.FoOrthography;
+import org.axsl.orthography.Dictionary;
+import org.axsl.orthography.Lexer;
+import org.axsl.orthography.Lexer.TokenType;
import org.axsl.orthography.OrthographyException;
import org.axsl.orthography.Word;
-import org.axsl.orthography.optional.Dictionary;
-import org.axsl.orthography.optional.Lexer;
-import org.axsl.orthography.optional.Lexer.TokenType;
import java.util.ArrayList;
import java.util.List;
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/SegmentDictionary.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/SegmentDictionary.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/SegmentDictionary.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -32,9 +32,9 @@
import org.foray.common.i18n.WritingSystem4a;
import org.axsl.i18n.WritingSystem;
+import org.axsl.orthography.Dictionary;
import org.axsl.orthography.Word.PartOfSpeech;
import org.axsl.orthography.Word.PosQualifier;
-import org.axsl.orthography.optional.Dictionary;
import java.util.Arrays;
import java.util.HashMap;
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/SimpleDictionary.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/SimpleDictionary.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/SimpleDictionary.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -31,10 +31,10 @@
import org.foray.common.i18n.WritingSystem4a;
import org.axsl.i18n.WritingSystem;
+import org.axsl.orthography.Dictionary;
import org.axsl.orthography.Word;
import org.axsl.orthography.Word.PartOfSpeech;
import org.axsl.orthography.Word.PosQualifier;
-import org.axsl.orthography.optional.Dictionary;
import java.util.Collections;
import java.util.HashMap;
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/WordWrapperFactory.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/WordWrapperFactory.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/WordWrapperFactory.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -28,7 +28,7 @@
package org.foray.orthography;
-import org.axsl.orthography.optional.Dictionary;
+import org.axsl.orthography.Dictionary;
/**
* Factory that knows how to create an instance of a {@link WordWrapper}.
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -40,11 +40,11 @@
import org.foray.xml.dtd.DtdAttribute;
import org.axsl.i18n.WritingSystem;
+import org.axsl.orthography.Dictionary;
+import org.axsl.orthography.Lexer;
+import org.axsl.orthography.Lexer.TokenType;
import org.axsl.orthography.Orthography;
import org.axsl.orthography.OrthographyException;
-import org.axsl.orthography.optional.Dictionary;
-import org.axsl.orthography.optional.Lexer;
-import org.axsl.orthography.optional.Lexer.TokenType;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/WordChecker.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/WordChecker.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/WordChecker.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -34,8 +34,8 @@
import org.foray.orthography.OrthographyServerConfig;
import org.foray.orthography.SegmentDictionary;
+import org.axsl.orthography.Dictionary;
import org.axsl.orthography.OrthographyException;
-import org.axsl.orthography.optional.Dictionary;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPast1WordFactory.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPast1WordFactory.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPast1WordFactory.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -32,7 +32,7 @@
import org.foray.orthography.WordWrapperFactory;
import org.axsl.fotree.text.FoWord;
-import org.axsl.orthography.optional.Dictionary;
+import org.axsl.orthography.Dictionary;
/**
* Factory class for {@link LatinPast1Word}.
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPlural1WordFactory.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPlural1WordFactory.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPlural1WordFactory.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -32,7 +32,7 @@
import org.foray.orthography.WordWrapperFactory;
import org.axsl.fotree.text.FoWord;
-import org.axsl.orthography.optional.Dictionary;
+import org.axsl.orthography.Dictionary;
/**
* Factory class for {@link LatinPlural1Word}.
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPlural2WordFactory.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPlural2WordFactory.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPlural2WordFactory.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -32,7 +32,7 @@
import org.foray.orthography.WordWrapperFactory;
import org.axsl.fotree.text.FoWord;
-import org.axsl.orthography.optional.Dictionary;
+import org.axsl.orthography.Dictionary;
/**
* Factory class for {@link LatinPlural2Word}.
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPossessive1WordFactory.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPossessive1WordFactory.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPossessive1WordFactory.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -32,7 +32,7 @@
import org.foray.orthography.WordWrapperFactory;
import org.axsl.fotree.text.FoWord;
-import org.axsl.orthography.optional.Dictionary;
+import org.axsl.orthography.Dictionary;
/**
* Factory class for {@link LatinPossessive1Word}.
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPossessive2WordFactory.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPossessive2WordFactory.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/wrapper/LatinPossessive2WordFactory.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -32,7 +32,7 @@
import org.foray.orthography.WordWrapperFactory;
import org.axsl.fotree.text.FoWord;
-import org.axsl.orthography.optional.Dictionary;
+import org.axsl.orthography.Dictionary;
/**
* Factory class for {@link LatinPossessive2Word}.
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -31,10 +31,10 @@
import org.foray.common.i18n.WritingSystem4a;
import org.axsl.i18n.WritingSystem;
+import org.axsl.orthography.Lexer;
+import org.axsl.orthography.Lexer.Token;
+import org.axsl.orthography.Lexer.TokenType;
import org.axsl.orthography.OrthographyException;
-import org.axsl.orthography.optional.Lexer;
-import org.axsl.orthography.optional.Lexer.Token;
-import org.axsl.orthography.optional.Lexer.TokenType;
import org.axsl.unicode.block.U2000_General_Punctuation;
import static org.junit.jupiter.api.Assertions.assertEquals;
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPast1WordFactoryTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPast1WordFactoryTests.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPast1WordFactoryTests.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -31,7 +31,7 @@
import org.foray.common.primitive.CharSequenceUtils;
import org.foray.orthography.StringWordTests;
-import org.axsl.orthography.optional.Dictionary;
+import org.axsl.orthography.Dictionary;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPlural1WordFactoryTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPlural1WordFactoryTests.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPlural1WordFactoryTests.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -31,7 +31,7 @@
import org.foray.common.primitive.CharSequenceUtils;
import org.foray.orthography.StringWordTests;
-import org.axsl.orthography.optional.Dictionary;
+import org.axsl.orthography.Dictionary;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPlural2WordFactoryTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPlural2WordFactoryTests.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPlural2WordFactoryTests.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -31,7 +31,7 @@
import org.foray.common.primitive.CharSequenceUtils;
import org.foray.orthography.StringWordTests;
-import org.axsl.orthography.optional.Dictionary;
+import org.axsl.orthography.Dictionary;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPossessive1WordFactoryTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPossessive1WordFactoryTests.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPossessive1WordFactoryTests.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -31,7 +31,7 @@
import org.foray.common.primitive.CharSequenceUtils;
import org.foray.orthography.StringWordTests;
-import org.axsl.orthography.optional.Dictionary;
+import org.axsl.orthography.Dictionary;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPossessive2WordFactoryTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPossessive2WordFactoryTests.java 2023-10-02 17:01:06 UTC (rev 13297)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/wrapper/LatinPossessive2WordFactoryTests.java 2023-10-02 20:03:21 UTC (rev 13298)
@@ -31,7 +31,7 @@
import org.foray.common.primitive.CharSequenceUtils;
import org.foray.orthography.StringWordTests;
-import org.axsl.orthography.optional.Dictionary;
+import org.axsl.orthography.Dictionary;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-10-02 17:01:12
|
Revision: 13297
http://sourceforge.net/p/foray/code/13297
Author: victormote
Date: 2023-10-02 17:01:06 +0000 (Mon, 02 Oct 2023)
Log Message:
-----------
Since there is only one lexer now, cache it as an instance variable.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java 2023-09-29 17:10:37 UTC (rev 13296)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java 2023-10-02 17:01:06 UTC (rev 13297)
@@ -40,6 +40,7 @@
import org.foray.xml.dtd.DtdAttribute;
import org.axsl.i18n.WritingSystem;
+import org.axsl.orthography.Orthography;
import org.axsl.orthography.OrthographyException;
import org.axsl.orthography.optional.Dictionary;
import org.axsl.orthography.optional.Lexer;
@@ -158,6 +159,9 @@
/** The Orthography server. */
private OrthographyServer4a server;
+ /** The lexer. */
+ private Lexer4a lexer;
+
/** The list of ad-hoc dictionaries, usually parsed from the command-line. */
private List<Dictionary> adhocDictionaries = new ArrayList<Dictionary>();
@@ -192,6 +196,7 @@
final OrthographyServerConfig serverConfig = new OrthographyServerConfig();
serverConfig.setOrthographyConfigurationLocation(orthographyConfigPath);
this.server = new OrthographyServer4a(serverConfig);
+ this.lexer = this.server.getLexer();
if (adhocDictionaryPaths != null) {
for (URL adhocDictionaryPath : adhocDictionaryPaths) {
@@ -240,7 +245,6 @@
@Override
public void startElement(final String uri, final String localName, final String qName, final Attributes attributes)
throws SAXException {
- final Lexer4a lexer = this.server.getLexer();
final WritingSystem4a oldWritingSystem = getCurrentWritingSystem();
if ("word".equals(localName)) {
@@ -270,7 +274,6 @@
if (this.elementStack.size() < 1) {
throw new SAXException("Element stack is empty but should not be.");
}
- final Lexer4a lexer = this.server.getLexer();
final WritingSystem4a oldWritingSystem = getCurrentWritingSystem();
final Element element = this.elementStack.pop();
@@ -307,8 +310,7 @@
* @param location The location of the text in the original document.
*/
private void checkWords(final String location) {
- final Lexer4a lexer = this.server.getLexer();
- lexer.lock();
+ this.lexer.lock();
/* Writing system should never be null, but orthography could be. */
WritingSystem lastWritingSystem = null;
@@ -349,8 +351,7 @@
return;
} else {
final Lexer.Token savedToken = token.getImmutableCopy();
- final Lexer4a lexer = this.server.getLexer();
- if (lexer.hasNext()) {
+ if (this.lexer.hasNext()) {
final Lexer.Token nextToken = lexer.peekNext();
if (nextToken.getTokenType() == Lexer.TokenType.AMBIGUOUS_TRAILING_PUNCTUATION) {
final String testWord = savedToken.getText().toString() + nextToken.getText().toString();
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-29 17:10:40
|
Revision: 13296
http://sourceforge.net/p/foray/code/13296
Author: victormote
Date: 2023-09-29 17:10:37 +0000 (Fri, 29 Sep 2023)
Log Message:
-----------
Dictionary and orthography improvements.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml
trunk/foray/foray-orthography/src/main/data/dictionaries/fre-Latn-ZZZ.dict.xml
trunk/foray/foray-orthography/src/main/data/dictionaries/ita-Latn-ZZZ.dict.xml
trunk/foray/foray-orthography/src/main/data/dictionaries/lat-Latn-ZZZ.dict.xml
trunk/foray/foray-orthography/src/main/data/orthographies/foray-orthography-config.xml
Added Paths:
-----------
trunk/foray/foray-orthography/src/main/data/dictionaries/pol-Latn-ZZZ.dict.xml
Modified: trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml 2023-09-29 00:19:34 UTC (rev 13295)
+++ trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml 2023-09-29 17:10:37 UTC (rev 13296)
@@ -15069,7 +15069,7 @@
<phrase><t>Ben Bel-la</t></phrase>
<w><t>Ben-bow</t></w>
<w><t>Ben-brook</t></w>
-<w><t>bench</t></w>
+<w><t>bench</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>bench-er</t></w>
<w><t>bench-less</t></w>
<w><t>Bench-ley</t></w>
@@ -25363,6 +25363,7 @@
<w><t>CGM</t></w>
<w><t>CGS</t></w>
<phrase><t>cgs u-nits</t></phrase>
+<w><t>ch.</t><abbrev referenced-word="chapter"/></w>
<w><t>chab-a-zite</t></w>
<w><t>Chab-lis</t></w>
<w><t>chab-lis</t></w>
@@ -36099,6 +36100,7 @@
<w><t>CSIRO</t></w>
<w><t>CSM</t></w>
<w><t>CST</t></w>
+<w><t>Ct.</t><abbrev referenced-word="Court"/></w>
<w><t>Cte-a-tus</t></w>
<w><t>cte-nid-i-a</t></w>
<w><t>cte-nid-i-al</t></w>
@@ -41761,7 +41763,7 @@
<phrase><t>di-es non</t></phrase>
<w><t>die-stock</t></w>
<w><t>di-e-strus</t></w>
-<w><t>di-et</t></w>
+<w><t>di-et</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>di-e-tar-ies</t></w>
<w><t>di-e-tar-y</t></w>
<w><t>di-et-ed</t></w>
@@ -48303,7 +48305,7 @@
<w><t>e-ma-ci-a-tion</t></w>
<w><t>em-a-gram</t></w>
<w><t>em-a-nant</t></w>
-<w><t>em-a-nate</t></w>
+<w><t>em-a-nate</t><verb><regular-root/></verb></w>
<w><t>em-a-nat-ed</t></w>
<w><t>em-a-nat-ing</t></w>
<w><t>em-a-na-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
@@ -51271,7 +51273,7 @@
<w><t>E-ta</t></w>
<w><t>e-ta</t></w>
<w><t>e-tae-ri-o</t></w>
-<w><t>et al.</t></w>
+<w><t>et al.</t><abbrev referenced-word="Latin et alii (masculine) or et aliæ (feminine) = 'and others'"></abbrev></w>
<w><t>et-a-lon</t></w>
<w><t>e-ta-lon</t></w>
<w><t>et-a-min</t></w>
@@ -54974,7 +54976,7 @@
<w><t>fer-rule</t></w>
<w><t>fer-ruled</t></w>
<w><t>fer-rul-ing</t></w>
-<w><t>fer-ry</t></w>
+<w><t>fer-ry</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>fer-ry-boat</t></w>
<w><t>fer-ry-ing</t></w>
<w><t>fer-ry-man</t></w>
@@ -71934,7 +71936,7 @@
<w><t>hunt-ed</t></w>
<w><t>hunt-ed-ly</t></w>
<w><t>hunt-er</t></w>
-<w><t>Hun-ter</t></w>
+<w><t>Hun-ter</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>hunt-er-like</t></w>
<phrase><t>hun-ter’s moon</t></phrase>
<w><t>hunt-ing</t></w>
@@ -74885,7 +74887,7 @@
<w><t>im-por-tant-ly</t></w>
<w><t>im-por-ta-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>im-por-tee</t></w>
-<w><t>im-port-er</t></w>
+<w><t>im-port-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>im-por-tu-na-cy</t></w>
<w><t>im-por-tu-nate</t></w>
<w><t>im-por-tu-nate-ly</t></w>
@@ -80681,7 +80683,7 @@
<w><t>jack-shaft</t></w>
<w><t>jack-smelt</t></w>
<w><t>jack-snipe</t></w>
-<w><t>Jack-son</t></w>
+<w><t>Jack-son</t><noun><convertible-to-possessive/></noun></w>
<w><t>Jack-so-ni-an</t></w>
<w><t>Jack-son-ism</t></w>
<w><t>Jack-son-ville</t></w>
@@ -82210,7 +82212,8 @@
<w><t>ju-rig-ging</t></w>
<w><t>ju-ris-con-sult</t></w>
<w><t>ju-ris-dic-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
-<w><t>ju-ris-dic-tion-al-ly</t></w>
+<w><t>ju-ris-dic-tion-al</t><adjective></adjective></w>
+<w><t>ju-ris-dic-tion-al-ly</t><adverb/></w>
<w><t>ju-ris-dic-tive</t></w>
<w><t>jurisp</t></w>
<w><t>ju-ris-pru-dence</t></w>
@@ -87482,7 +87485,7 @@
<phrase><t>Lib-er-al Par-ty</t></phrase>
<phrase><t>lib-er-al stud-ies</t></phrase>
<phrase><t>Lib-er-al Un-ion-ist</t></phrase>
-<w><t>lib-er-ate</t></w>
+<w><t>lib-er-ate</t><verb><regular-root/></verb></w>
<w><t>lib-er-at-ed</t></w>
<w><t>lib-er-at-ing</t></w>
<w><t>lib-er-a-tion</t></w>
@@ -113456,7 +113459,7 @@
<w><t>o-ver-ro-man-ti-ciz-ing</t></w>
<w><t>o-ver-rough</t></w>
<w><t>o-ver-rude</t></w>
-<w><t>o-ver-rule</t></w>
+<w><t>o-ver-rule</t><verb><regular-root/></verb></w>
<w><t>o-ver-ruled</t></w>
<w><t>o-ver-rul-er</t></w>
<w><t>o-ver-rul-ing</t></w>
@@ -118411,7 +118414,7 @@
<phrase><t>Pe-ter Pan col-lar</t></phrase>
<phrase><t>Pe-ter pence</t></phrase>
<phrase><t>Pe-ter Prin-ci-ple</t></phrase>
-<w><t>Pe-ters</t></w>
+<w><t>Pe-ters</t><noun><convertible-to-possessive/></noun></w>
<w><t>Pe-ters-burg</t></w>
<w><t>pe-ter-sham</t></w>
<w><t>Pe-ter-son</t></w>
@@ -163950,7 +163953,7 @@
<phrase><t>tor-sion bar</t></phrase>
<w><t>torsk</t></w>
<w><t>tor-so</t></w>
-<w><t>tort</t></w>
+<w><t>tort</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>torte</t></w>
<w><t>tor-teau</t></w>
<w><t>Tor-te-lier</t></w>
@@ -167116,6 +167119,7 @@
<w><t>tzi-tzith</t></w>
<w><t>Tzu=po</t></w>
<w><t>U/S</t></w>
+<w><t>U.S.</t><abbrev referenced-word="United States"/></w>
<w><t>UAM</t></w>
<w><t>UAR</t></w>
<w><t>UART</t></w>
@@ -183377,6 +183381,7 @@
<w><t>ways</t></w>
<w><t>way-side</t></w>
<w><t>way-ward</t></w>
+<w><t>way-ward-ness</t><noun/></w>
<w><t>way-worn</t></w>
<w><t>Way-za-ta</t></w>
<w><t>wayz-goose</t></w>
Modified: trunk/foray/foray-orthography/src/main/data/dictionaries/fre-Latn-ZZZ.dict.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/dictionaries/fre-Latn-ZZZ.dict.xml 2023-09-29 00:19:34 UTC (rev 13295)
+++ trunk/foray/foray-orthography/src/main/data/dictionaries/fre-Latn-ZZZ.dict.xml 2023-09-29 17:10:37 UTC (rev 13296)
@@ -10,6 +10,11 @@
hard-hyphen-char="=" soft-hyphen-char="-">
+<!--
+Dictionary of French words.
+-->
+
+
<w><t>blanche</t></w>
<w><t>bour-geois</t></w>
<w><t>carte</t></w>
Modified: trunk/foray/foray-orthography/src/main/data/dictionaries/ita-Latn-ZZZ.dict.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/dictionaries/ita-Latn-ZZZ.dict.xml 2023-09-29 00:19:34 UTC (rev 13295)
+++ trunk/foray/foray-orthography/src/main/data/dictionaries/ita-Latn-ZZZ.dict.xml 2023-09-29 17:10:37 UTC (rev 13296)
@@ -10,6 +10,7 @@
hard-hyphen-char="=" soft-hyphen-char="-">
<!--
+Dictionary of Italian words.
-->
<w><t>fi-na-le</t><noun></noun></w>
Modified: trunk/foray/foray-orthography/src/main/data/dictionaries/lat-Latn-ZZZ.dict.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/dictionaries/lat-Latn-ZZZ.dict.xml 2023-09-29 00:19:34 UTC (rev 13295)
+++ trunk/foray/foray-orthography/src/main/data/dictionaries/lat-Latn-ZZZ.dict.xml 2023-09-29 17:10:37 UTC (rev 13296)
@@ -10,6 +10,7 @@
hard-hyphen-char="=" soft-hyphen-char="-">
<!--
+Dictionary of Latin words.
-->
<w><t>a</t></w>
@@ -20,6 +21,7 @@
<w><t>am-or</t></w>
<w><t>an-no</t></w>
<w><t>ann-um</t></w>
+<w><t>an-te</t></w>
<w><t>a-pel-la</t></w>
<w><t>ar-gu-ment-a</t></w>
<w><t>ar-gu-ment-um</t></w>
@@ -28,6 +30,7 @@
<w><t>bap-tism-a</t></w>
<w><t>bel-li</t></w>
<w><t>Ben-e-dic-tus</t><noun/></w>
+<w><t>be-ne-pla-ci-to</t></w>
<w><t>bo-na</t></w>
<w><t>cæ-ter-is</t></w>
<w><t>cap-i-te</t></w>
@@ -62,6 +65,8 @@
<w><t>Dom-i-ni</t></w>
<w><t>dra-ma-tis</t></w>
<w><t>dul-ci-a</t></w>
+<w><t>du-ran-te</t></w>
+<w><t>be-ne-pla-ci-to</t></w>
<w><t>e.g.</t><abbrev referenced-word="id est"/><comment>Latin "that is."</comment></w>
<w><t>e. g.</t><abbrev referenced-word="id est"/><comment>Latin "that is."</comment></w>
<w><t>e-go</t></w>
@@ -71,6 +76,7 @@
<w><t>etc.</t><abbrev referenced-word="et cetera"/></w>
<w><t>e-van-gel-i-ar-i-um</t></w>
<w><t>ex</t></w>
+<w><t>ex-trem-um</t></w>
<w><t>fa-cias</t></w>
<w><t>fac-to</t></w>
<w><t>fa-to</t></w>
@@ -82,11 +88,13 @@
<w><t>gen-er-a-ti-o</t></w>
<w><t>glo-ri-a</t></w>
<w><t>ha-be-as</t></w>
+<w><t>hac</t></w>
<w><t>hoc</t></w>
<w><t>ho-mi-nem</t></w>
<w><t>hy-dro-ma-ni-a</t></w>
-<w><t>i.e</t><abbrev referenced-word="id est"/></w>
+<w><t>i.e.</t><abbrev referenced-word="id est"/></w>
<w><t>i. e.</t><abbrev referenced-word="id est"/><comment>Contains embedded non-breaking space.</comment></w>
+<w><t>ibid.</t><abbrev referenced-word="ibidem"/></w>
<w><t>ig-nis</t></w>
<w><t>im-mer-go</t></w>
<w><t>im-per-i-i</t></w>
@@ -99,9 +107,11 @@
<w><t>ju-ris</t></w>
<w><t>jus</t></w>
<w><t>li-ber-or-um</t></w>
+<w><t>li-ber-um</t></w>
<w><t>lin-qui-mus</t></w>
<w><t>lo-co</t></w>
<w><t>me-um</t></w>
+<w><t>Mons Sacer</t></w>
<w><t>nas-ci-tur</t></w>
<w><t>ne</t></w>
<w><t>nem</t><abbrev referenced-word="nemine"/></w>
@@ -173,6 +183,7 @@
follows is named as authority for what precedes.</comment></w>
<w><t>to-to</t></w>
<w><t>trans-eunte</t></w>
+<w><t>tri-bu-no-rum</t></w>
<w><t>tri-um</t></w>
<w><t>tu-um</t></w>
<w><t>ul-ti-ma</t></w>
Added: trunk/foray/foray-orthography/src/main/data/dictionaries/pol-Latn-ZZZ.dict.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/dictionaries/pol-Latn-ZZZ.dict.xml (rev 0)
+++ trunk/foray/foray-orthography/src/main/data/dictionaries/pol-Latn-ZZZ.dict.xml 2023-09-29 17:10:37 UTC (rev 13296)
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!DOCTYPE axsl-dictionary
+ PUBLIC "-//aXSL//DTD Dictionary V0.1//EN"
+ "http://www.axsl.org/dtds/0.1/en/axsl-dictionary.dtd">
+
+<axsl-dictionary
+ id="org.foray.pol.Latn.ZZZ"
+ language="pol" script="Latn"
+ hard-hyphen-char="=" soft-hyphen-char="-">
+
+<!--
+Dictionary of Polish words.
+-->
+
+
+<w><t>Nie</t></w>
+<w><t>Poz-wal-am</t></w>
+
+
+</axsl-dictionary>
Property changes on: trunk/foray/foray-orthography/src/main/data/dictionaries/pol-Latn-ZZZ.dict.xml
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+Author Date Id Rev
\ No newline at end of property
Modified: trunk/foray/foray-orthography/src/main/data/orthographies/foray-orthography-config.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/orthographies/foray-orthography-config.xml 2023-09-29 00:19:34 UTC (rev 13295)
+++ trunk/foray/foray-orthography/src/main/data/orthographies/foray-orthography-config.xml 2023-09-29 17:10:37 UTC (rev 13296)
@@ -334,6 +334,14 @@
</unparsed-dictionary>
</dictionary-resource>
+ <dictionary-resource id="org.foray.pol.Latn.ZZZ">
+ <unparsed-dictionary>
+ <dictionary-element>
+ <resource-location type="url">file:///C:/vic/foray/trunk/foray/foray-orthography/src/main/data/dictionaries/pol-Latn-ZZZ.dict.xml</resource-location>
+ </dictionary-element>
+ </unparsed-dictionary>
+ </dictionary-resource>
+
<hyphenation-patterns-resource id="hyph-patterns-eng">
<parsed-resource>
<resource-location type="classpath">/resources/org/foray/orthography/hyphPatterns/eng.jbso</resource-location>
@@ -398,4 +406,9 @@
<dictionary reference="org.foray.non.Latn.ZZZ"/>
</orthography>
+ <!-- Polish. -->
+ <orthography language-iso-3char="pol" script-iso-4char="Latn" country-iso-3char="ZZZ">
+ <dictionary reference="org.foray.pol.Latn.ZZZ"/>
+ </orthography>
+
</axsl-orthography-config>
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-29 00:19:37
|
Revision: 13295
http://sourceforge.net/p/foray/code/13295
Author: victormote
Date: 2023-09-29 00:19:34 +0000 (Fri, 29 Sep 2023)
Log Message:
-----------
Move creation of actual dictionary to the endElement method, to enable parsing of multiple dictionaries from one file.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/DictionaryParser.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/DictionaryParser.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/DictionaryParser.java 2023-09-28 17:13:09 UTC (rev 13294)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/DictionaryParser.java 2023-09-29 00:19:34 UTC (rev 13295)
@@ -97,12 +97,13 @@
/** The character that should actually be used in the word content as the hard hyphen character. */
private char actualHardHyphenChar = '-';
+ /** The current dictionary being parsed. (Not to be confused with {@link #currentDictionary}, the dictionary object
+ * that will eventually be returned to the client code). */
+ private DictionaryElement currentDictionaryElement;
+
/** The list of dictionaries that have been parsed by this parser. */
private List<SegmentDictionary> parsedDictionaries = new ArrayList<SegmentDictionary>();
- /** The current dictionary being parsed. */
- private DictionaryElement currentDictionary;
-
/** The current word content being parsed. */
private StringWordSegment[] currentSegments;
@@ -158,29 +159,6 @@
}
cleanup();
- final StringWordSegment[] uniqueWordSegments = new StringWordSegment[segmentSet.size()];
- segmentSet.toArray(uniqueWordSegments);
- Arrays.sort(uniqueWordSegments);
- final SegmentDictionary dictionary = new SegmentDictionary(this.currentDictionary.id,
- this.currentDictionary.imports, this.currentDictionary.writingSystem, uniqueWordSegments,
- this.wordMap.size());
-
- for (Map.Entry<String, StringWord> entry : this.wordMap.entrySet()) {
- dictionary.addWord(entry.getKey(), entry.getValue());
- }
- for (Map.Entry<String, List<StringWord>> entry : this.ambiguousWordMap.entrySet()) {
- final List<StringWord> list = entry.getValue();
- final SegmentDictionaryWord[] sdWords = new SegmentDictionaryWord[list.size()];
- for (int index = 0; index < list.size(); index ++) {
- final StringWord stringWord = list.get(index);
- sdWords[index] = new SegmentDictionaryWord(stringWord.getPartsOfSpeech(), dictionary, stringWord);
- }
- final AmbiguousWord<SegmentDictionaryWord> ambWord = new AmbiguousWord<SegmentDictionaryWord>(sdWords);
- dictionary.addAmbiguousWord(entry.getKey(), ambWord);
- }
-
- dictionary.optimize();
- this.parsedDictionaries.add(dictionary);
return this.parsedDictionaries;
}
@@ -299,26 +277,27 @@
}
case "word-group": break;
case "axsl-dictionary": {
- this.currentDictionary = new DictionaryElement();
+ this.currentDictionaryElement = new DictionaryElement();
- this.currentDictionary.id = attributes.getValue(StringUtils.EMPTY_STRING, "id");
+ this.currentDictionaryElement.id = attributes.getValue(StringUtils.EMPTY_STRING, "id");
final String language = attributes.getValue(StringUtils.EMPTY_STRING, "language");
final String country = attributes.getValue(StringUtils.EMPTY_STRING, "country");
final String script = attributes.getValue(StringUtils.EMPTY_STRING, "script");
- this.currentDictionary.writingSystem = WritingSystem4a.find(language, script, country);
- debugMessage("Begin dictionary word list parsing: " + this.currentDictionary.writingSystem.toString());
+ this.currentDictionaryElement.writingSystem = WritingSystem4a.find(language, script, country);
+ debugMessage("Begin dictionary word list parsing: " +
+ this.currentDictionaryElement.writingSystem.toString());
final String soft = attributes.getValue(StringUtils.EMPTY_STRING, "soft-hyphen-char");
if (soft.length() != 1) {
throw new SAXException("Attribute soft-hyphen-char must have exactly one char.");
}
- this.currentDictionary.softHyphenChar = soft.charAt(0);
+ this.currentDictionaryElement.softHyphenChar = soft.charAt(0);
final String hard = attributes.getValue(StringUtils.EMPTY_STRING, "hard-hyphen-char");
if (hard.length() != 1) {
throw new SAXException("Attribute hard-hyphen-char must have exactly one char.");
}
- this.currentDictionary.hardHyphenChar = hard.charAt(0);
- final Locale locale = this.currentDictionary.writingSystem.toLocale();
+ this.currentDictionaryElement.hardHyphenChar = hard.charAt(0);
+ final Locale locale = this.currentDictionaryElement.writingSystem.toLocale();
if (locale != null) {
this.collator = Collator.getInstance(locale);
this.collator.setDecomposition(Collator.FULL_DECOMPOSITION);
@@ -329,10 +308,10 @@
break;
}
case "import-dictionary": {
- this.currentDictionary.imports.add(attributes.getValue(StringUtils.EMPTY_STRING, "dictionary-id"));
+ this.currentDictionaryElement.imports.add(attributes.getValue(StringUtils.EMPTY_STRING, "dictionary-id"));
break;
}
- case "axsl-dictionaries": break;
+ case "axsl-dictionary-collection": break;
case "phrase": break;
case "vf": break;
case "lemma": break;
@@ -415,7 +394,7 @@
int inputLineIndex = 0;
while (inputLineIndex < inputLine.length()) {
final char theChar = inputLine.charAt(inputLineIndex);
- if (theChar == this.currentDictionary.softHyphenChar) {
+ if (theChar == this.currentDictionaryElement.softHyphenChar) {
if (builder.length() < 1) {
throw new SAXException("0-length syllable on line: " + getLocationString(getLocator()));
}
@@ -424,7 +403,7 @@
segmentSet.add(wordSegment);
builder.delete(0, builder.length());
} else {
- if (theChar == this.currentDictionary.hardHyphenChar) {
+ if (theChar == this.currentDictionaryElement.hardHyphenChar) {
builder.append(this.actualHardHyphenChar);
} else {
builder.append(theChar);
@@ -460,13 +439,37 @@
case "ordinal": break;
case "word-group": break;
case "axsl-dictionary": {
- debugMessage("End parsing for dictionary: " + this.currentDictionary.writingSystem.toString());
- debugMessage("Qty of unique word segments parsed: " + segmentSet.size());
+ final StringWordSegment[] uniqueWordSegments = new StringWordSegment[this.segmentSet.size()];
+ this.segmentSet.toArray(uniqueWordSegments);
+ Arrays.sort(uniqueWordSegments);
+ final SegmentDictionary dictionary = new SegmentDictionary(this.currentDictionaryElement.id,
+ this.currentDictionaryElement.imports, this.currentDictionaryElement.writingSystem,
+ uniqueWordSegments, this.wordMap.size());
+
+ for (Map.Entry<String, StringWord> entry : this.wordMap.entrySet()) {
+ dictionary.addWord(entry.getKey(), entry.getValue());
+ }
+ for (Map.Entry<String, List<StringWord>> entry : this.ambiguousWordMap.entrySet()) {
+ final List<StringWord> list = entry.getValue();
+ final SegmentDictionaryWord[] sdWords = new SegmentDictionaryWord[list.size()];
+ for (int index = 0; index < list.size(); index ++) {
+ final StringWord stringWord = list.get(index);
+ sdWords[index] = new SegmentDictionaryWord(stringWord.getPartsOfSpeech(), dictionary, stringWord);
+ }
+ final AmbiguousWord<SegmentDictionaryWord> ambWord = new AmbiguousWord<SegmentDictionaryWord>(sdWords);
+ dictionary.addAmbiguousWord(entry.getKey(), ambWord);
+ }
+
+ dictionary.optimize();
+ this.parsedDictionaries.add(dictionary);
+
+ debugMessage("End parsing for dictionary: " + this.currentDictionaryElement.writingSystem.toString());
+ debugMessage("Qty of unique word segments parsed: " + this.segmentSet.size());
debugMessage("Qty of words parsed: " + wordMap.size());
break;
}
+ case "axsl-dictionary-collection": break;
case "import-dictionary": break;
- case "axsl-dictionaries": break;
case "phrase": {
final StringWord word = new StringWord(this.currentPartsOfSpeech, this.currentSegments);
final String actualContent = word.getActualContent().toString();
@@ -534,7 +537,7 @@
} else {
if (this.collator.compare(collatingContent, this.lastWord) < 0) {
warningMessage("Out of sequence (Collator " +
- this.currentDictionary.writingSystem.toLocale().toString() + "): " + actualContent);
+ this.currentDictionaryElement.writingSystem.toLocale().toString() + "): " + actualContent);
}
this.lastWord = collatingContent;
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-28 17:13:11
|
Revision: 13294
http://sourceforge.net/p/foray/code/13294
Author: victormote
Date: 2023-09-28 17:13:09 +0000 (Thu, 28 Sep 2023)
Log Message:
-----------
Dictionary improvements.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml
Modified: trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml 2023-09-28 13:40:29 UTC (rev 13293)
+++ trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml 2023-09-28 17:13:09 UTC (rev 13294)
@@ -5022,6 +5022,7 @@
<w><t>a-me-na-bly</t></w>
<phrase><t>a-men cor-ner</t></phrase>
<w><t>a-mend</t><verb><regular-root/></verb></w>
+<w><t>a-mend-a-bil-i-ty</t><noun/><comment>Not in NOAD.</comment></w>
<w><t>a-mend-a-ble</t></w>
<w><t>a-mend-a-tory</t></w>
<w><t>a-mend-a-to-ry</t></w>
@@ -5374,7 +5375,7 @@
<w><t>am-phi-coe-lous</t></w>
<w><t>am-phi-cra-ni-a</t></w>
<w><t>am-phic-ty-on</t></w>
-<w><t>Am-phic-ty-on</t></w>
+<w><t>Am-phic-ty-on</t><noun><pluralizable/></noun></w>
<w><t>am-phic-ty-on-ic</t></w>
<w><t>am-phic-ty-o-ny</t></w>
<w><t>Am-phid-a-mas</t></w>
@@ -8666,7 +8667,7 @@
<w><t>ap-prov-ing-ly</t></w>
<w><t>approx</t></w>
<w><t>ap-prox-i-mal</t></w>
-<w><t>ap-prox-i-mate</t></w>
+<w><t>ap-prox-i-mate</t><verb><regular-root/></verb><adjective/></w>
<w><t>ap-prox-i-mat-ed</t></w>
<w><t>ap-prox-i-mate-ly</t></w>
<w><t>ap-prox-i-mat-ing</t></w>
@@ -8903,7 +8904,7 @@
<w><t>ar-ba-list-er</t></w>
<w><t>Ar-be-la</t></w>
<w><t>Ar-bil</t></w>
-<w><t>ar-bi-ter</t></w>
+<w><t>ar-bi-ter</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ar-bi-tra-ble</t></w>
<w><t>ar-bi-trage</t></w>
<w><t>ar-bi-trag-er</t></w>
@@ -9627,7 +9628,7 @@
<w><t>ar-ray</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>ar-ray-al</t></w>
<w><t>ar-rear</t></w>
-<w><t>ar-rear-age</t></w>
+<w><t>ar-rear-age</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ar-rears</t></w>
<w><t>ar-re-not-o-kous</t></w>
<w><t>ar-re-not-o-ky</t></w>
@@ -13283,7 +13284,7 @@
<w><t>bar-ba-rised</t></w>
<w><t>bar-ba-ris-ing</t></w>
<w><t>bar-ba-rism</t></w>
-<w><t>bar-bar-i-ty</t></w>
+<w><t>bar-bar-i-ty</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>bar-ba-ri-za-tion</t></w>
<w><t>bar-ba-rize</t></w>
<w><t>bar-ba-rized</t></w>
@@ -14585,7 +14586,7 @@
<w><t>bed-warm-er</t></w>
<w><t>bed=wet-ting</t></w>
<w><t>Bed-worth</t></w>
-<w><t>bee</t></w>
+<w><t>bee</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Bee</t></w>
<w><t>Beeb</t></w>
<w><t>Bee-be</t></w>
@@ -17368,7 +17369,7 @@
<w><t>blood-stock</t></w>
<w><t>blood-stone</t></w>
<w><t>blood-stream</t></w>
-<w><t>blood-suck-er</t></w>
+<w><t>blood-suck-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>blood-suck-ing</t></w>
<w><t>blood-thirst-i-ly</t></w>
<w><t>blood-thirst-i-ness</t></w>
@@ -19281,7 +19282,7 @@
<w><t>bra-va</t></w>
<w><t>bra-va-do</t></w>
<phrase><t>Bra-vais lat-tice</t></phrase>
-<w><t>brave</t></w>
+<w><t>brave</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb><adjective><extensible/></adjective></w>
<w><t>brave-ly</t></w>
<w><t>brave-ness</t></w>
<w><t>brav-er</t></w>
@@ -19928,7 +19929,7 @@
<w><t>broi-der</t></w>
<w><t>broi-der-er</t></w>
<w><t>broi-der-y</t></w>
-<w><t>broil</t></w>
+<w><t>broil</t><verb><regular-root/></verb></w>
<w><t>broil-er</t></w>
<phrase><t>broil-er house</t></phrase>
<w><t>broil-ing-ly</t></w>
@@ -20245,7 +20246,7 @@
<w><t>bru-tal-ized</t></w>
<w><t>bru-tal-iz-ing</t></w>
<w><t>bru-tal-ly</t></w>
-<w><t>brute</t></w>
+<w><t>brute</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Bru-te</t></w>
<w><t>brut-ed</t></w>
<w><t>brute-like</t></w>
@@ -20969,7 +20970,7 @@
<w><t>Bur-ney</t></w>
<w><t>Burn-ham</t></w>
<phrase><t>Burn-ham scale</t></phrase>
-<w><t>burn-ing</t></w>
+<w><t>burn-ing</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<phrase><t>burn-ing bush</t></phrase>
<w><t>burn-ing=bush</t></w>
<phrase><t>burn-ing glass</t></phrase>
@@ -24574,7 +24575,7 @@
<w><t>cau-ter-ized</t></w>
<w><t>cau-ter-iz-ing</t></w>
<w><t>cau-ter-y</t></w>
-<w><t>cau-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
+<w><t>cau-tion</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>cau-tion-ar-y</t></w>
<w><t>cau-tion-er</t></w>
<phrase><t>cau-tion mon-ey</t></phrase>
@@ -30723,7 +30724,7 @@
<phrase><t>com-mon de-nom-i-na-tor</t></phrase>
<phrase><t>com-mon di-vi-sor</t></phrase>
<phrase><t>Com-mon En-trance</t></phrase>
-<w><t>com-mon-er</t></w>
+<w><t>com-mon-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<phrase><t>Com-mon E-ra</t></phrase>
<phrase><t>com-mon fee</t></phrase>
<phrase><t>com-mon frac-tion</t></phrase>
@@ -30992,7 +30993,7 @@
<w><t>com-pet-er</t></w>
<w><t>com-pet-ing</t></w>
<w><t>com-pet-ing-ly</t></w>
-<w><t>com-pe-ti-tion</t></w>
+<w><t>com-pe-ti-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>com-pet-i-tive</t></w>
<w><t>com-pet-i-tive-ly</t></w>
<w><t>com-pet-i-tive-ness</t></w>
@@ -35532,6 +35533,7 @@
<w><t>cro-cus-es</t></w>
<w><t>Croe-si</t></w>
<w><t>Croe-sus</t></w>
+<w><t>Crœ-sus</t></w>
<w><t>Croe-sus-es</t></w>
<w><t>croft</t></w>
<w><t>croft-er</t></w>
@@ -38530,7 +38532,7 @@
<phrase><t>deck ten-nis</t></phrase>
<w><t>decl</t></w>
<w><t>de-claim</t><verb><regular-root/></verb></w>
-<w><t>de-claim-er</t></w>
+<w><t>de-claim-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>dec-la-ma-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>de-clam-a-tory</t></w>
<w><t>de-clam-a-to-ry</t></w>
@@ -39904,7 +39906,7 @@
<w><t>Den-ni-son</t></w>
<w><t>Den-ny</t></w>
<w><t>denom</t></w>
-<w><t>de-nom-i-nate</t></w>
+<w><t>de-nom-i-nate</t><verb><regular-root/></verb></w>
<w><t>de-nom-i-nat-ed</t></w>
<w><t>de-nom-i-nat-ing</t></w>
<w><t>de-nom-i-na-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
@@ -40094,7 +40096,7 @@
<w><t>de-paup-er-ate</t></w>
<w><t>de-pau-per-a-tion</t></w>
<w><t>de-pend</t><verb><regular-root/></verb></w>
-<w><t>de-pend-a-bil-i-ty</t></w>
+<w><t>de-pend-a-bil-i-ty</t><noun/></w>
<w><t>de-pend-a-ble</t></w>
<w><t>de-pend-a-ble-ness</t></w>
<w><t>de-pend-a-bly</t></w>
@@ -41952,7 +41954,7 @@
<w><t>Di-ip-o-li-a</t></w>
<w><t>Di-jon</t></w>
<w><t>dik=dik</t></w>
-<w><t>dike</t></w>
+<w><t>dike</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Di-ke</t></w>
<w><t>dik-er</t></w>
<w><t>di-ke-tone</t></w>
@@ -43338,7 +43340,7 @@
<w><t>dis-pro-por-tion-ate-ness</t></w>
<w><t>dis-pro-por-tion-a-tion</t></w>
<w><t>dis-prov-a-ble</t></w>
-<w><t>dis-prove</t></w>
+<w><t>dis-prove</t><verb><regular-root/></verb></w>
<w><t>dis-proved</t></w>
<w><t>dis-prov-er</t></w>
<w><t>dis-prov-ing</t></w>
@@ -43370,7 +43372,7 @@
<w><t>dis-qui-et-ing-ly</t></w>
<w><t>dis-qui-et-ly</t></w>
<w><t>dis-qui-e-tude</t></w>
-<w><t>dis-qui-si-tion</t></w>
+<w><t>dis-qui-si-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>dis-qui-si-tion-al</t></w>
<w><t>Dis-rae-li</t></w>
<w><t>dis-rate</t></w>
@@ -43571,7 +43573,7 @@
<w><t>dis-tain</t></w>
<w><t>dis-tal</t></w>
<w><t>dis-tal-ly</t></w>
-<w><t>dis-tance</t></w>
+<w><t>dis-tance</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>dis-tanced</t></w>
<w><t>dis-tance-less</t></w>
<w><t>dis-tanc-ing</t></w>
@@ -49631,7 +49633,7 @@
<w><t>en-spir-it</t></w>
<w><t>en-sta-tite</t></w>
<w><t>en-sta-tit-ic</t></w>
-<w><t>en-sue</t></w>
+<w><t>en-sue</t><verb><regular-root/></verb></w>
<w><t>en-sued</t></w>
<w><t>en-su-ing</t></w>
<w><t>en-su-ing-ly</t></w>
@@ -52302,7 +52304,7 @@
<w><t>excl</t></w>
<w><t>ex-claim</t><verb><regular-root/></verb></w>
<w><t>ex-claim-er</t></w>
-<w><t>ex-cla-ma-tion</t></w>
+<w><t>ex-cla-ma-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ex-cla-ma-tion-al</t></w>
<phrase><t>ex-cla-ma-tion mark</t></phrase>
<w><t>ex-clam-a-to-ri-ly</t></w>
@@ -52781,7 +52783,7 @@
<w><t>ex-pe-dit-ed</t></w>
<w><t>ex-pe-dit-er</t></w>
<w><t>ex-pe-dit-ing</t></w>
-<w><t>ex-pe-di-tion</t></w>
+<w><t>ex-pe-di-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ex-pe-di-tion-ar-y</t></w>
<w><t>ex-pe-di-tious</t></w>
<w><t>ex-pe-di-tious-ly</t></w>
@@ -53192,7 +53194,7 @@
<w><t>ex-tor-sive-ly</t></w>
<w><t>ex-tort</t><verb><regular-root/></verb></w>
<w><t>ex-tort-er</t></w>
-<w><t>ex-tor-tion</t></w>
+<w><t>ex-tor-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ex-tor-tion-ar-y</t></w>
<w><t>ex-tor-tion-ate</t></w>
<w><t>ex-tor-tion-ate-ly</t></w>
@@ -57111,7 +57113,7 @@
<w><t>fo-cus-sing</t></w>
<w><t>fod-der</t></w>
<w><t>fodg-el</t></w>
-<w><t>foe</t></w>
+<w><t>foe</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Foe-cun-di-ta-tis</t></w>
<w><t>foehn</t></w>
<w><t>foe-man</t></w>
@@ -77002,7 +77004,7 @@
<w><t>in-law-ry</t></w>
<w><t>in-lay</t></w>
<w><t>in-lay-er</t></w>
-<w><t>in-let</t></w>
+<w><t>in-let</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>in-li-er</t></w>
<w><t>in-line</t></w>
<w><t>in=line</t></w>
@@ -77171,7 +77173,7 @@
<w><t>in-qui-lin-i-ty</t></w>
<w><t>in-qui-li-nous</t></w>
<w><t>in-quir-a-ble</t></w>
-<w><t>in-quire</t></w>
+<w><t>in-quire</t><verb><regular-root/></verb></w>
<w><t>in-quired</t></w>
<w><t>in-quir-er</t></w>
<w><t>in-quir-ies</t></w>
@@ -78625,7 +78627,7 @@
<w><t>in-ter-pos-er</t></w>
<w><t>in-ter-pos-ing</t></w>
<w><t>in-ter-pos-ing-ly</t></w>
-<w><t>in-ter-po-si-tion</t></w>
+<w><t>in-ter-po-si-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>in-ter-pour</t></w>
<w><t>in-ter-pres-sure</t></w>
<w><t>in-ter-pret</t><verb><regular-root/></verb></w>
@@ -79126,7 +79128,7 @@
<w><t>in-tra-vi-tal</t></w>
<phrase><t>in-tra-zon-al soil</t></phrase>
<w><t>in-treat</t></w>
-<w><t>in-trench</t></w>
+<w><t>in-trench</t><verb><regular-root/></verb></w>
<w><t>in-trench-er</t></w>
<w><t>in-trench-ment</t></w>
<w><t>in-trep-id</t></w>
@@ -81014,7 +81016,7 @@
<w><t>jarp</t></w>
<w><t>jar-rah</t></w>
<w><t>Jar-rell</t></w>
-<w><t>jar-ring</t></w>
+<w><t>jar-ring</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>jar-ring-ly</t></w>
<w><t>Jar-row</t></w>
<w><t>Jar-ry</t></w>
@@ -81337,7 +81339,7 @@
<w><t>Jes-sie</t></w>
<w><t>jess-ing</t></w>
<w><t>Jes-sy</t></w>
-<w><t>jest</t></w>
+<w><t>jest</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>jest-book</t></w>
<w><t>jest-er</t></w>
<w><t>jest-ful</t></w>
@@ -81564,7 +81566,7 @@
<w><t>Jo-ash</t></w>
<w><t>Job</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>job</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
-<w><t>job-ber</t></w>
+<w><t>job-ber</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>job-ber-y</t></w>
<w><t>job-bing</t></w>
<phrase><t>job-bing print-er</t></phrase>
@@ -83962,7 +83964,7 @@
<w><t>knap-weed</t></w>
<w><t>knar</t></w>
<w><t>knar-ry</t></w>
-<w><t>knave</t></w>
+<w><t>knave</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>knav-er-ies</t></w>
<w><t>knav-er-y</t></w>
<w><t>knav-ish</t></w>
@@ -89287,7 +89289,7 @@
<w><t>loo-kum</t></w>
<w><t>look=up</t></w>
<w><t>look-y</t></w>
-<w><t>loom</t></w>
+<w><t>loom</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>loom-ing</t></w>
<w><t>loon</t></w>
<w><t>loon-ey</t></w>
@@ -95518,7 +95520,7 @@
<w><t>might-i-er</t></w>
<w><t>might-i-est</t></w>
<w><t>might-i-ly</t></w>
-<w><t>might-i-ness</t></w>
+<w><t>might-i-ness</t><noun/></w>
<w><t>might-y</t></w>
<w><t>Mi-gnon</t></w>
<w><t>mi-gnon</t></w>
@@ -96228,7 +96230,7 @@
<w><t>mis-cal-cu-la-tor</t></w>
<w><t>mis-call</t></w>
<w><t>mis-call-er</t></w>
-<w><t>mis-car-riage</t></w>
+<w><t>mis-car-riage</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>mis-car-ry</t></w>
<w><t>mis-cast</t></w>
<w><t>mis-cast-ing</t></w>
@@ -97922,7 +97924,7 @@
<w><t>monts=de=pié-té</t></w>
<w><t>Mont-ser-rat</t></w>
<w><t>Mon-ty</t></w>
-<w><t>mon-u-ment</t></w>
+<w><t>mon-u-ment</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>mon-u-men-tal</t></w>
<w><t>mon-u-men-tal-ise</t></w>
<w><t>mon-u-men-tal-ised</t></w>
@@ -107781,7 +107783,7 @@
<w><t>no-vel-la</t></w>
<w><t>No-vel-lo</t></w>
<w><t>Nov-els</t></w>
-<w><t>nov-el-ty</t></w>
+<w><t>nov-el-ty</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>nov-el-vel-las</t></w>
<w><t>nov-el-vel-le</t></w>
<w><t>No-vem-ber</t></w>
@@ -109392,7 +109394,7 @@
<w><t>om-i-nous-ly</t></w>
<w><t>om-i-nous-ness</t></w>
<w><t>o-mis-si-ble</t></w>
-<w><t>o-mis-sion</t></w>
+<w><t>o-mis-sion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>o-mis-sive</t></w>
<w><t>o-mis-sive-ly</t></w>
<w><t>o-mit</t></w>
@@ -111865,7 +111867,7 @@
<w><t>out-weed</t></w>
<w><t>out-weep</t></w>
<w><t>out-weep-ing</t></w>
-<w><t>out-weigh</t></w>
+<w><t>out-weigh</t><verb><regular-root/></verb></w>
<w><t>out-well</t></w>
<w><t>out-went</t></w>
<w><t>out-wept</t></w>
@@ -112195,7 +112197,7 @@
<w><t>o-ver-chafe</t></w>
<w><t>o-ver-chafed</t></w>
<w><t>o-ver-chaf-ing</t></w>
-<w><t>o-ver-charge</t></w>
+<w><t>o-ver-charge</t><verb><regular-root/></verb></w>
<w><t>o-ver-charg-er</t></w>
<w><t>o-ver-char-i-ta-ble</t></w>
<w><t>o-ver-char-i-ta-ble-ness</t></w>
@@ -116356,7 +116358,7 @@
<w><t>pat-ri-lo-cal-i-ty</t></w>
<w><t>pat-ri-mo-ni-al</t></w>
<w><t>pat-ri-mo-ni-al-ly</t></w>
-<w><t>pat-ri-mo-ny</t></w>
+<w><t>pat-ri-mo-ny</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>pa-tri-ot</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>pa-tri-ot-ic</t></w>
<w><t>pa-tri-ot-i-cal-ly</t></w>
@@ -125046,7 +125048,7 @@
<w><t>pref-er-en-tial-ist</t></w>
<w><t>pref-er-en-tial-ly</t></w>
<phrase><t>pref-er-en-tial vot-ing</t></phrase>
-<w><t>pre-fer-ment</t></w>
+<w><t>pre-fer-ment</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>pre-fer-men-ta-tion</t></w>
<w><t>pre-ferred</t></w>
<w><t>pre-fer-red-ly</t></w>
@@ -132242,7 +132244,7 @@
<w><t>rail-car</t></w>
<w><t>rail-er</t></w>
<w><t>rail-head</t></w>
-<w><t>rail-ing</t></w>
+<w><t>rail-ing</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>rail-ing-ly</t></w>
<w><t>rail-ler-y</t></w>
<w><t>rail-road</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
@@ -133169,7 +133171,8 @@
<w><t>re-an-a-lyzed</t></w>
<w><t>re-an-a-lyz-ing</t></w>
<w><t>re-an-chor</t></w>
-<w><t>re-an-i-ma-tion</t></w>
+<w><t>re-an-i-mate</t><verb><regular-root/></verb></w>
+<w><t>re-an-i-ma-tion</t><noun/></w>
<w><t>re-an-nex</t></w>
<w><t>re-an-nex-a-tion</t></w>
<w><t>re-an-no-tate</t></w>
@@ -133506,7 +133509,7 @@
<w><t>re-cap-i-tal-ize</t></w>
<w><t>re-cap-i-tal-ized</t></w>
<w><t>re-cap-i-tal-iz-ing</t></w>
-<w><t>re-ca-pit-u-late</t></w>
+<w><t>re-ca-pit-u-late</t><verb><regular-root/></verb></w>
<w><t>re-ca-pit-u-lated</t></w>
<w><t>re-ca-pit-u-lat-ing</t></w>
<w><t>re-ca-pit-u-la-tion</t></w>
@@ -137453,7 +137456,7 @@
<w><t>re-taped</t></w>
<w><t>re-tap-ing</t></w>
<w><t>re-tar</t></w>
-<w><t>re-tard</t></w>
+<w><t>re-tard</t><verb><regular-root/></verb></w>
<w><t>re-tard-ant</t></w>
<w><t>re-tard-ate</t></w>
<w><t>re-tar-da-tion</t></w>
@@ -139760,7 +139763,7 @@
<w><t>Rous-sil-lon</t></w>
<w><t>roust</t></w>
<w><t>roust-a-bout</t></w>
-<w><t>rout</t></w>
+<w><t>rout</t><verb><regular-root/></verb></w>
<w><t>route</t></w>
<w><t>rout-ed</t></w>
<w><t>route-man</t></w>
@@ -143217,7 +143220,7 @@
<phrase><t>sea ot-ter</t></phrase>
<w><t>sea-plane</t></w>
<w><t>sea=poach-er</t></w>
-<w><t>sea-port</t></w>
+<w><t>sea-port</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<phrase><t>sea pow-er</t></phrase>
<phrase><t>sea purs-lane</t></phrase>
<w><t>sea-quake</t></w>
@@ -143302,7 +143305,7 @@
<w><t>sec-a-teurs</t></w>
<w><t>Sec-chi</t></w>
<w><t>sec-co</t></w>
-<w><t>se-cede</t></w>
+<w><t>se-cede</t><verb><regular-root/></verb></w>
<w><t>se-cern</t></w>
<w><t>se-cern-ent</t></w>
<w><t>se-cern-ment</t></w>
@@ -143495,7 +143498,7 @@
<w><t>sed-i-men-to-log-ic</t></w>
<w><t>sed-i-men-tol-o-gist</t></w>
<w><t>sed-i-men-tol-o-gy</t></w>
-<w><t>se-di-tion</t></w>
+<w><t>se-di-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>se-di-tious</t></w>
<w><t>se-di-tious-ly</t></w>
<w><t>se-di-tious-ness</t></w>
@@ -145580,7 +145583,7 @@
<w><t>sen-ti-men-tal-iz-ing</t></w>
<phrase><t>sen-ti-ment-al val-ue</t></phrase>
<w><t>sen-ti-ment-less</t></w>
-<w><t>sen-ti-nel</t></w>
+<w><t>sen-ti-nel</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>sen-ti-nel-like</t></w>
<w><t>sen-ti-nel-ship</t></w>
<w><t>sen-try</t></w>
@@ -150167,6 +150170,7 @@
<w><t>so-lid-i-fy</t></w>
<w><t>so-lid-i-fy-ing</t></w>
<w><t>sol-id-il-lu</t></w>
+<w><t>so-lid-i-ty</t><noun><convertible-to-possessive/></noun></w>
<w><t>sol-id=look-ing</t></w>
<w><t>sol-id-ly</t></w>
<w><t>sol-id-ness</t></w>
@@ -151854,7 +151858,7 @@
<w><t>sple-no-meg-a-ly</t></w>
<w><t>spleu-chan</t></w>
<w><t>spleu-ghan</t></w>
-<w><t>splice</t></w>
+<w><t>splice</t><verb><regular-root/></verb></w>
<w><t>splice-a-ble</t></w>
<w><t>splic-er</t></w>
<w><t>splic-ing</t></w>
@@ -153875,7 +153879,7 @@
<w><t>sto-lon-if-er-ous-ly</t></w>
<w><t>sto-lon-i-za-tion</t></w>
<w><t>sto-ma</t></w>
-<w><t>stom-ach</t></w>
+<w><t>stom-ach</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>stom-ach-ache</t></w>
<w><t>stom-ach=ach-y</t></w>
<w><t>stom-ach-er</t></w>
@@ -156014,7 +156018,7 @@
<w><t>sub-si-di-za-tion</t></w>
<w><t>sub-si-dize</t></w>
<w><t>sub-si-diz-er</t></w>
-<w><t>sub-si-dy</t></w>
+<w><t>sub-si-dy</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>sub-sil-i-cate</t></w>
<w><t>sub-sim-i-an</t></w>
<w><t>sub-sim-i-ous</t></w>
@@ -156153,7 +156157,7 @@
<w><t>sub-tep-id</t></w>
<w><t>sub-te-pid-i-ty</t></w>
<w><t>sub-te-rete</t></w>
-<w><t>sub-ter-fuge</t></w>
+<w><t>sub-ter-fuge</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>sub-term-i-nal</t></w>
<w><t>sub-ter-mi-nal</t></w>
<w><t>sub-ter-nat-u-ral</t></w>
@@ -158033,7 +158037,7 @@
<w><t>sup-pli-ant</t></w>
<w><t>sup-pli-ant-ly</t></w>
<w><t>sup-pli-ant-ness</t></w>
-<w><t>sup-pli-cant</t></w>
+<w><t>sup-pli-cant</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>sup-pli-cate</t></w>
<w><t>sup-pli-cat-ed</t></w>
<w><t>sup-pli-cat-ing</t></w>
@@ -163807,7 +163811,8 @@
<w><t>topog</t></w>
<w><t>to-pog-ra-pher</t></w>
<w><t>top-o-graph-ic</t></w>
-<w><t>top-o-graph-i-cal-ly</t></w>
+<w><t>top-o-graph-i-cal</t><adjective/></w>
+<w><t>top-o-graph-i-cal-ly</t><adverb/></w>
<w><t>to-pog-ra-phy</t></w>
<w><t>top-o-log-ic</t></w>
<phrase><t>top-o-log-i-cal group</t></phrase>
@@ -165544,7 +165549,7 @@
<w><t>tri-lo-bite</t></w>
<w><t>tri-loc-u-lar</t></w>
<w><t>tril-o-gy</t></w>
-<w><t>trim</t></w>
+<w><t>trim</t><verb><regular-root/></verb></w>
<w><t>tri-ma-ran</t></w>
<w><t>tri-mer</t></w>
<w><t>tri-mer-ic</t></w>
@@ -168728,7 +168733,8 @@
<w><t>un-chanced</t></w>
<w><t>un-chan-cy</t></w>
<w><t>un-change-a-bil-i-ty</t></w>
-<w><t>un-change-a-ble</t></w>
+<w><t>un-change-a-ble</t><adjective/></w>
+<w><t>un-change-a-bly</t><adverb/></w>
<w><t>un-changed</t></w>
<w><t>un-change-ful</t></w>
<w><t>un-chang-ing</t></w>
@@ -177179,7 +177185,7 @@
<w><t>un-sex-u-al</t></w>
<w><t>un-shab-bi-ly</t></w>
<w><t>un-shab-by</t></w>
-<w><t>un-shack-le</t></w>
+<w><t>un-shack-le</t><verb><regular-root/></verb></w>
<w><t>un-shade</t></w>
<w><t>un-shad-ed</t></w>
<w><t>un-shad-i-ly</t></w>
@@ -179062,6 +179068,7 @@
<w><t>un-want-ed</t></w>
<w><t>un-wan-ton</t></w>
<w><t>un-war-bled</t></w>
+<w><t>un-war-i-ly</t><adverb/></w>
<w><t>un-war-like</t></w>
<w><t>un-warm-a-ble</t></w>
<w><t>un-warmed</t></w>
@@ -179073,7 +179080,7 @@
<w><t>un-war-rant-a-ble</t><adjective/></w>
<w><t>un-war-rant-a-bly</t><adverb/></w>
<w><t>un-war-rant-ed</t></w>
-<w><t>un-war-y</t></w>
+<w><t>un-war-y</t><adjective/></w>
<w><t>un-wash-a-ble</t></w>
<w><t>un-washed</t></w>
<w><t>un-wast-a-ble</t></w>
@@ -179319,7 +179326,7 @@
<w><t>up-borne</t></w>
<w><t>up-bound</t></w>
<w><t>up=bow</t></w>
-<w><t>up-braid</t></w>
+<w><t>up-braid</t><verb><regular-root/></verb></w>
<w><t>up-braid-ing</t></w>
<w><t>up-braid-ing-ly</t></w>
<w><t>up-bring-ing</t></w>
@@ -180720,7 +180727,7 @@
<w><t>ve-nous</t></w>
<w><t>ve-nous-ly</t></w>
<w><t>ve-nous-ness</t></w>
-<w><t>vent</t></w>
+<w><t>vent</t><verb><regular-root/></verb></w>
<w><t>Ven-t</t></w>
<w><t>vent-age</t></w>
<w><t>ven-tail</t></w>
@@ -181323,7 +181330,7 @@
<w><t>vic-to-ri-ous-ness</t></w>
<w><t>Vic-tor-man-u-el</t></w>
<w><t>Vic-to-ry</t></w>
-<w><t>vic-to-ry</t></w>
+<w><t>vic-to-ry</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>vic-to-ry-less</t></w>
<w><t>vic-tress</t></w>
<w><t>vic-tri-ces</t></w>
@@ -181664,7 +181671,7 @@
<w><t>vir-gin-al</t></w>
<phrase><t>Vir-gin Birth</t></phrase>
<phrase><t>vir-gin birth</t></phrase>
-<w><t>Vir-gin-ia</t></w>
+<w><t>Vir-gin-ia</t><noun><convertible-to-possessive/></noun></w>
<w><t>Vir-gin-i-a</t></w>
<phrase><t>Vir-gin-i-a Beach</t></phrase>
<phrase><t>Vir-gin-i-a creep-er</t></phrase>
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-28 13:40:32
|
Revision: 13293
http://sourceforge.net/p/foray/code/13293
Author: victormote
Date: 2023-09-28 13:40:29 +0000 (Thu, 28 Sep 2023)
Log Message:
-----------
Minor cleanup.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-28 13:24:11 UTC (rev 13292)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-28 13:40:29 UTC (rev 13293)
@@ -466,22 +466,22 @@
* @param inputItem The input item being tokenized.
*/
protected void findBreakTypes(final InputItem inputItem) {
- for (int breakIndex = 0; breakIndex < inputItem.inputBreaks.size(); breakIndex ++) {
- final List<InputBreak> inputBreaks = inputItem.inputBreaks;
+ final List<InputBreak> inputBreaks = inputItem.inputBreaks;
+ for (int breakIndex = 0; breakIndex < inputBreaks.size(); breakIndex ++) {
+ final int startOffset = inputBreaks.get(breakIndex).offset;
if (breakIndex >= inputBreaks.size() - 1) {
inputBreaks.get(breakIndex).type = TokenType2.END;
} else {
- final int sequenceIndex = inputBreaks.get(breakIndex).offset;
- final int end = inputBreaks.get(breakIndex + 1).offset;
+ final int endOffset = inputBreaks.get(breakIndex + 1).offset;
/* Special cases where the first char alone does not tell the whole story. */
- if (NumberUtils.isArabicNumber(inputItem.text, sequenceIndex, end)) {
+ if (NumberUtils.isArabicNumber(inputItem.text, startOffset, endOffset)) {
inputBreaks.get(breakIndex).type = TokenType2.WORD;
continue;
}
/* Interpret the sequence from the first char only. */
- final int testChar = inputItem.text.charAt(sequenceIndex);
+ final int testChar = inputItem.text.charAt(startOffset);
inputBreaks.get(breakIndex).type = computeCharType(testChar);
}
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-28 13:24:13
|
Revision: 13292
http://sourceforge.net/p/foray/code/13292
Author: victormote
Date: 2023-09-28 13:24:11 +0000 (Thu, 28 Sep 2023)
Log Message:
-----------
Remove no-longer-needed token type START.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-28 13:10:37 UTC (rev 13291)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-28 13:24:11 UTC (rev 13292)
@@ -150,9 +150,6 @@
/** Ambiguous trailing punctuation that is currently thought to be resolvable when the context is considered. */
TRANSIENT_TRAILING_PUNCTUATION(null),
- /** This is not a true token type. It marks the start of the character sequence. */
- START(null),
-
/** This is not a true token type. It marks the end of the character sequence. */
END(null);
@@ -729,7 +726,7 @@
*/
private void createComputedTokens(final InputItem inputItem) {
final List<InputBreak> inputBreaks = inputItem.inputBreaks;
- TokenType2 lastBreakType = TokenType2.START;
+ TokenType2 lastBreakType = null;
int nextTokenOffset = 0;
for (int breakIndex = 0; breakIndex < inputBreaks.size(); breakIndex ++) {
@@ -736,7 +733,7 @@
final InputBreak inputBreak = inputBreaks.get(breakIndex);
final TokenType2 currentBreakType = inputBreak.type;
final int currentOffset = inputBreak.offset;
- if (lastBreakType != TokenType2.START
+ if (lastBreakType != null
&& (currentBreakType != lastBreakType
|| currentBreakType != TokenType2.WORD)) {
final Token4a token = new Token4a();
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-28 13:10:39
|
Revision: 13291
http://sourceforge.net/p/foray/code/13291
Author: victormote
Date: 2023-09-28 13:10:37 +0000 (Thu, 28 Sep 2023)
Log Message:
-----------
Enable test of a token item ending with leading punctuation.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-28 13:08:50 UTC (rev 13290)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-28 13:10:37 UTC (rev 13291)
@@ -38,7 +38,6 @@
import org.axsl.unicode.block.U2000_General_Punctuation;
import static org.junit.jupiter.api.Assertions.assertEquals;
-import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import java.io.IOException;
@@ -675,7 +674,6 @@
* Test of a phrase in a different writing system, preceded by a period.
*/
@Test
- @Disabled("Solution is in progress.")
public void testPunctuationBeforeDifferentWritingSystem() {
final Lexer4a out = getObjectUnderTest();
out.addUntokenized("We adjourned “", WritingSystem4a.USA);
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-28 13:08:53
|
Revision: 13290
http://sourceforge.net/p/foray/code/13290
Author: victormote
Date: 2023-09-28 13:08:50 +0000 (Thu, 28 Sep 2023)
Log Message:
-----------
Bracket token input with BREAK token types during token resolution, instead of END.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-28 12:59:49 UTC (rev 13289)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-28 13:08:50 UTC (rev 13290)
@@ -502,7 +502,7 @@
*/
private void resolvePossibleIntrawordPunctuation() {
final TokenType2 preSequenceBreakType = TokenType2.BREAK;
- final TokenType2 postSequenceBreakType = TokenType2.END;
+ final TokenType2 postSequenceBreakType = TokenType2.BREAK;
/* First iterate in reverse order. */
for (int breakIndex = this.input.size() - 1; breakIndex > -1; breakIndex --) {
@@ -530,8 +530,7 @@
/* Look for ambiguous punctuation immediately followed by whitespace and immediately preceded by trailing
* punctuation. Resolve it to trailing punctuation. */
if (currentBreak.type == TokenType2.AMBIGUOUS_TRAILING_PUNCTUATION
- && (nextBreakType == TokenType2.BREAK
- || nextBreakType == TokenType2.END)
+ && (nextBreakType == TokenType2.BREAK)
&& (previousBreakType == TokenType2.TRAILING_PUNCTUATION
|| previousBreakType == TokenType2.TRANSIENT_TRAILING_PUNCTUATION)) {
currentBreak.type = TokenType2.TRAILING_PUNCTUATION;
@@ -551,8 +550,7 @@
* word. */
if (currentBreak.type != TokenType2.BREAK
&& previousBreakType == TokenType2.BREAK
- && (nextBreakType == TokenType2.BREAK
- || nextBreakType == TokenType2.END)) {
+ && nextBreakType == TokenType2.BREAK) {
currentBreak.type = TokenType2.WORD;
}
@@ -627,7 +625,7 @@
*/
private void resolveAttachedLeadingPunctuation() {
final TokenType2 preSequenceBreakType = TokenType2.BREAK;
- final TokenType2 postSequenceBreakType = TokenType2.END;
+ final TokenType2 postSequenceBreakType = TokenType2.BREAK;
/* Resolve attached leading punctuation. */
for (int breakIndex = 0; breakIndex < this.input.size(); breakIndex ++) {
final InputBreak currentBreak = this.input.get(breakIndex);
@@ -669,7 +667,7 @@
*/
private void resolveAttachedTrailingPunctuation() {
final TokenType2 preSequenceBreakType = TokenType2.BREAK;
- final TokenType2 postSequenceBreakType = TokenType2.END;
+ final TokenType2 postSequenceBreakType = TokenType2.BREAK;
/* Resolve attached trailing punctuation. Iterate these in reverse order. */
for (int breakIndex = this.input.size() - 1; breakIndex > 0; breakIndex --) {
final InputBreak currentBreak = this.input.get(breakIndex);
@@ -680,8 +678,7 @@
switch (currentBreak.type) {
case TRAILING_PUNCTUATION: {
switch (nextBreakType) {
- case BREAK:
- case END: {
+ case BREAK: {
switch (previousBreakType) {
case BREAK: {
/* Surrounded by breaks. Treat this as a word. */
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-28 12:59:52
|
Revision: 13289
http://sourceforge.net/p/foray/code/13289
Author: victormote
Date: 2023-09-28 12:59:49 +0000 (Thu, 28 Sep 2023)
Log Message:
-----------
Add disabled test for leading punctuation immediately before a new input item.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-28 12:04:01 UTC (rev 13288)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-28 12:59:49 UTC (rev 13289)
@@ -38,6 +38,7 @@
import org.axsl.unicode.block.U2000_General_Punctuation;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import java.io.IOException;
@@ -670,4 +671,28 @@
testToken(actual.get(9), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
+ /**
+ * Test of a phrase in a different writing system, preceded by a period.
+ */
+ @Test
+ @Disabled("Solution is in progress.")
+ public void testPunctuationBeforeDifferentWritingSystem() {
+ final Lexer4a out = getObjectUnderTest();
+ out.addUntokenized("We adjourned “", WritingSystem4a.USA);
+ out.addUntokenized("sine die", WritingSystem4a.LATIN);
+ out.addUntokenized(".”", WritingSystem4a.USA);
+ final List<Lexer.Token> actual = tokenize();
+ assertEquals(10, actual.size());
+ testToken(actual.get(0), "We", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
+ testToken(actual.get(2), "adjourned", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.BREAK, WritingSystem4a.USA);
+ testToken(actual.get(4), "“", TokenType.LEADING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(5), "sine", TokenType.WORD, WritingSystem4a.LATIN);
+ testToken(actual.get(6), " ", TokenType.BREAK, WritingSystem4a.LATIN);
+ testToken(actual.get(7), "die", TokenType.WORD, WritingSystem4a.LATIN);
+ testToken(actual.get(8), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(9), "”", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ }
+
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-28 12:04:05
|
Revision: 13288
http://sourceforge.net/p/foray/code/13288
Author: victormote
Date: 2023-09-28 12:04:01 +0000 (Thu, 28 Sep 2023)
Log Message:
-----------
Simplify the locking and processing sequence.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-28 11:58:36 UTC (rev 13287)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-28 12:04:01 UTC (rev 13288)
@@ -303,9 +303,6 @@
/** The input that has been submitted for processing. */
private Input input = new Input();
- /** Indicates whether the current content has been tokenized. */
- private boolean isTokenized;
-
/** The index into the result arrays that will be used by the next call to {@link #next()}. */
private int nextResultIndex;
@@ -375,12 +372,12 @@
@Override
public void lock() {
this.isLocked = true;
+ process();
}
@Override
public void clear() {
this.input.items.clear();
- this.isTokenized = false;
this.output.clear();
this.isLocked = false;
}
@@ -390,9 +387,6 @@
if (! this.isLocked) {
throw new IllegalStateException("This lexer is not locked.");
}
- if (! this.isTokenized) {
- process();
- }
if (! hasNext()) {
throw new NoSuchElementException();
}
@@ -417,9 +411,6 @@
if (! this.isLocked) {
throw new IllegalStateException("This lexer is not locked.");
}
- if (! this.isTokenized) {
- process();
- }
return this.nextResultIndex < this.output.size();
}
@@ -429,7 +420,6 @@
private void process() {
this.output.clear();
this.nextResultIndex = 0;
- this.isTokenized = true;
if (this.input.items.size() < 1) {
return;
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-28 11:58:38
|
Revision: 13287
http://sourceforge.net/p/foray/code/13287
Author: victormote
Date: 2023-09-28 11:58:36 +0000 (Thu, 28 Sep 2023)
Log Message:
-----------
Use the token writing system, not the last parsed one.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java 2023-09-28 11:21:28 UTC (rev 13286)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java 2023-09-28 11:58:36 UTC (rev 13287)
@@ -297,18 +297,16 @@
final String text = getAndClearText();
lexer.addUntokenized(text, oldWritingSystem);
- checkWords(oldWritingSystem, this.lastLocation);
+ checkWords(this.lastLocation);
}
}
/**
- * Check for a change in writing system, and, if there is one, flush the text accumulator and spell-check the words
- * in it.
- * @param writingSystem The writing system to be used to spell-check the accumulated text.
- * @param location The location of the word in the original document.
+ * Iterate the tokens from the Lexer and spell-check the words.
+ * @param location The location of the text in the original document.
*/
- private void checkWords(final WritingSystem4a writingSystem, final String location) {
+ private void checkWords(final String location) {
final Lexer4a lexer = this.server.getLexer();
lexer.lock();
@@ -317,7 +315,7 @@
Orthography4a orthography = null;
if (lexer.hasNext()) {
lastWritingSystem = lexer.peekNext().getWritingSystem();
- orthography = this.server.getOrthography(writingSystem);
+ orthography = this.server.getOrthography(lastWritingSystem);
}
while (lexer.hasNext()) {
final Lexer.Token token = lexer.next();
@@ -376,7 +374,6 @@
* @return The writing system parsed from {@code attributes}.
*/
private WritingSystem4a parseWritingSystem(final Attributes attributes) {
- /* TODO: This is horrible. Make this all configurable. */
final String languageAttr = XML_LANG_ATTRIBUTE.getValue(attributes);
if (languageAttr == null) {
if (getCurrentWritingSystem() == null) {
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-28 11:21:30
|
Revision: 13286
http://sourceforge.net/p/foray/code/13286
Author: victormote
Date: 2023-09-28 11:21:28 +0000 (Thu, 28 Sep 2023)
Log Message:
-----------
Fix process of getting the orthography.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-28 10:55:57 UTC (rev 13285)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-28 11:21:28 UTC (rev 13286)
@@ -263,7 +263,7 @@
/**
* FOray implementation.
*/
- private class Token4a implements Lexer.Token {
+ public class Token4a implements Lexer.Token {
/** The text. */
private CharSequence text;
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java 2023-09-28 10:55:57 UTC (rev 13285)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java 2023-09-28 11:21:28 UTC (rev 13286)
@@ -35,7 +35,6 @@
import org.foray.orthography.OrthographyServer4a;
import org.foray.orthography.OrthographyServerConfig;
import org.foray.orthography.SegmentDictionary;
-import org.foray.primitive.CharacterUtils;
import org.foray.xml.SaxParser;
import org.foray.xml.SaxUtils;
import org.foray.xml.dtd.DtdAttribute;
@@ -310,32 +309,23 @@
* @param location The location of the word in the original document.
*/
private void checkWords(final WritingSystem4a writingSystem, final String location) {
- final Orthography4a orthography = writingSystem == null ? null : this.server.getOrthography(writingSystem);
- final Lexer lexer = this.server.getLexer();
+ final Lexer4a lexer = this.server.getLexer();
+ lexer.lock();
- final StringBuilder textAccumulator = getTextAccumulator();
-
- /* Remove any trailing chars in the text buffer that are actually leading punctuation. */
- while (textAccumulator.length() > 0
- && CharacterUtils.isAttachedLeadingPunctuation(textAccumulator.charAt(textAccumulator.length() - 1))) {
- textAccumulator.deleteCharAt(textAccumulator.length() - 1);
+ /* Writing system should never be null, but orthography could be. */
+ WritingSystem lastWritingSystem = null;
+ Orthography4a orthography = null;
+ if (lexer.hasNext()) {
+ lastWritingSystem = lexer.peekNext().getWritingSystem();
+ orthography = this.server.getOrthography(writingSystem);
}
-
- /* Remove any leading chars in the text buffer that are actually trailing punctuation. */
- while (textAccumulator.length() > 1
- && CharacterUtils.isAttachedTrailingPunctuation(textAccumulator.charAt(0))
- && CharacterUtils.isWordBreakChar(textAccumulator.charAt(1))) {
- textAccumulator.deleteCharAt(0);
- }
-
- final String text = getAndClearText();
- if (text.length() > 0) {
- lexer.addUntokenized(text, writingSystem);
- }
- lexer.lock();
while (lexer.hasNext()) {
final Lexer.Token token = lexer.next();
if (token.getTokenType() == TokenType.WORD) {
+ if (! lastWritingSystem.equals(token.getWritingSystem())) {
+ orthography = this.server.getOrthography(token.getWritingSystem());
+ }
+ lastWritingSystem = token.getWritingSystem();
checkWord(orthography, token, location);
}
}
@@ -350,7 +340,7 @@
*/
private void checkWord(final Orthography4a orthography, final Lexer.Token token, final String location) {
if (orthography == null) {
- /* Treat as a misspelling. */
+ /* Treat as an error. */
this.output.println("(no config) " + token.getText());
return;
}
@@ -380,7 +370,7 @@
}
/**
- * For a given set of attributes, parses the {@link WritingSystem} from them and then checks the {@link Orthogrpahy}
+ * For a given set of attributes, parses the {@link WritingSystem} from them and then checks the {@link Orthography}
* for that {@link WritingSystem}.
* @param attributes The attributes that possibly contain language, country, and script data.
* @return The writing system parsed from {@code attributes}.
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-28 10:55:59
|
Revision: 13285
http://sourceforge.net/p/foray/code/13285
Author: victormote
Date: 2023-09-28 10:55:57 +0000 (Thu, 28 Sep 2023)
Log Message:
-----------
Clean up timing of running the Lexer.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java 2023-09-28 10:28:12 UTC (rev 13284)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java 2023-09-28 10:55:57 UTC (rev 13285)
@@ -36,7 +36,6 @@
import org.foray.orthography.OrthographyServerConfig;
import org.foray.orthography.SegmentDictionary;
import org.foray.primitive.CharacterUtils;
-import org.foray.primitive.StringUtils;
import org.foray.xml.SaxParser;
import org.foray.xml.SaxUtils;
import org.foray.xml.dtd.DtdAttribute;
@@ -110,7 +109,7 @@
/** The qualified name, if specified. */
private String qName;
- /** The writing system specified in an xml:lang attribute for this element, if any. */
+ /** The writing system specified for this element, if any. */
private WritingSystem4a writingSystem;
/**
@@ -242,11 +241,19 @@
@Override
public void startElement(final String uri, final String localName, final String qName, final Attributes attributes)
throws SAXException {
+ final Lexer4a lexer = this.server.getLexer();
final WritingSystem4a oldWritingSystem = getCurrentWritingSystem();
if ("word".equals(localName)) {
- checkWords(oldWritingSystem, this.lastLocation);
+ /* Push any existing content to the lexer. */
+ final String text = getAndClearText();
+ lexer.addUntokenized(text, oldWritingSystem);
}
+ if ("foreign".equals(localName)) {
+ /* Push any existing content to the lexer. */
+ final String text = getAndClearText();
+ lexer.addUntokenized(text, oldWritingSystem);
+ }
final String location = LOCATION_ATTRIBUTE.getValue(attributes);
this.lastLocation = location == null ? "N/A" : location;
@@ -255,15 +262,7 @@
element.namespace = uri;
element.localName = localName;
element.qName = qName;
-
- /* Is there a change in writing system? */
- final WritingSystem4a newWritingSystem = parseWritingSystem(attributes);
- element.writingSystem = newWritingSystem;
-
- if (newWritingSystem != null
- && ! ObjectUtils.safeEquals(oldWritingSystem, newWritingSystem)) {
- checkWords(oldWritingSystem, this.lastLocation);
- }
+ element.writingSystem = parseWritingSystem(attributes);
this.elementStack.push(element);
}
@@ -272,7 +271,7 @@
if (this.elementStack.size() < 1) {
throw new SAXException("Element stack is empty but should not be.");
}
-
+ final Lexer4a lexer = this.server.getLexer();
final WritingSystem4a oldWritingSystem = getCurrentWritingSystem();
final Element element = this.elementStack.pop();
@@ -282,27 +281,24 @@
/* Tokenize the Word element manually. */
if ("word".equals(localName)) {
- /* Any text before the Word should have been processed in startElement. What is in the text buffer right
- * now should be exactly the content of the word element. */
- this.server.getLexer().addWordToken(getAndClearText(), oldWritingSystem);
-// final Orthography4a orthography =
-// oldWritingSystem == null ? null : this.server.getOrthography(oldWritingSystem);
-// checkWord(oldWritingSystem, orthography, getAndClearText(), this.lastLocation);
+ /* Any text before this "word" element should have been processed in startElement. What is in the text
+ * buffer right now should be exactly the content of the "word" element. */
+ final String text = getAndClearText();
+ lexer.addWordToken(text, oldWritingSystem);
}
-
- final WritingSystem4a newWritingSystem = getCurrentWritingSystem();
-
- if (! ObjectUtils.safeEquals(oldWritingSystem, newWritingSystem)) {
- checkWords(oldWritingSystem, this.lastLocation);
+ if ("foreign".equals(localName)) {
+ /* Any text before this "foreign" element should have been processed in startElement. What is in the text
+ * buffer right now should be exactly the content of the "foreign" element. */
+ final String text = getAndClearText();
+ lexer.addUntokenized(text, oldWritingSystem);
}
- /* Whether the writing system has changed or not, if we are at the end of a terminal element, process text. */
+ /* We are at the end of a terminal element. Check the spelling. */
if ("text".equals(localName)) {
+ final String text = getAndClearText();
+ lexer.addUntokenized(text, oldWritingSystem);
+
checkWords(oldWritingSystem, this.lastLocation);
- /* Since we are at the end of a terminal element, the next text should be the beginning of a word.
- * Pretend like the end of this element is a single space, to avoid confusion about where and if a word
- * is starting. */
- appendText(StringUtils.SINGLE_SPACE);
}
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-28 10:28:15
|
Revision: 13284
http://sourceforge.net/p/foray/code/13284
Author: victormote
Date: 2023-09-28 10:28:12 +0000 (Thu, 28 Sep 2023)
Log Message:
-----------
Simply ignore empty content. Throw exception for null parameters.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-28 10:02:24 UTC (rev 13283)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-28 10:28:12 UTC (rev 13284)
@@ -326,8 +326,14 @@
if (this.isLocked) {
throw new IllegalStateException("This lexer is locked.");
}
+ if (text == null) {
+ throw new IllegalArgumentException("The text cannot be null");
+ }
+ if (writingSystem == null) {
+ throw new IllegalArgumentException("The writing system cannot be null");
+ }
if (text.length() < 1) {
- throw new IllegalArgumentException("Cannot add empty content.");
+ return;
}
final InputItem inputItem = new InputItem();
@@ -343,8 +349,14 @@
if (this.isLocked) {
throw new IllegalStateException("This lexer is locked.");
}
+ if (text == null) {
+ throw new IllegalArgumentException("The text cannot be null");
+ }
+ if (writingSystem == null) {
+ throw new IllegalArgumentException("The writing system cannot be null");
+ }
if (text.length() < 1) {
- throw new IllegalArgumentException("Cannot add empty content.");
+ return;
}
final Matcher matcher = XML_WHITESPACE_TO_NORMALIZE.matcher(text);
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-28 10:02:27
|
Revision: 13283
http://sourceforge.net/p/foray/code/13283
Author: victormote
Date: 2023-09-28 10:02:24 +0000 (Thu, 28 Sep 2023)
Log Message:
-----------
Dictionary improvements.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-GBR.dict.xml
trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-USA.dict.xml
trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml
Modified: trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-GBR.dict.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-GBR.dict.xml 2023-09-28 03:23:21 UTC (rev 13282)
+++ trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-GBR.dict.xml 2023-09-28 10:02:24 UTC (rev 13283)
@@ -21,6 +21,18 @@
<w><t>be-hove</t><verb><regular-root/></verb></w>
<w><t>cen-tre</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>co=la-bour-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
+<w><t>dis-til</t><verb/></w>
+<w><t>dis-tils</t><verb><lemma>distil</lemma></verb></w>
+<w><t>draught</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
+<w><t>draught-board</t></w>
+<w><t>draught-er</t></w>
+<w><t>draught-i-er</t></w>
+<w><t>draught-i-est</t></w>
+<w><t>draught-i-ly</t></w>
+<w><t>draught-i-ness</t></w>
+<w><t>draughts-board</t></w>
+<w><t>draughts-man</t></w>
+<w><t>draught-y</t></w>
<w><t>ful-fil</t><verb/></w>
<w><t>ful-fil-ment</t><noun/></w>
<w><t>ful-fils</t><verb><vf><singular/></vf></verb></w>
Modified: trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-USA.dict.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-USA.dict.xml 2023-09-28 03:23:21 UTC (rev 13282)
+++ trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-USA.dict.xml 2023-09-28 10:02:24 UTC (rev 13283)
@@ -21,6 +21,19 @@
<w><t>be-hoove</t><verb><regular-root/></verb></w>
<w><t>cen-ter</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>co=la-bor-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
+<w><t>dis-till</t><verb/></w>
+<w><t>draft</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
+<w><t>draft-a-ble</t></w>
+<phrase><t>draft dodg-er</t></phrase>
+<w><t>draft-ee</t></w>
+<w><t>draft-er</t></w>
+<w><t>draft-i-er</t></w>
+<w><t>draft-i-est</t></w>
+<w><t>draft-i-ly</t></w>
+<w><t>draft-i-ness</t></w>
+<w><t>drafts-man</t></w>
+<w><t>drafts-man-ship</t></w>
+<w><t>draft-y</t></w>
<w><t>ful-fill</t><verb><regular-root/></verb></w>
<w><t>ful-fill-ment</t><noun/></w>
<w><t>la-bor</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
Modified: trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml 2023-09-28 03:23:21 UTC (rev 13282)
+++ trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml 2023-09-28 10:02:24 UTC (rev 13283)
@@ -56,7 +56,7 @@
-->
<w><t>&</t></w>
-<w><t>&c</t><abbrev referenced-word="etc., et cetera"/></w>
+<w><t>&c.</t><abbrev referenced-word="etc., et cetera"/></w>
<w><t>a</t></w>
<w><t>a.d.</t><abbrev referenced-word="anno Domini"/><comment>Latin "year of our Lord"</comment></w>
<w><t>a. d.</t><abbrev referenced-word="anno Domini"/><comment>Latin "year of our Lord"</comment></w>
@@ -428,7 +428,7 @@
<w><t>a-bom-i-na-ble-ness</t></w>
<phrase><t>a-bom-i-na-ble snow-man</t></phrase>
<w><t>a-bom-i-na-bly</t></w>
-<w><t>a-bom-i-nate</t></w>
+<w><t>a-bom-i-nate</t><verb><regular-root/></verb></w>
<w><t>a-bom-i-nat-ed</t></w>
<w><t>a-bom-i-nat-ing</t></w>
<w><t>a-bom-i-na-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
@@ -1630,7 +1630,7 @@
<w><t>ac-tu-ar-ies</t></w>
<w><t>ac-tu-ary</t></w>
<w><t>ac-tu-ar-y</t></w>
-<w><t>ac-tu-ate</t></w>
+<w><t>ac-tu-ate</t><verb><regular-root/></verb></w>
<w><t>ac-tu-at-ed</t></w>
<w><t>ac-tu-at-ing</t></w>
<w><t>ac-tu-a-tion</t></w>
@@ -2006,7 +2006,7 @@
<w><t>ad-mi-ra-tion</t></w>
<w><t>ad-mi-ra-tive</t></w>
<w><t>ad-mi-ra-tive-ly</t></w>
-<w><t>ad-mire</t></w>
+<w><t>ad-mire</t><verb><regular-root/></verb></w>
<w><t>ad-mired</t></w>
<w><t>ad-mir-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ad-mir-ing</t></w>
@@ -2080,7 +2080,7 @@
<w><t>a-dor-a-ble-ness</t></w>
<w><t>a-dor-a-bly</t></w>
<w><t>ad-o-ra-tion</t></w>
-<w><t>a-dore</t></w>
+<w><t>a-dore</t><verb><regular-root/></verb></w>
<w><t>a-dored</t></w>
<w><t>a-dor-er</t></w>
<w><t>a-dor-ing</t></w>
@@ -2594,8 +2594,8 @@
<w><t>ae-ti-ol-o-gy</t></w>
<w><t>Aet-na</t></w>
<w><t>Ae-to-li-a</t></w>
-<w><t>Ae-to-li-an</t></w>
-<w><t>Æ-to-li-an</t></w>
+<w><t>Ae-to-li-an</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
+<w><t>Æ-to-li-an</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Ae-to-lus</t></w>
<w><t>a-faced</t></w>
<w><t>a-fac-ing</t></w>
@@ -2657,7 +2657,7 @@
<w><t>af-firm-a-bly</t></w>
<w><t>af-firm-ance</t></w>
<w><t>af-firm-ant</t></w>
-<w><t>af-fir-ma-tion</t></w>
+<w><t>af-fir-ma-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>af-firm-a-tive</t></w>
<w><t>af-firm-a-tive=ac-tion</t></w>
<w><t>af-firm-a-tive-ly</t></w>
@@ -4278,7 +4278,7 @@
<w><t>al-li-gate</t></w>
<w><t>al-li-gat-ed</t></w>
<w><t>al-li-gat-ing</t></w>
-<w><t>al-li-ga-tor</t></w>
+<w><t>al-li-ga-tor</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>al-li-ga-tor-fish</t></w>
<w><t>al-li-ga-tor-fish-es</t></w>
<phrase><t>al-li-ga-tor pear</t></phrase>
@@ -4657,7 +4657,7 @@
<w><t>al-ter-nat-ing-ly</t></w>
<w><t>al-ter-na-tion</t></w>
<phrase><t>al-ter-na-tion of gen-er-a-tions</t></phrase>
-<w><t>al-ter-na-tive</t></w>
+<w><t>al-ter-na-tive</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>al-ter-na-tive-ly</t></w>
<w><t>al-ter-na-tive-ness</t></w>
<w><t>al-ter-na-tiv-i-ty</t></w>
@@ -9463,7 +9463,7 @@
<w><t>Ar-magh</t></w>
<w><t>Ar-mag-nac</t></w>
<w><t>Ar-ma-gnac</t></w>
-<w><t>ar-ma-ment</t></w>
+<w><t>ar-ma-ment</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ar-ma-men-tar-i-um</t></w>
<w><t>Ar-mand</t></w>
<w><t>ar-mar-i-a</t></w>
@@ -10196,7 +10196,7 @@
<w><t>as-persed</t></w>
<w><t>as-pers-er</t></w>
<w><t>as-pers-ing</t></w>
-<w><t>as-per-sion</t></w>
+<w><t>as-per-sion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>as-per-sive</t></w>
<w><t>as-per-sive-ly</t></w>
<w><t>as-per-so-ri-a</t></w>
@@ -11967,7 +11967,7 @@
<w><t>ax-i-o-log-i-cal-ly</t></w>
<w><t>ax-i-ol-o-gist</t></w>
<w><t>ax-i-ol-o-gy</t></w>
-<w><t>ax-i-om</t></w>
+<w><t>ax-i-om</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ax-i-o-mat-ic</t></w>
<w><t>ax-i-o-mat-i-cal</t></w>
<w><t>ax-i-o-mat-i-cal-ly</t></w>
@@ -16289,7 +16289,7 @@
<w><t>bi-na-tion</t></w>
<w><t>bin-au-ral</t></w>
<w><t>Bin-chois</t></w>
-<w><t>bind</t></w>
+<w><t>bind</t><verb><regular-root value="false"/></verb></w>
<w><t>bind-a-ble</t></w>
<w><t>bind-er</t></w>
<w><t>bin-der</t></w>
@@ -16303,6 +16303,7 @@
<w><t>bind-ing-ness</t></w>
<w><t>bin-dle</t></w>
<phrase><t>bind o-ver</t></phrase>
+<w><t>binds</t><verb><lemma>bind</lemma></verb></w>
<w><t>bind-weed</t></w>
<w><t>bine</t></w>
<w><t>Bi-net</t></w>
@@ -18003,7 +18004,7 @@
<w><t>bol-só-nes</t></w>
<w><t>bol-ster</t></w>
<w><t>bol-ster-er</t></w>
-<w><t>bolt</t></w>
+<w><t>bolt</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>bolt=ac-tion</t></w>
<w><t>bol-tel</t></w>
<w><t>bolt-er</t></w>
@@ -18377,7 +18378,7 @@
<w><t>Bor-der</t></w>
<w><t>bor-de-reau</t></w>
<w><t>bor-dered</t></w>
-<w><t>bor-der-er</t></w>
+<w><t>bor-der-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>bor-der-land</t></w>
<w><t>bor-der-less</t></w>
<w><t>bor-der-light</t></w>
@@ -19614,7 +19615,7 @@
<w><t>brides-maid</t></w>
<w><t>bride-well</t></w>
<w><t>Brid-ey</t></w>
-<w><t>bridge</t></w>
+<w><t>bridge</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>Bridge</t></w>
<w><t>bridge-a-ble</t></w>
<w><t>bridge-board</t></w>
@@ -21193,7 +21194,7 @@
<w><t>but-ler-ship</t></w>
<w><t>but-ler-y</t></w>
<w><t>But-su</t></w>
-<w><t>butt</t></w>
+<w><t>butt</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Butt</t></w>
<w><t>but-tals</t></w>
<w><t>Butte</t></w>
@@ -21686,6 +21687,7 @@
<w><t>Caer-phil-ly</t></w>
<w><t>caes-al-pin-i-a-ceous</t></w>
<w><t>Cae-sar</t></w>
+<w><t>Cæ-sar</t></w>
<w><t>Cae-sar-au-gus-ta</t></w>
<w><t>Caes-a-re-a</t></w>
<phrase><t>Caes-a-re-a Maz-a-ca</t></phrase>
@@ -22319,7 +22321,7 @@
<w><t>cam-pa-gna</t></w>
<w><t>Cam-pa-gne</t></w>
<w><t>cam-pa-gus</t></w>
-<w><t>cam-paign</t></w>
+<w><t>cam-paign</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>cam-paign-er</t></w>
<w><t>Cam-pa-nia</t></w>
<w><t>Cam-pa-ni-a</t></w>
@@ -22901,7 +22903,7 @@
<w><t>cap-ing</t></w>
<w><t>cap-i-ta</t></w>
<w><t>ca-pi-ta</t></w>
-<w><t>cap-i-tal</t></w>
+<w><t>cap-i-tal</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<phrase><t>cap-i-tal ac-count</t></phrase>
<phrase><t>cap-i-tal as-sets</t></phrase>
<phrase><t>cap-i-tal ex-pen-di-ture</t></phrase>
@@ -30449,7 +30451,7 @@
<w><t>com-bo</t></w>
<w><t>com-bust</t></w>
<w><t>com-bus-ti-bil-i-ty</t></w>
-<w><t>com-bus-ti-ble</t></w>
+<w><t>com-bus-ti-ble</t><noun><pluralizable/></noun><adjective/></w>
<w><t>com-bus-ti-ble-ness</t></w>
<w><t>com-bus-ti-bly</t></w>
<w><t>com-bus-tion</t></w>
@@ -31794,7 +31796,7 @@
<w><t>con-form-ism</t></w>
<w><t>con-form-ist</t></w>
<w><t>con-form-i-ty</t></w>
-<w><t>con-found</t></w>
+<w><t>con-found</t><verb><regular-root/></verb></w>
<w><t>con-found-a-ble</t></w>
<w><t>con-found-ed</t></w>
<w><t>con-found-ed-ly</t></w>
@@ -32500,7 +32502,7 @@
<w><t>contd</t></w>
<w><t>conte</t></w>
<w><t>con-té</t></w>
-<w><t>con-temn</t></w>
+<w><t>con-temn</t><verb><regular-root/></verb></w>
<w><t>con-temn-er</t></w>
<w><t>con-tem-ni-ble</t></w>
<w><t>con-tem-ni-bly</t></w>
@@ -33569,7 +33571,7 @@
<w><t>cor-net-to</t></w>
<phrase><t>corn ex-change</t></phrase>
<w><t>corn-fed</t></w>
-<w><t>corn-field</t></w>
+<w><t>corn-field</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>corn-flag</t></w>
<w><t>corn-flakes</t></w>
<w><t>corn-flour</t></w>
@@ -34130,7 +34132,7 @@
<phrase><t>cot-tage loaf</t></phrase>
<phrase><t>cot-tage pi-an-o</t></phrase>
<phrase><t>cot-tage pie</t></phrase>
-<w><t>cot-tag-er</t></w>
+<w><t>cot-tag-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>cot-tar</t></w>
<w><t>Cott-bus</t></w>
<w><t>cotte</t></w>
@@ -38675,7 +38677,7 @@
<w><t>de-cou-pling</t></w>
<w><t>de-coy</t></w>
<w><t>de-coy-er</t></w>
-<w><t>de-crease</t></w>
+<w><t>de-crease</t><verb><regular-root/></verb></w>
<w><t>de-creas-ing</t></w>
<w><t>de-creas-ing-ly</t></w>
<w><t>de-cree</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
@@ -38845,7 +38847,7 @@
<w><t>de-fam-ing</t></w>
<w><t>de-fam-ing-ly</t></w>
<w><t>de-fang</t></w>
-<w><t>de-fault</t></w>
+<w><t>de-fault</t><verb><regular-root/></verb></w>
<w><t>de-fault-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>de-fea-sance</t></w>
<w><t>de-fea-si-bil-i-ty</t></w>
@@ -39032,7 +39034,7 @@
<w><t>de-form-ed-ness</t></w>
<w><t>de-form-er</t></w>
<w><t>de-for-me-ter</t></w>
-<w><t>de-form-i-ty</t></w>
+<w><t>de-form-i-ty</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>de-fraud</t><verb><regular-root/></verb></w>
<w><t>de-frau-da-tion</t></w>
<w><t>de-fraud-er</t></w>
@@ -39326,7 +39328,7 @@
<w><t>Del-ia</t></w>
<w><t>De-li-an</t></w>
<phrase><t>De-li-an League</t></phrase>
-<w><t>de-lib-er-ate</t></w>
+<w><t>de-lib-er-ate</t><verb><regular-root/></verb><adjective/></w>
<w><t>de-lib-er-at-ed</t></w>
<w><t>de-lib-er-ate-ly</t></w>
<w><t>de-lib-er-ate-ness</t></w>
@@ -43603,10 +43605,8 @@
<w><t>dis-ti-chal</t></w>
<w><t>dis-ti-chous</t></w>
<w><t>dis-ti-chous-ly</t></w>
-<w><t>dis-til</t></w>
<w><t>dis-til-er-ies</t></w>
<w><t>dis-til-er-y</t></w>
-<w><t>dis-till</t></w>
<w><t>dis-till-a-ble</t></w>
<w><t>dis-til-land</t></w>
<w><t>dis-til-late</t></w>
@@ -43629,7 +43629,7 @@
<w><t>dis-tin-gu</t></w>
<w><t>dis-tin-gué</t></w>
<w><t>dis-tin-guée</t></w>
-<w><t>dis-tin-guish</t></w>
+<w><t>dis-tin-guish</t><verb><regular-root/></verb></w>
<w><t>dis-tin-guish-a-bil-i-ty</t></w>
<w><t>dis-tin-guish-a-ble</t></w>
<w><t>dis-tin-guish-a-ble-ness</t></w>
@@ -43710,7 +43710,7 @@
<w><t>dis-trust-ful-ly</t></w>
<w><t>dis-trust-ful-ness</t></w>
<w><t>dis-turb</t><verb><regular-root/></verb></w>
-<w><t>dis-turb-ance</t></w>
+<w><t>dis-turb-ance</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>dis-turbed</t></w>
<w><t>dis-turb-er</t></w>
<w><t>dis-turb-ing</t></w>
@@ -45175,18 +45175,6 @@
<w><t>drae-ger-man</t></w>
<w><t>draff</t></w>
<w><t>draff-y</t></w>
-<w><t>draft</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
-<w><t>draft-a-ble</t></w>
-<phrase><t>draft dodg-er</t></phrase>
-<w><t>draft-ee</t></w>
-<w><t>draft-er</t></w>
-<w><t>draft-i-er</t></w>
-<w><t>draft-i-est</t></w>
-<w><t>draft-i-ly</t></w>
-<w><t>draft-i-ness</t></w>
-<w><t>drafts-man</t></w>
-<w><t>drafts-man-ship</t></w>
-<w><t>draft-y</t></w>
<w><t>drag</t><verb><regular-root/></verb></w>
<w><t>dra-g</t></w>
<w><t>dra-gée</t></w>
@@ -45296,16 +45284,6 @@
<w><t>drat</t></w>
<w><t>drat-ted</t></w>
<w><t>drat-ting</t></w>
-<w><t>draught</t></w>
-<w><t>draught-board</t></w>
-<w><t>draught-er</t></w>
-<w><t>draught-i-er</t></w>
-<w><t>draught-i-est</t></w>
-<w><t>draught-i-ly</t></w>
-<w><t>draught-i-ness</t></w>
-<w><t>draughts-board</t></w>
-<w><t>draughts-man</t></w>
-<w><t>draught-y</t></w>
<w><t>Dra-va</t></w>
<w><t>drave</t></w>
<w><t>Dra-ve</t></w>
@@ -48123,7 +48101,7 @@
<w><t>e-lix-ir</t></w>
<w><t>Eliz</t></w>
<w><t>E-li-za</t></w>
-<w><t>E-liz-a-beth</t></w>
+<w><t>E-liz-a-beth</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>E-liz-a-be-than</t></w>
<phrase><t>E-liz-a-be-than son-net</t></phrase>
<phrase><t>E-liz-a-beth I</t></phrase>
@@ -48213,7 +48191,7 @@
<w><t>e-lon-gat-ing</t></w>
<w><t>e-lon-ga-tion</t></w>
<w><t>e-lon-ga-tive</t></w>
-<w><t>e-lope</t></w>
+<w><t>e-lope</t><verb><regular-root/></verb></w>
<w><t>e-loped</t></w>
<w><t>e-lope-ment</t></w>
<w><t>e-lop-er</t></w>
@@ -49009,7 +48987,7 @@
<w><t>en-cul-tu-rat-ing</t></w>
<w><t>en-cul-tu-ra-tion</t></w>
<w><t>en-cul-tu-ra-tive</t></w>
-<w><t>en-cum-ber</t></w>
+<w><t>en-cum-ber</t><verb><regular-root/></verb></w>
<w><t>en-cum-ber-ing-ly</t></w>
<w><t>en-cum-brance</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>en-cum-branc-er</t></w>
@@ -52508,7 +52486,7 @@
<w><t>ex-em-plum</t></w>
<w><t>ex-empt</t><verb><regular-root/></verb><adjective><extensible value="false"/></adjective></w>
<w><t>ex-empt-i-ble</t></w>
-<w><t>ex-emp-tion</t></w>
+<w><t>ex-emp-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ex-emp-tive</t></w>
<w><t>ex-en-ter-ate</t></w>
<w><t>ex-en-ter-at-ed</t></w>
@@ -52685,12 +52663,12 @@
<w><t>ex-or-bi-tan-cy</t></w>
<w><t>ex-or-bi-tant</t></w>
<w><t>ex-or-bi-tant-ly</t></w>
-<w><t>ex-or-cise</t></w>
+<w><t>ex-or-cise</t><verb><regular-root/></verb></w>
<w><t>ex-or-cised</t></w>
<w><t>ex-or-cise-ment</t></w>
<w><t>ex-or-cis-er</t></w>
<w><t>ex-or-cis-ing</t></w>
-<w><t>ex-or-cism</t></w>
+<w><t>ex-or-cism</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ex-or-cis-mal</t></w>
<w><t>ex-or-cist</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ex-or-cize</t></w>
@@ -52758,7 +52736,7 @@
<w><t>ex-pan-sive-ly</t></w>
<w><t>ex-pan-siv-i-ty</t></w>
<phrase><t>ex par-te</t></phrase>
-<w><t>ex-pa-ti-ate</t></w>
+<w><t>ex-pa-ti-ate</t><verb><regular-root/></verb></w>
<w><t>ex-pa-ti-at-ed</t></w>
<w><t>ex-pa-ti-at-ing</t></w>
<w><t>ex-pa-ti-a-tion</t></w>
@@ -54320,7 +54298,7 @@
<w><t>fat-i-gat-ed</t></w>
<w><t>fat-i-gat-ing</t></w>
<w><t>fat-i-ga-tion</t></w>
-<w><t>fa-tigue</t></w>
+<w><t>fa-tigue</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>fa-tigued</t></w>
<w><t>fa-tigue-less</t></w>
<w><t>fa-ti-guing</t></w>
@@ -54506,7 +54484,7 @@
<phrase><t>Feast of Lan-terns</t></phrase>
<phrase><t>Feast of Tab-er-na-cles</t></phrase>
<w><t>feast=or=fam-ine</t></w>
-<w><t>feat</t></w>
+<w><t>feat</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>feath-er</t></w>
<w><t>feath-er-back</t></w>
<phrase><t>feath-er bed</t></phrase>
@@ -54747,7 +54725,7 @@
<w><t>fel-ly</t></w>
<phrase><t>fe-lo de se</t></phrase>
<w><t>fe-lo=de=se</t></w>
-<w><t>fel-on</t></w>
+<w><t>fel-on</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>fe-lo-nes=de=se</t></w>
<w><t>fe-lo-ni-ous</t></w>
<w><t>fe-lo-ni-ous-ly</t></w>
@@ -61497,7 +61475,7 @@
<w><t>Ger-main</t></w>
<w><t>Ger-maine</t></w>
<w><t>ger-man</t></w>
-<w><t>Ger-man</t></w>
+<w><t>Ger-man</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Ger-mán</t></w>
<phrase><t>Ger-man Bap-tist Breth-ren</t></phrase>
<phrase><t>Ger-man cock-roach</t></phrase>
@@ -64870,7 +64848,7 @@
<w><t>grow-ing-ly</t></w>
<phrase><t>grow-ing pains</t></phrase>
<phrase><t>grow in-to</t></phrase>
-<w><t>growl</t></w>
+<w><t>growl</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>growl-er</t></w>
<w><t>growl-ing-ly</t></w>
<w><t>grown</t></w>
@@ -74919,7 +74897,7 @@
<w><t>im-pos-a-ble</t></w>
<w><t>im-pose</t><verb><regular-root/></verb></w>
<w><t>im-posed</t></w>
-<w><t>im-pos-er</t></w>
+<w><t>im-pos-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>im-pos-ing</t></w>
<w><t>im-pos-ing-ly</t></w>
<w><t>im-pos-ing-ness</t></w>
@@ -75009,7 +74987,7 @@
<w><t>im-pres-sive</t></w>
<w><t>im-pres-sive-ly</t></w>
<w><t>im-pres-sive-ness</t></w>
-<w><t>im-press-ment</t></w>
+<w><t>im-press-ment</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>im-pres-sure</t></w>
<w><t>im-prest</t></w>
<w><t>im-pri-ma-tur</t></w>
@@ -75403,7 +75381,7 @@
<w><t>in-ci-ta-tion</t></w>
<w><t>in-cite</t></w>
<w><t>in-cit-ed</t></w>
-<w><t>in-cite-ment</t></w>
+<w><t>in-cite-ment</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>in-cit-er</t></w>
<w><t>in-cit-ing</t></w>
<w><t>in-cit-ing-ly</t></w>
@@ -76974,7 +76952,7 @@
<w><t>in-junc-tive</t></w>
<w><t>in-junc-tive-ly</t></w>
<w><t>in-jur-a-ble</t></w>
-<w><t>in-jure</t></w>
+<w><t>in-jure</t><verb><regular-root/></verb></w>
<w><t>in-jured</t></w>
<w><t>in-jured-ly</t></w>
<w><t>in-jured-ness</t></w>
@@ -77201,7 +77179,7 @@
<w><t>in-quir-ing-ly</t></w>
<w><t>in-quir-y</t></w>
<w><t>In-qui-si-tion</t></w>
-<w><t>in-qui-si-tion</t></w>
+<w><t>in-qui-si-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>in-qui-si-tion-al</t></w>
<w><t>in-qui-si-tion-ist</t></w>
<w><t>in-quis-i-tive</t></w>
@@ -77351,7 +77329,7 @@
<w><t>in-sin-cere</t></w>
<w><t>in-sin-cere-ly</t></w>
<w><t>in-sin-cer-i-ty</t></w>
-<w><t>in-sin-u-ate</t></w>
+<w><t>in-sin-u-ate</t><verb><regular-root/></verb></w>
<w><t>in-sin-u-at-ed</t></w>
<w><t>in-sin-u-at-ing</t></w>
<w><t>in-sin-u-at-ing-ly</t></w>
@@ -78399,7 +78377,7 @@
<w><t>in-ter-meas-ure</t></w>
<w><t>in-ter-meas-ured</t></w>
<w><t>in-ter-meas-ur-ing</t></w>
-<w><t>in-ter-med-dle</t></w>
+<w><t>in-ter-med-dle</t><verb><regular-root/></verb></w>
<w><t>in-ter-med-dler</t></w>
<w><t>in-ter-me-di-a-cy</t></w>
<w><t>in-ter-me-di-ar-ies</t></w>
@@ -78457,7 +78435,7 @@
<w><t>in-ter-mit-ting</t></w>
<w><t>in-ter-mit-ting-ly</t></w>
<w><t>in-ter-mit-tor</t></w>
-<w><t>in-ter-mix</t></w>
+<w><t>in-ter-mix</t><verb><regular-root/></verb></w>
<w><t>in-ter-mix-a-ble</t></w>
<w><t>in-ter-mix-ed-ly</t></w>
<w><t>in-ter-mix-ture</t></w>
@@ -79329,7 +79307,7 @@
<w><t>in-vec-tive</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>in-vec-tive-ly</t></w>
<w><t>in-vec-tive-ness</t></w>
-<w><t>in-veigh</t></w>
+<w><t>in-veigh</t><verb><regular-root/></verb></w>
<w><t>in-veigh-er</t></w>
<w><t>in-vei-gle</t></w>
<w><t>in-vei-gle-ment</t></w>
@@ -89555,6 +89533,7 @@
<w><t>Lou-is-bourg</t></w>
<w><t>Lou-is-burg</t></w>
<phrase><t>lou-is d’or</t></phrase>
+<w><t>lou-is d’ors</t><noun/><comment>Unit of money</comment></w>
<w><t>Lou-ise</t></w>
<w><t>Lou-i-sette</t></w>
<w><t>Louis=Fer-di-nand</t></w>
@@ -93576,7 +93555,7 @@
<phrase><t>med-al play</t></phrase>
<w><t>Me-dan</t></w>
<w><t>Med-a-war</t></w>
-<w><t>med-dle</t></w>
+<w><t>med-dle</t><verb><regular-root/></verb></w>
<w><t>med-dler</t></w>
<w><t>med-dle-some</t></w>
<w><t>med-dle-some-ly</t></w>
@@ -95556,7 +95535,7 @@
<w><t>mi-grate</t></w>
<w><t>mi-grat-ed</t></w>
<w><t>mi-grat-ing</t></w>
-<w><t>mi-gra-tion</t></w>
+<w><t>mi-gra-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>mi-gra-tion-al</t></w>
<w><t>mi-gra-tive</t></w>
<w><t>mi-gra-to-ry</t></w>
@@ -96482,7 +96461,7 @@
<w><t>mis-in-struc-tion</t></w>
<w><t>mis-in-tend</t></w>
<w><t>mis-in-ten-tion</t></w>
-<w><t>mis-in-ter-pret</t></w>
+<w><t>mis-in-ter-pret</t><verb><regular-root/></verb></w>
<w><t>mis-in-ter-pret-a-ble</t></w>
<w><t>mis-in-ter-pre-ta-tion</t></w>
<w><t>mis-in-ter-pret-er</t></w>
@@ -97430,7 +97409,7 @@
<w><t>mon-ar-chism</t></w>
<w><t>mon-ar-chist</t></w>
<w><t>mon-ar-chist-ic</t></w>
-<w><t>mon-ar-chy</t></w>
+<w><t>mon-ar-chy</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>mo-nar-da</t></w>
<w><t>mon-as</t></w>
<w><t>mon-as-te-ri-al</t></w>
@@ -111382,7 +111361,7 @@
<w><t>out-moved</t></w>
<w><t>out-mov-ing</t></w>
<w><t>out-ness</t></w>
-<w><t>out-num-ber</t></w>
+<w><t>out-num-ber</t><verb><regular-root/></verb></w>
<w><t>out-of-fice</t></w>
<phrase><t>out of pock-et</t></phrase>
<w><t>out=of=pock-et</t></w>
@@ -114880,7 +114859,7 @@
<w><t>Pan-di-on</t></w>
<w><t>pan-dit</t></w>
<w><t>Pan-dit</t></w>
-<w><t>Pan-do-ra</t></w>
+<w><t>Pan-do-ra</t><noun><convertible-to-possessive/></noun></w>
<w><t>pan-do-ra</t></w>
<w><t>Pan-dore</t></w>
<w><t>pan-dore</t></w>
@@ -117405,7 +117384,7 @@
<w><t>pe-on-ism</t></w>
<w><t>Pe-o-ny</t></w>
<w><t>pe-o-ny</t></w>
-<w><t>peo-ple</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
+<w><t>peo-ple</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>peo-ple-less</t></w>
<w><t>peo-pler</t></w>
<phrase><t>peo-ple’s de-moc-ra-cy</t></phrase>
@@ -118093,7 +118072,7 @@
<w><t>per-pe-trat-ed</t></w>
<w><t>per-pe-trat-ing</t></w>
<w><t>per-pe-tra-tion</t></w>
-<w><t>per-pe-tra-tor</t></w>
+<w><t>per-pe-tra-tor</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Per-pet-u-a</t></w>
<w><t>per-pet-u-a-ble</t></w>
<w><t>per-pet-u-al</t></w>
@@ -118675,7 +118654,7 @@
<w><t>phan-tas-tic</t></w>
<w><t>Phan-ta-sus</t></w>
<w><t>phan-ta-sy</t></w>
-<w><t>phan-tom</t></w>
+<w><t>phan-tom</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>phan-tom-like</t></w>
<phrase><t>phan-tom limb</t></phrase>
<w><t>phar</t></w>
@@ -124976,7 +124955,7 @@
<w><t>pre-ëx-il-ic</t></w>
<w><t>pre=ex-il-ic</t></w>
<w><t>pre-ex-ist</t></w>
-<w><t>pre-ëx-ist</t></w>
+<w><t>pre-ëx-ist</t><verb><regular-root/></verb></w>
<w><t>pre=ex-ist</t></w>
<w><t>pre-ex-ist-ence</t></w>
<w><t>pre-ëx-ist-ence</t></w>
@@ -126331,7 +126310,7 @@
<w><t>pre-sem-i-nal</t></w>
<w><t>pre-sem-i-nar-y</t></w>
<w><t>pre=Se-mit-ic</t></w>
-<w><t>pres-ence</t></w>
+<w><t>pres-ence</t><noun><pluralizable/></noun></w>
<phrase><t>pres-ence cham-ber</t></phrase>
<phrase><t>pres-ence of mind</t></phrase>
<w><t>pre-se-nil-i-ty</t></w>
@@ -127152,7 +127131,7 @@
<w><t>prin-cess-like</t></w>
<phrase><t>prin-cess roy-al</t></phrase>
<w><t>Prince-ton</t></w>
-<w><t>prin-ci-pal</t></w>
+<w><t>prin-ci-pal</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<phrase><t>prin-ci-pal ax-is</t></phrase>
<phrase><t>prin-ci-pal boy</t></phrase>
<phrase><t>prin-ci-pal fo-cus</t></phrase>
@@ -128022,7 +128001,7 @@
<w><t>pro-mot-ed</t></w>
<w><t>pro-mot-er</t></w>
<w><t>pro-mot-ing</t></w>
-<w><t>pro-mo-tion</t></w>
+<w><t>pro-mo-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>pro-mo-tion-al</t></w>
<w><t>pro-mo-tive</t></w>
<w><t>pro-mo-tive-ness</t></w>
@@ -128860,7 +128839,7 @@
<w><t>pru-ri-tus</t></w>
<w><t>Prus</t></w>
<w><t>Prus-sia</t></w>
-<w><t>Prus-sian</t></w>
+<w><t>Prus-sian</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<phrase><t>Prus-sian blue</t></phrase>
<w><t>prus-sian-ise</t></w>
<w><t>Prus-sian-ise</t></w>
@@ -130712,7 +130691,7 @@
<w><t>quare</t></w>
<w><t>quark</t></w>
<w><t>Quar-ne-ro</t></w>
-<w><t>quar-rel</t></w>
+<w><t>quar-rel</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>quar-reled</t></w>
<w><t>quar-rel-er</t></w>
<w><t>quar-rel-ing</t></w>
@@ -133519,6 +133498,7 @@
<w><t>re-caned</t></w>
<w><t>re-can-ing</t></w>
<w><t>re-cant</t></w>
+<w><t>re-can-ta-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>re-cant-er</t></w>
<w><t>re-cant-ing-ly</t></w>
<w><t>re-cap</t></w>
@@ -134393,7 +134373,7 @@
<w><t>Re-don</t></w>
<w><t>re-done</t></w>
<phrase><t>red o-sier</t></phrase>
-<w><t>re-dou-ble</t></w>
+<w><t>re-dou-ble</t><verb><regular-root/></verb></w>
<w><t>re-dou-bler</t></w>
<w><t>re-doubt</t></w>
<w><t>re-doubt-a-ble</t></w>
@@ -135224,7 +135204,7 @@
<w><t>re-guid-ed</t></w>
<w><t>re-guid-ing</t></w>
<w><t>reg-u-la-ble</t></w>
-<w><t>reg-u-lar</t></w>
+<w><t>reg-u-lar</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>reg-u-lar-ise</t></w>
<w><t>reg-u-lar-i-ty</t></w>
<w><t>reg-u-lar-i-za-tion</t></w>
@@ -135232,7 +135212,7 @@
<w><t>reg-u-lar-iz-er</t></w>
<w><t>reg-u-lar-ly</t></w>
<w><t>reg-u-lar-ness</t></w>
-<w><t>reg-u-late</t></w>
+<w><t>reg-u-late</t><verb><regular-root/></verb></w>
<w><t>reg-u-lat-ed</t></w>
<w><t>reg-u-lat-ing</t></w>
<w><t>reg-u-la-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
@@ -136013,7 +135993,7 @@
<w><t>re-mit-al</t></w>
<w><t>re-mit-ta-ble</t></w>
<w><t>re-mit-tal</t></w>
-<w><t>re-mit-tance</t></w>
+<w><t>re-mit-tance</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<phrase><t>re-mit-tance man</t></phrase>
<w><t>re-mit-ted</t></w>
<w><t>re-mit-tee</t></w>
@@ -136590,7 +136570,7 @@
<w><t>re-praise</t></w>
<w><t>re-praised</t></w>
<w><t>re-prais-ing</t></w>
-<w><t>rep-re-hend</t></w>
+<w><t>rep-re-hend</t><verb><regular-root/></verb></w>
<w><t>rep-re-hend-a-ble</t></w>
<w><t>rep-re-hend-er</t></w>
<w><t>rep-re-hen-si-bil-i-ty</t></w>
@@ -138787,7 +138767,7 @@
<phrase><t>Ri-o Bran-co</t></phrase>
<phrase><t>Ri-o de Ja-nei-ro</t></phrase>
<phrase><t>Ri-o Grande</t></phrase>
-<w><t>ri-ot</t></w>
+<w><t>ri-ot</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<phrase><t>Ri-ot Act</t></phrase>
<w><t>ri-ot-er</t></w>
<w><t>ri-ot-ing-ly</t></w>
@@ -139194,7 +139174,7 @@
<w><t>Rog-ers</t></w>
<w><t>Ro-get</t></w>
<w><t>Ro-gi-er</t></w>
-<w><t>rogue</t></w>
+<w><t>rogue</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ro-guer-ies</t></w>
<w><t>ro-guer-y</t></w>
<phrase><t>rogues’ gal-ler-y</t></phrase>
@@ -140557,7 +140537,7 @@
<w><t>Sa-dat</t></w>
<w><t>sad-den</t></w>
<w><t>sad-dhu</t></w>
-<w><t>sad-dle</t></w>
+<w><t>sad-dle</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>sad-dle-back</t></w>
<w><t>sad-dle=backed</t></w>
<w><t>sad-dle-bag</t></w>
@@ -141680,7 +141660,7 @@
<w><t>sar-to-ri-us</t></w>
<w><t>Sar-tre</t></w>
<w><t>Sa-ruk</t></w>
-<w><t>Sar-um</t></w>
+<w><t>Sar-um</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<phrase><t>Sar-um use</t></phrase>
<w><t>Sar-vo-da-ya</t></w>
<w><t>Sa-sa-me=yu-ki</t></w>
@@ -146279,7 +146259,7 @@
<w><t>Sha-bu-oth</t></w>
<w><t>Sha-cha-rith</t></w>
<w><t>shack</t></w>
-<w><t>shack-le</t></w>
+<w><t>shack-le</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>shack-ler</t></w>
<w><t>Shack-le-ton</t></w>
<w><t>shack-o</t></w>
@@ -146510,7 +146490,7 @@
<w><t>share-hold-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Shar-en</t></w>
<phrase><t>share pre-mi-um</t></phrase>
-<w><t>shar-er</t></w>
+<w><t>shar-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Sha-ret</t></w>
<w><t>Sha-ri</t></w>
<w><t>sha-ri-a</t></w>
@@ -146967,7 +146947,7 @@
<w><t>ship-way</t></w>
<w><t>ship-worm</t></w>
<w><t>ship-wreck</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
-<w><t>ship-wright</t></w>
+<w><t>ship-wright</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ship-yard</t></w>
<w><t>Shi-r</t></w>
<w><t>shi-ra-lee</t></w>
@@ -149424,7 +149404,7 @@
<w><t>smug-ger</t></w>
<w><t>smug-gest</t></w>
<w><t>smug-gle</t><verb><regular-root/></verb></w>
-<w><t>smug-gler</t></w>
+<w><t>smug-gler</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>smug-ly</t></w>
<w><t>smug-ness</t></w>
<w><t>smut</t></w>
@@ -149795,7 +149775,7 @@
<w><t>soap-suds-y</t></w>
<w><t>soap-wort</t></w>
<w><t>soap-y</t></w>
-<w><t>soar</t></w>
+<w><t>soar</t><verb><regular-root/></verb></w>
<w><t>soar-a-bil-i-ty</t></w>
<w><t>soar-a-ble</t></w>
<w><t>soar-er</t></w>
@@ -150666,7 +150646,7 @@
<w><t>soul=search-ing</t></w>
<w><t>Soult</t></w>
<phrase><t>sou mar-qu</t></phrase>
-<w><t>sound</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
+<w><t>sound</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb><adjective><extensible/></adjective></w>
<w><t>Sound</t></w>
<w><t>sound-a-ble</t></w>
<phrase><t>sound bar-ri-er</t></phrase>
@@ -152559,7 +152539,7 @@
<w><t>sta-di-um</t></w>
<w><t>sta-di-ums</t></w>
<w><t>stad-le</t></w>
-<w><t>stadt-hold-er</t></w>
+<w><t>stadt-hold-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>staff</t></w>
<w><t>Staf-fa</t></w>
<phrase><t>staff col-lege</t></phrase>
@@ -153784,7 +153764,7 @@
<w><t>stoc-ca-ta</t></w>
<w><t>sto-chas-tic</t></w>
<w><t>sto-chas-ti-cal-ly</t></w>
-<w><t>stock</t></w>
+<w><t>stock</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>Stock</t></w>
<w><t>stock-ade</t></w>
<w><t>stock-ad-ed</t></w>
@@ -154171,7 +154151,7 @@
<w><t>strain-less</t></w>
<w><t>strain-less-ly</t></w>
<w><t>strait</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
-<w><t>strait-en</t></w>
+<w><t>strait-en</t><verb><regular-root/></verb></w>
<w><t>strait-jack-et</t></w>
<w><t>strait-laced</t></w>
<w><t>strait=lac-ed-ly</t></w>
@@ -160364,7 +160344,8 @@
<w><t>tau-tol-o-gize</t></w>
<w><t>tau-tol-o-gized</t></w>
<w><t>tau-tol-o-giz-ing</t></w>
-<w><t>tau-tol-o-gous-ly</t></w>
+<w><t>tau-tol-o-gous</t><adjective/></w>
+<w><t>tau-tol-o-gous-ly</t><adverb/></w>
<w><t>tau-tol-o-gy</t></w>
<w><t>Tau-tol-o-gy</t></w>
<w><t>tau-to-mer</t></w>
@@ -162306,7 +162287,7 @@
<w><t>thirl</t></w>
<w><t>thirl-age</t></w>
<w><t>Thirl-mere</t></w>
-<w><t>thirst</t></w>
+<w><t>thirst</t><noun><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>thirst-er</t></w>
<w><t>thirst-i-er</t></w>
<w><t>thirst-i-est</t></w>
@@ -166671,7 +166652,7 @@
<w><t>turn-er-ies</t></w>
<w><t>turn-er-y</t></w>
<w><t>turn-hall</t></w>
-<w><t>turn-ing</t></w>
+<w><t>turn-ing</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<phrase><t>turn-ing cir-cle</t></phrase>
<phrase><t>turn-ing point</t></phrase>
<w><t>tur-nip</t></w>
@@ -180158,8 +180139,8 @@
<w><t>Vance</t></w>
<w><t>Van-cou-ver</t></w>
<w><t>van-da</t></w>
-<w><t>van-dal</t></w>
-<w><t>Van-dal</t></w>
+<w><t>van-dal</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
+<w><t>Van-dal</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>van-dal-ise</t></w>
<w><t>van-dal-ish</t></w>
<w><t>van-dal-ism</t></w>
@@ -180642,7 +180623,7 @@
<w><t>ve-nat-ic</t></w>
<w><t>ve-nat-i-cal</t></w>
<w><t>ve-na-tion</t></w>
-<w><t>vend</t></w>
+<w><t>vend</t><verb><regular-root/></verb></w>
<w><t>Ven-d</t></w>
<w><t>Ven-da</t></w>
<w><t>ven-dace</t></w>
@@ -181449,7 +181430,7 @@
<w><t>Vik-ki</t></w>
<w><t>vil</t></w>
<w><t>vi-la-yet</t></w>
-<w><t>vile</t></w>
+<w><t>vile</t><adjective><extensible/></adjective></w>
<w><t>vile-ly</t></w>
<w><t>vile-ness</t></w>
<w><t>Vil-fre-do</t></w>
@@ -184697,7 +184678,7 @@
<w><t>whil-li-kins</t></w>
<w><t>whi-lom</t></w>
<w><t>whilst</t></w>
-<w><t>whim</t></w>
+<w><t>whim</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>whim-brel</t></w>
<w><t>whim-per</t></w>
<w><t>whim-per-er</t></w>
@@ -185310,7 +185291,7 @@
<w><t>wind-i-est</t></w>
<w><t>wind-i-ly</t></w>
<w><t>wind-i-ness</t></w>
-<w><t>wind-ing</t></w>
+<w><t>wind-ing</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>wind-ing-ly</t></w>
<w><t>wind-ing-ness</t></w>
<phrase><t>wind-ing sheet</t></phrase>
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-28 03:23:24
|
Revision: 13282
http://sourceforge.net/p/foray/code/13282
Author: victormote
Date: 2023-09-28 03:23:21 +0000 (Thu, 28 Sep 2023)
Log Message:
-----------
Create class to store the breaks in.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-27 18:13:01 UTC (rev 13281)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-28 03:23:21 UTC (rev 13282)
@@ -178,6 +178,18 @@
}
/**
+ * A single break in an {@link InputItem}.
+ */
+ private class InputBreak {
+
+ /** The offset into an {@link InputItem} text, indicating the location of the break. */
+ private int offset;
+
+ /** The type of break at {@link #offset}. */
+ private TokenType2 type;
+ }
+
+ /**
* Stores the items presented as input to this lexer.
*/
private class InputItem {
@@ -191,12 +203,9 @@
/** Indicates if this is a word token. If false, it is untokenized input. */
private boolean isWordToken;
- /** The indexes into {@link #text} where each computed break occurs. */
- private IntSequence breakOffsets;
+ /** The list of breaks for this input item. */
+ private List<InputBreak> inputBreaks = new ArrayList<InputBreak>();
- /** The type of each break in {@link #breakOffsets}. */
- private TokenType2[] breakTypes;
-
}
/**
@@ -216,7 +225,7 @@
int size = 0;
for (int itemIndex = 0; itemIndex < this.items.size(); itemIndex ++) {
final InputItem item = this.items.get(itemIndex);
- size += item.isWordToken ? 1 : item.breakTypes.length;
+ size += item.isWordToken ? 1 : item.inputBreaks.size();
}
return size;
}
@@ -227,7 +236,7 @@
* continuous sequence of breaks.
* @return The token type at index {@code index}.
*/
- TokenType2 get(final int index) {
+ InputBreak get(final int index) {
if (index < 0) {
throw new MarkedIndexOutOfBoundsException(index, size());
}
@@ -236,51 +245,19 @@
final InputItem item = this.items.get(itemIndex);
if (item.isWordToken) {
if (offset == 0) {
- return TokenType2.WORD;
+ return item.inputBreaks.get(0);
}
offset --;
} else {
- if (offset < item.breakTypes.length) {
- return item.breakTypes[offset];
+ if (offset < item.inputBreaks.size()) {
+ return item.inputBreaks.get(offset);
}
- offset -= item.breakTypes.length;
+ offset -= item.inputBreaks.size();
}
}
throw new MarkedIndexOutOfBoundsException(index, size());
}
- /**
- * Sets the token type of the break at a given index.
- * @param index The index of the break whose token type is to be set, as if the input items were part of a
- * single continuous sequence of breaks.
- * @param newTokenType The new token type for the break at {@code index}.
- */
- void set(final int index, final TokenType2 newTokenType) {
- if (index < 0) {
- throw new MarkedIndexOutOfBoundsException(index, size());
- }
- int offset = index;
- for (int itemIndex = 0; itemIndex < this.items.size(); itemIndex ++) {
- final InputItem item = this.items.get(itemIndex);
- if (item.isWordToken) {
- if (offset == 0) {
- if (newTokenType != TokenType2.WORD) {
- throw new IllegalStateException("Cannot change the token type of an explicit word token.");
- }
- return;
- }
- offset --;
- } else {
- if (offset < item.breakTypes.length) {
- item.breakTypes[offset] = newTokenType;
- return;
- }
- offset -= item.breakTypes.length;
- }
- }
- throw new MarkedIndexOutOfBoundsException(index, size());
- }
-
}
/**
@@ -376,6 +353,10 @@
inputItem.text = normalizedSequence;
inputItem.writingSystem = writingSystem;
inputItem.isWordToken = true;
+ final InputBreak inputBreak = new InputBreak();
+ inputBreak.offset = 0;
+ inputBreak.type = TokenType2.WORD;
+ inputItem.inputBreaks.add(inputBreak);
this.input.items.add(inputItem);
}
@@ -449,12 +430,16 @@
/* This is normal untokenized content. */
/* First pass is to find all of the breaks that the BreakIterator can find. */
- inputItem.breakOffsets = findRawBreaks(inputItem.text, inputItem.writingSystem);
+ final IntSequence breakOffsets = findRawBreaks(inputItem.text, inputItem.writingSystem);
+ for (int breakIndex = 0; breakIndex < breakOffsets.length(); breakIndex ++) {
+ final InputBreak inputBreak = new InputBreak();
+ inputBreak.offset = breakOffsets.intAt(breakIndex);
+ inputItem.inputBreaks.add(inputBreak);
+ }
-
/* The BreakIterator is helpful, but for our purposes does not dig deeply enough.
* Second pass is to find out the type of each character that is at a break. */
- inputItem.breakTypes = findBreakTypes(inputItem);
+ findBreakTypes(inputItem);
}
}
@@ -480,31 +465,27 @@
/**
* Determines the type of character that triggered each raw break.
* @param inputItem The input item being tokenized.
- * @return An array with a one-to-one correspondence with {@link InputItem#breakOffsets}, containing the type of
- * character at that break.
*/
- protected TokenType2[] findBreakTypes(final InputItem inputItem) {
- final IntSequence breakOffsets = inputItem.breakOffsets;
- final TokenType2[] breakTypes = new TokenType2[breakOffsets.length()];
- for (int breakIndex = 0; breakIndex < breakOffsets.length(); breakIndex ++) {
- if (breakIndex >= breakOffsets.length() - 1) {
- breakTypes[breakIndex] = TokenType2.END;
+ protected void findBreakTypes(final InputItem inputItem) {
+ for (int breakIndex = 0; breakIndex < inputItem.inputBreaks.size(); breakIndex ++) {
+ final List<InputBreak> inputBreaks = inputItem.inputBreaks;
+ if (breakIndex >= inputBreaks.size() - 1) {
+ inputBreaks.get(breakIndex).type = TokenType2.END;
} else {
- final int sequenceIndex = breakOffsets.intAt(breakIndex);
- final int end = breakOffsets.intAt(breakIndex + 1);
+ final int sequenceIndex = inputBreaks.get(breakIndex).offset;
+ final int end = inputBreaks.get(breakIndex + 1).offset;
/* Special cases where the first char alone does not tell the whole story. */
if (NumberUtils.isArabicNumber(inputItem.text, sequenceIndex, end)) {
- breakTypes[breakIndex] = TokenType2.WORD;
+ inputBreaks.get(breakIndex).type = TokenType2.WORD;
continue;
}
/* Interpret the sequence from the first char only. */
final int testChar = inputItem.text.charAt(sequenceIndex);
- breakTypes[breakIndex] = computeCharType(testChar);
+ inputBreaks.get(breakIndex).type = computeCharType(testChar);
}
}
- return breakTypes;
}
@@ -523,35 +504,35 @@
/* First iterate in reverse order. */
for (int breakIndex = this.input.size() - 1; breakIndex > -1; breakIndex --) {
- final TokenType2 currentBreakType = this.input.get(breakIndex);
+ final InputBreak currentBreak = this.input.get(breakIndex);
final TokenType2 previousBreakType = breakIndex == 0 ? preSequenceBreakType :
- this.input.get(breakIndex - 1);
+ this.input.get(breakIndex - 1).type;
final TokenType2 nextBreakType = breakIndex == this.input.size() - 1 ? postSequenceBreakType :
- this.input.get(breakIndex + 1);
+ this.input.get(breakIndex + 1).type;
/* Look for transient or ambiguous punctuation that is immediately followed by word chars.
* That punctuation is considered part of the word. */
- if ((currentBreakType == TokenType2.TRANSIENT_TRAILING_PUNCTUATION
- || currentBreakType == TokenType2.AMBIGUOUS_TRAILING_PUNCTUATION)
+ if ((currentBreak.type == TokenType2.TRANSIENT_TRAILING_PUNCTUATION
+ || currentBreak.type == TokenType2.AMBIGUOUS_TRAILING_PUNCTUATION)
&& nextBreakType == TokenType2.WORD) {
- this.input.set(breakIndex, TokenType2.WORD);
+ currentBreak.type = TokenType2.WORD;
}
/* Look for transient trailing punctuation immediately followed by resolved trailing punctuation.
* Change the transient to resolved. */
- if (currentBreakType == TokenType2.TRANSIENT_TRAILING_PUNCTUATION
+ if (currentBreak.type == TokenType2.TRANSIENT_TRAILING_PUNCTUATION
&& nextBreakType == TokenType2.TRAILING_PUNCTUATION) {
- this.input.set(breakIndex, TokenType2.TRAILING_PUNCTUATION);
+ currentBreak.type = TokenType2.TRAILING_PUNCTUATION;
}
/* Look for ambiguous punctuation immediate followed by whitespace and immediately preceded by trailing
* punctuation. Resolve it to trailing punctuation. */
- if (currentBreakType == TokenType2.AMBIGUOUS_TRAILING_PUNCTUATION
+ if (currentBreak.type == TokenType2.AMBIGUOUS_TRAILING_PUNCTUATION
&& (nextBreakType == TokenType2.BREAK
|| nextBreakType == TokenType2.END)
&& (previousBreakType == TokenType2.TRAILING_PUNCTUATION
|| previousBreakType == TokenType2.TRANSIENT_TRAILING_PUNCTUATION)) {
- this.input.set(breakIndex, TokenType2.TRAILING_PUNCTUATION);
+ currentBreak.type = TokenType2.TRAILING_PUNCTUATION;
}
}
@@ -558,22 +539,22 @@
/* Now iterate in normal order. */
for (int breakIndex = 0; breakIndex < this.input.size(); breakIndex ++) {
- final TokenType2 currentBreakType = this.input.get(breakIndex);
+ final InputBreak currentBreak = this.input.get(breakIndex);
final TokenType2 previousBreakType = breakIndex == 0 ? preSequenceBreakType :
- this.input.get(breakIndex - 1);
+ this.input.get(breakIndex - 1).type;
final TokenType2 nextBreakType = breakIndex == this.input.size() - 1 ? postSequenceBreakType :
- this.input.get(breakIndex + 1);
+ this.input.get(breakIndex + 1).type;
/* If the current type is not a whitespace char, but it is surrounded by whitespace chars, this marks a
* word. */
- if (currentBreakType != TokenType2.BREAK
+ if (currentBreak.type != TokenType2.BREAK
&& previousBreakType == TokenType2.BREAK
&& (nextBreakType == TokenType2.BREAK
|| nextBreakType == TokenType2.END)) {
- this.input.set(breakIndex, TokenType2.WORD);
+ currentBreak.type = TokenType2.WORD;
}
- switch (currentBreakType) {
+ switch (currentBreak.type) {
case TRANSIENT_TRAILING_PUNCTUATION: {
switch (previousBreakType) {
case WORD: {
@@ -580,11 +561,11 @@
switch (nextBreakType) {
case WORD: {
/* This also is part of the word. */
- this.input.set(breakIndex, TokenType2.WORD);
+ currentBreak.type = TokenType2.WORD;
break;
}
default: {
- this.input.set(breakIndex, TokenType2.TRAILING_PUNCTUATION);
+ currentBreak.type = TokenType2.TRAILING_PUNCTUATION;
break;
}
}
@@ -592,13 +573,13 @@
}
case TRAILING_PUNCTUATION: {
/* This is additional trailing punctuation. */
- this.input.set(breakIndex, TokenType2.TRAILING_PUNCTUATION);
+ currentBreak.type = TokenType2.TRAILING_PUNCTUATION;
break;
}
case BREAK: {
/* This cannot be trailing punctuation, so must be the first character in a new word, probably a
* contraction like "'tis" for example. */
- this.input.set(breakIndex, TokenType2.WORD);
+ currentBreak.type = TokenType2.WORD;
break;
}
default:
@@ -612,11 +593,11 @@
switch (previousBreakType) {
case WORD: {
/* This also is part of the word. */
- this.input.set(breakIndex, TokenType2.WORD);
+ currentBreak.type = TokenType2.WORD;
break;
}
default: {
- this.input.set(breakIndex, TokenType2.LEADING_PUNCTUATION);
+ currentBreak.type = TokenType2.LEADING_PUNCTUATION;
break;
}
}
@@ -624,7 +605,7 @@
}
case LEADING_PUNCTUATION: {
/* This is additional leading punctuation. */
- this.input.set(breakIndex, TokenType2.LEADING_PUNCTUATION);
+ currentBreak.type = TokenType2.LEADING_PUNCTUATION;
break;
}
default:
@@ -647,12 +628,12 @@
final TokenType2 postSequenceBreakType = TokenType2.END;
/* Resolve attached leading punctuation. */
for (int breakIndex = 0; breakIndex < this.input.size(); breakIndex ++) {
- final TokenType2 currentBreakType = this.input.get(breakIndex);
+ final InputBreak currentBreak = this.input.get(breakIndex);
final TokenType2 previousBreakType = breakIndex == 0 ? preSequenceBreakType :
- this.input.get(breakIndex - 1);
+ this.input.get(breakIndex - 1).type;
final TokenType2 nextBreakType = breakIndex == this.input.size() - 1 ? postSequenceBreakType :
- this.input.get(breakIndex + 1);
- switch (currentBreakType) {
+ this.input.get(breakIndex + 1).type;
+ switch (currentBreak.type) {
case LEADING_PUNCTUATION: {
switch (previousBreakType) {
case BREAK: {
@@ -659,7 +640,7 @@
switch (nextBreakType) {
case BREAK: {
/* Surrounded by breaks. Treat this as a word. */
- this.input.set(breakIndex, TokenType2.WORD);
+ currentBreak.type = TokenType2.WORD;
break;
}
default: {
@@ -689,12 +670,12 @@
final TokenType2 postSequenceBreakType = TokenType2.END;
/* Resolve attached trailing punctuation. Iterate these in reverse order. */
for (int breakIndex = this.input.size() - 1; breakIndex > 0; breakIndex --) {
- final TokenType2 currentBreakType = this.input.get(breakIndex);
+ final InputBreak currentBreak = this.input.get(breakIndex);
final TokenType2 previousBreakType = breakIndex == 0 ? preSequenceBreakType :
- this.input.get(breakIndex - 1);
+ this.input.get(breakIndex - 1).type;
final TokenType2 nextBreakType = breakIndex == this.input.size() - 1 ? postSequenceBreakType :
- this.input.get(breakIndex + 1);
- switch (currentBreakType) {
+ this.input.get(breakIndex + 1).type;
+ switch (currentBreak.type) {
case TRAILING_PUNCTUATION: {
switch (nextBreakType) {
case BREAK:
@@ -702,7 +683,7 @@
switch (previousBreakType) {
case BREAK: {
/* Surrounded by breaks. Treat this as a word. */
- this.input.set(breakIndex, TokenType2.WORD);
+ currentBreak.type = TokenType2.WORD;
break;
}
default: {
@@ -748,12 +729,14 @@
* @param inputItem The untokenized input for which tokens are to be created.
*/
private void createComputedTokens(final InputItem inputItem) {
+ final List<InputBreak> inputBreaks = inputItem.inputBreaks;
TokenType2 lastBreakType = TokenType2.START;
int nextTokenOffset = 0;
- for (int breakIndex = 0; breakIndex < inputItem.breakTypes.length; breakIndex ++) {
- final TokenType2 currentBreakType = inputItem.breakTypes[breakIndex];
- final int currentOffset = inputItem.breakOffsets.intAt(breakIndex);
+ for (int breakIndex = 0; breakIndex < inputBreaks.size(); breakIndex ++) {
+ final InputBreak inputBreak = inputBreaks.get(breakIndex);
+ final TokenType2 currentBreakType = inputBreak.type;
+ final int currentOffset = inputBreak.offset;
if (lastBreakType != TokenType2.START
&& (currentBreakType != lastBreakType
|| currentBreakType != TokenType2.WORD)) {
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-27 18:13:07
|
Revision: 13281
http://sourceforge.net/p/foray/code/13281
Author: victormote
Date: 2023-09-27 18:13:01 +0000 (Wed, 27 Sep 2023)
Log Message:
-----------
Enable test that now passes.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-27 18:01:26 UTC (rev 13280)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-27 18:13:01 UTC (rev 13281)
@@ -38,7 +38,6 @@
import org.axsl.unicode.block.U2000_General_Punctuation;
import static org.junit.jupiter.api.Assertions.assertEquals;
-import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import java.io.IOException;
@@ -652,7 +651,6 @@
* Test of a phrase in a different writing system, followed by a period.
*/
@Test
- @Disabled("Solution is a work in progress.")
public void testPunctuationAfterDifferentWritingSystem() {
final Lexer4a out = getObjectUnderTest();
out.addUntokenized("That is ", WritingSystem4a.USA);
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-27 18:01:29
|
Revision: 13280
http://sourceforge.net/p/foray/code/13280
Author: victormote
Date: 2023-09-27 18:01:26 +0000 (Wed, 27 Sep 2023)
Log Message:
-----------
When resolving ambiguous token types, process all input items as if they were part of a single sequence.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-27 11:31:03 UTC (rev 13279)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-27 18:01:26 UTC (rev 13280)
@@ -28,6 +28,7 @@
package org.foray.orthography;
+import org.foray.common.MarkedIndexOutOfBoundsException;
import org.foray.primitive.CharacterUtils;
import org.foray.primitive.NumberUtils;
import org.foray.primitive.StringUtils;
@@ -113,13 +114,6 @@
*/
public abstract class Lexer4a implements Lexer {
-
-
- /* TODO: This implementation is pretty ugly at the moment and could stand some major work. Making the interface an
- * Iterator was an afterthough, and, we think, a good one, but this class does not really follow that model yet. */
-
-
-
/**
* Enumeration of possible token types to be used during processing, acting as a kind of extension of
* {@link TokenType}.
@@ -186,7 +180,7 @@
/**
* Stores the items presented as input to this lexer.
*/
- private class Input {
+ private class InputItem {
/** The text. */
private CharSequence text;
@@ -206,6 +200,90 @@
}
/**
+ * Wrapper around the list of input items that provides a flattened view of the content of those items, allowing
+ * them to be treated as a single sequence of tokens.
+ */
+ private class Input {
+
+ /** The list of input items that have been submitted for processing. */
+ private List<InputItem> items = new ArrayList<InputItem>();
+
+ /**
+ * Returns the total number of all breaks found in the input.
+ * @return The total number of all breaks found in the input.
+ */
+ int size() {
+ int size = 0;
+ for (int itemIndex = 0; itemIndex < this.items.size(); itemIndex ++) {
+ final InputItem item = this.items.get(itemIndex);
+ size += item.isWordToken ? 1 : item.breakTypes.length;
+ }
+ return size;
+ }
+
+ /**
+ * Returns the token type of the break at a given index.
+ * @param index The index of the token type that is sought, as if the input items were part of a single
+ * continuous sequence of breaks.
+ * @return The token type at index {@code index}.
+ */
+ TokenType2 get(final int index) {
+ if (index < 0) {
+ throw new MarkedIndexOutOfBoundsException(index, size());
+ }
+ int offset = index;
+ for (int itemIndex = 0; itemIndex < this.items.size(); itemIndex ++) {
+ final InputItem item = this.items.get(itemIndex);
+ if (item.isWordToken) {
+ if (offset == 0) {
+ return TokenType2.WORD;
+ }
+ offset --;
+ } else {
+ if (offset < item.breakTypes.length) {
+ return item.breakTypes[offset];
+ }
+ offset -= item.breakTypes.length;
+ }
+ }
+ throw new MarkedIndexOutOfBoundsException(index, size());
+ }
+
+ /**
+ * Sets the token type of the break at a given index.
+ * @param index The index of the break whose token type is to be set, as if the input items were part of a
+ * single continuous sequence of breaks.
+ * @param newTokenType The new token type for the break at {@code index}.
+ */
+ void set(final int index, final TokenType2 newTokenType) {
+ if (index < 0) {
+ throw new MarkedIndexOutOfBoundsException(index, size());
+ }
+ int offset = index;
+ for (int itemIndex = 0; itemIndex < this.items.size(); itemIndex ++) {
+ final InputItem item = this.items.get(itemIndex);
+ if (item.isWordToken) {
+ if (offset == 0) {
+ if (newTokenType != TokenType2.WORD) {
+ throw new IllegalStateException("Cannot change the token type of an explicit word token.");
+ }
+ return;
+ }
+ offset --;
+ } else {
+ if (offset < item.breakTypes.length) {
+ item.breakTypes[offset] = newTokenType;
+ return;
+ }
+ offset -= item.breakTypes.length;
+ }
+ }
+ throw new MarkedIndexOutOfBoundsException(index, size());
+ }
+
+ }
+
+ /**
* FOray implementation.
*/
private class Token4a implements Lexer.Token {
@@ -245,8 +323,8 @@
// /** The parent server. */
// private OrthographyServer4a server;
- /** The list of input items that have been submitted for processing. */
- private List<Input> input = new ArrayList<Input>();
+ /** The input that has been submitted for processing. */
+ private Input input = new Input();
/** Indicates whether the current content has been tokenized. */
private boolean isTokenized;
@@ -275,11 +353,11 @@
throw new IllegalArgumentException("Cannot add empty content.");
}
- final Input inputItem = new Input();
+ final InputItem inputItem = new InputItem();
inputItem.text = text;
inputItem.writingSystem = writingSystem;
inputItem.isWordToken = false;
- this.input.add(inputItem);
+ this.input.items.add(inputItem);
}
@@ -294,11 +372,11 @@
final Matcher matcher = XML_WHITESPACE_TO_NORMALIZE.matcher(text);
final String normalizedSequence = matcher.replaceAll(StringUtils.SINGLE_SPACE);
- final Input inputItem = new Input();
+ final InputItem inputItem = new InputItem();
inputItem.text = normalizedSequence;
inputItem.writingSystem = writingSystem;
inputItem.isWordToken = true;
- this.input.add(inputItem);
+ this.input.items.add(inputItem);
}
@Override
@@ -308,7 +386,7 @@
@Override
public void clear() {
- this.input.clear();
+ this.input.items.clear();
this.isTokenized = false;
this.output.clear();
this.isLocked = false;
@@ -359,48 +437,38 @@
this.output.clear();
this.nextResultIndex = 0;
this.isTokenized = true;
- if (this.input.size() < 1) {
+ if (this.input.items.size() < 1) {
return;
}
- for (int index = 0; index < this.input.size(); index ++) {
- final Input inputItem = this.input.get(index);
+ for (int index = 0; index < this.input.items.size(); index ++) {
+ final InputItem inputItem = this.input.items.get(index);
if (inputItem.isWordToken) {
/* Word input is already tokenized. */
} else {
/* This is normal untokenized content. */
- tokenizeImplicit(inputItem);
- }
- }
- createTokens();
- }
+ /* First pass is to find all of the breaks that the BreakIterator can find. */
+ inputItem.breakOffsets = findRawBreaks(inputItem.text, inputItem.writingSystem);
- /**
- * After handling explicit tokens, tokenizes the remaining chunk(s) of text using normal implicit tokenization.
- * @param inputItem The input item to be tokenized.
- */
- private void tokenizeImplicit(final Input inputItem) {
- /* First pass is to find all of the breaks that the BreakIterator can find. */
- inputItem.breakOffsets = findRawBreaks(inputItem.text, inputItem.writingSystem);
+ /* The BreakIterator is helpful, but for our purposes does not dig deeply enough.
+ * Second pass is to find out the type of each character that is at a break. */
+ inputItem.breakTypes = findBreakTypes(inputItem);
+ }
+ }
+ /* Third pass treats the entire sequence of input items as a single contiguous sequence of breaks and resolves
+ * as much ambiguity in those breaks as possible. */
- /* The BreakIterator is helpful, but for our purposes does not dig deeply enough.
- * Our purpose is to find where words start and end and to treat all other content as non-word or interword
- * content.
- * So our second pass is to find out the type of each character that is at a break. */
- inputItem.breakTypes = findBreakTypes(inputItem);
+ resolvePossibleIntrawordPunctuation();
+ resolveAttachedLeadingPunctuation();
+ resolveAttachedTrailingPunctuation();
- /* Third pass. Simplify the breakTypes array. */
- /* For normal case (no explicit tokens), the conceptual token immediately previous to the first one is a break
- * char. */
- final TokenType2 preSequenceBreakType = TokenType2.BREAK;
- /* The conceptual token immediately after the last actual token is the end char. */
- final TokenType2 postSequenceBreakType = TokenType2.END;
- filterBreakTypes(inputItem.breakTypes, preSequenceBreakType, postSequenceBreakType);
+ createTokens();
}
+
/**
* Uses a BreakIterator to find the breaks that it detects.
* @param sequence The sequence whose breaks are needed.
@@ -412,10 +480,10 @@
/**
* Determines the type of character that triggered each raw break.
* @param inputItem The input item being tokenized.
- * @return An array with a one-to-one correspondence with {@link Input#breakOffsets}, containing the type of
+ * @return An array with a one-to-one correspondence with {@link InputItem#breakOffsets}, containing the type of
* character at that break.
*/
- protected TokenType2[] findBreakTypes(final Input inputItem) {
+ protected TokenType2[] findBreakTypes(final InputItem inputItem) {
final IntSequence breakOffsets = inputItem.breakOffsets;
final TokenType2[] breakTypes = new TokenType2[breakOffsets.length()];
for (int breakIndex = 0; breakIndex < breakOffsets.length(); breakIndex ++) {
@@ -439,25 +507,6 @@
return breakTypes;
}
- /**
- * Combine and eliminate the elements in the charTypes array.
- * The touchstone here is the known word breaks which are always interword content.
- * Anything between them must be either attached to the word break to become a part of the interword content, or
- * must get coalesced into a "word" whether it is recognized as word content or not. If done properly, every element
- * in the array, when finished, should be either {@link TokenType2#WORD} or {@link TokenType2#BREAK}.
- * Anything not in those two categories will be treated in the final tokenization as {@link TokenType2#WORD}.
- * @param breakTypes The array of charTypes.
- * @param preSequenceBreakType The break type that is conceptually immediately before the first (index 0) break
- * type in {@code breakTypes}.
- * @param postSequenceBreakType The break type that is conceptually immediately after the last break type in
- * {@code breakTypes}.
- */
- protected void filterBreakTypes(final TokenType2[] breakTypes, final TokenType2 preSequenceBreakType,
- final TokenType2 postSequenceBreakType) {
- resolvePossibleIntrawordPunctuation(breakTypes, preSequenceBreakType, postSequenceBreakType);
- resolveAttachedLeadingPunctuation(breakTypes, preSequenceBreakType, postSequenceBreakType);
- resolveAttachedTrailingPunctuation(breakTypes, preSequenceBreakType, postSequenceBreakType);
- }
/**
* <p>Resolves possible intraword punctuation by converting each instance into the resolved type.</p>
@@ -467,21 +516,18 @@
* <li>converting each {@link TokenType2#TRANSIENT_TRAILING_PUNCTUATION} to either a
* {@link TokenType2#WORD} or a {@link TokenType2#TRAILING_PUNCTUATION}.</li>
* </ul>
- * @param breakTypes The array of charTypes.
- * @param preSequenceBreakType The break type that is conceptually immediately before the first (index 0) break
- * type in {@code breakTypes}.
- * @param postSequenceBreakType The break type that is conceptually immediately after the last break type in
- * {@code breakTypes}.
*/
- private void resolvePossibleIntrawordPunctuation(final TokenType2[] breakTypes,
- final TokenType2 preSequenceBreakType, final TokenType2 postSequenceBreakType) {
+ private void resolvePossibleIntrawordPunctuation() {
+ final TokenType2 preSequenceBreakType = TokenType2.BREAK;
+ final TokenType2 postSequenceBreakType = TokenType2.END;
/* First iterate in reverse order. */
- for (int breakIndex = breakTypes.length - 1; breakIndex > -1; breakIndex --) {
- final TokenType2 currentBreakType = breakTypes[breakIndex];
- final TokenType2 previousBreakType = breakIndex == 0 ? preSequenceBreakType : breakTypes[breakIndex - 1];
- final TokenType2 nextBreakType = breakIndex == breakTypes.length - 1 ? postSequenceBreakType
- : breakTypes[breakIndex + 1];
+ for (int breakIndex = this.input.size() - 1; breakIndex > -1; breakIndex --) {
+ final TokenType2 currentBreakType = this.input.get(breakIndex);
+ final TokenType2 previousBreakType = breakIndex == 0 ? preSequenceBreakType :
+ this.input.get(breakIndex - 1);
+ final TokenType2 nextBreakType = breakIndex == this.input.size() - 1 ? postSequenceBreakType :
+ this.input.get(breakIndex + 1);
/* Look for transient or ambiguous punctuation that is immediately followed by word chars.
* That punctuation is considered part of the word. */
@@ -488,7 +534,7 @@
if ((currentBreakType == TokenType2.TRANSIENT_TRAILING_PUNCTUATION
|| currentBreakType == TokenType2.AMBIGUOUS_TRAILING_PUNCTUATION)
&& nextBreakType == TokenType2.WORD) {
- breakTypes[breakIndex] = TokenType2.WORD;
+ this.input.set(breakIndex, TokenType2.WORD);
}
/* Look for transient trailing punctuation immediately followed by resolved trailing punctuation.
@@ -495,7 +541,7 @@
* Change the transient to resolved. */
if (currentBreakType == TokenType2.TRANSIENT_TRAILING_PUNCTUATION
&& nextBreakType == TokenType2.TRAILING_PUNCTUATION) {
- breakTypes[breakIndex] = TokenType2.TRAILING_PUNCTUATION;
+ this.input.set(breakIndex, TokenType2.TRAILING_PUNCTUATION);
}
/* Look for ambiguous punctuation immediately followed by whitespace and immediately preceded by trailing
@@ -505,17 +551,18 @@
|| nextBreakType == TokenType2.END)
&& (previousBreakType == TokenType2.TRAILING_PUNCTUATION
|| previousBreakType == TokenType2.TRANSIENT_TRAILING_PUNCTUATION)) {
- breakTypes[breakIndex] = TokenType2.TRAILING_PUNCTUATION;
+ this.input.set(breakIndex, TokenType2.TRAILING_PUNCTUATION);
}
}
/* Now iterate in normal order. */
- for (int breakIndex = 0; breakIndex < breakTypes.length; breakIndex ++) {
- final TokenType2 currentBreakType = breakTypes[breakIndex];
- final TokenType2 previousBreakType = breakIndex == 0 ? preSequenceBreakType : breakTypes[breakIndex - 1];
- final TokenType2 nextBreakType = breakIndex == breakTypes.length - 1 ? postSequenceBreakType
- : breakTypes[breakIndex + 1];
+ for (int breakIndex = 0; breakIndex < this.input.size(); breakIndex ++) {
+ final TokenType2 currentBreakType = this.input.get(breakIndex);
+ final TokenType2 previousBreakType = breakIndex == 0 ? preSequenceBreakType :
+ this.input.get(breakIndex - 1);
+ final TokenType2 nextBreakType = breakIndex == this.input.size() - 1 ? postSequenceBreakType :
+ this.input.get(breakIndex + 1);
/* If the current type is not a whitespace char, but it is surrounded by whitespace chars, this marks a
* word. */
@@ -523,7 +570,7 @@
&& previousBreakType == TokenType2.BREAK
&& (nextBreakType == TokenType2.BREAK
|| nextBreakType == TokenType2.END)) {
- breakTypes[breakIndex] = TokenType2.WORD;
+ this.input.set(breakIndex, TokenType2.WORD);
}
switch (currentBreakType) {
@@ -533,11 +580,11 @@
switch (nextBreakType) {
case WORD: {
/* This also is part of the word. */
- breakTypes[breakIndex] = TokenType2.WORD;
+ this.input.set(breakIndex, TokenType2.WORD);
break;
}
default: {
- breakTypes[breakIndex] = TokenType2.TRAILING_PUNCTUATION;
+ this.input.set(breakIndex, TokenType2.TRAILING_PUNCTUATION);
break;
}
}
@@ -545,13 +592,13 @@
}
case TRAILING_PUNCTUATION: {
/* This is additional trailing punctuation. */
- breakTypes[breakIndex] = TokenType2.TRAILING_PUNCTUATION;
+ this.input.set(breakIndex, TokenType2.TRAILING_PUNCTUATION);
break;
}
case BREAK: {
/* This cannot be trailing punctuation, so must be the first character in a new word, probably a
* contraction like "'tis" for example. */
- breakTypes[breakIndex] = TokenType2.WORD;
+ this.input.set(breakIndex, TokenType2.WORD);
break;
}
default:
@@ -565,11 +612,11 @@
switch (previousBreakType) {
case WORD: {
/* This also is part of the word. */
- breakTypes[breakIndex] = TokenType2.WORD;
+ this.input.set(breakIndex, TokenType2.WORD);
break;
}
default: {
- breakTypes[breakIndex] = TokenType2.LEADING_PUNCTUATION;
+ this.input.set(breakIndex, TokenType2.LEADING_PUNCTUATION);
break;
}
}
@@ -577,7 +624,7 @@
}
case LEADING_PUNCTUATION: {
/* This is additional leading punctuation. */
- breakTypes[breakIndex] = TokenType2.LEADING_PUNCTUATION;
+ this.input.set(breakIndex, TokenType2.LEADING_PUNCTUATION);
break;
}
default:
@@ -594,20 +641,17 @@
/**
* Resolves attached leading punctuation.
- * @param breakTypes The array of charTypes.
- * @param preSequenceBreakType The break type that is conceptually immediately before the first (index 0) break
- * type in {@code breakTypes}.
- * @param postSequenceBreakType The break type that is conceptually immediately after the last break type in
- * {@code breakTypes}.
*/
- private void resolveAttachedLeadingPunctuation(final TokenType2[] breakTypes, final TokenType2 preSequenceBreakType,
- final TokenType2 postSequenceBreakType) {
+ private void resolveAttachedLeadingPunctuation() {
+ final TokenType2 preSequenceBreakType = TokenType2.BREAK;
+ final TokenType2 postSequenceBreakType = TokenType2.END;
/* Resolve attached leading punctuation. */
- for (int breakIndex = 0; breakIndex < breakTypes.length; breakIndex ++) {
- final TokenType2 currentBreakType = breakTypes[breakIndex];
- final TokenType2 previousBreakType = breakIndex == 0 ? preSequenceBreakType : breakTypes[breakIndex - 1];
- final TokenType2 nextBreakType = breakIndex == breakTypes.length - 1 ? postSequenceBreakType
- : breakTypes[breakIndex + 1];
+ for (int breakIndex = 0; breakIndex < this.input.size(); breakIndex ++) {
+ final TokenType2 currentBreakType = this.input.get(breakIndex);
+ final TokenType2 previousBreakType = breakIndex == 0 ? preSequenceBreakType :
+ this.input.get(breakIndex - 1);
+ final TokenType2 nextBreakType = breakIndex == this.input.size() - 1 ? postSequenceBreakType :
+ this.input.get(breakIndex + 1);
switch (currentBreakType) {
case LEADING_PUNCTUATION: {
switch (previousBreakType) {
@@ -615,7 +659,7 @@
switch (nextBreakType) {
case BREAK: {
/* Surrounded by breaks. Treat this as a word. */
- breakTypes[breakIndex] = TokenType2.WORD;
+ this.input.set(breakIndex, TokenType2.WORD);
break;
}
default: {
@@ -639,20 +683,17 @@
/**
* Resolves attached trailing punctuation.
- * @param breakTypes The array of charTypes.
- * @param preSequenceBreakType The break type that is conceptually immediately before the first (index 0) break
- * type in {@code breakTypes}.
- * @param postSequenceBreakType The break type that is conceptually immediately after the last break type in
- * {@code breakTypes}.
*/
- private void resolveAttachedTrailingPunctuation(final TokenType2[] breakTypes,
- final TokenType2 preSequenceBreakType, final TokenType2 postSequenceBreakType) {
+ private void resolveAttachedTrailingPunctuation() {
+ final TokenType2 preSequenceBreakType = TokenType2.BREAK;
+ final TokenType2 postSequenceBreakType = TokenType2.END;
/* Resolve attached trailing punctuation. Iterate these in reverse order. */
- for (int breakIndex = breakTypes.length - 1; breakIndex > 0; breakIndex --) {
- final TokenType2 currentBreakType = breakTypes[breakIndex];
- final TokenType2 previousBreakType = breakIndex == 0 ? preSequenceBreakType : breakTypes[breakIndex - 1];
- final TokenType2 nextBreakType = breakIndex == breakTypes.length - 1 ? postSequenceBreakType
- : breakTypes[breakIndex + 1];
+ for (int breakIndex = this.input.size() - 1; breakIndex > 0; breakIndex --) {
+ final TokenType2 currentBreakType = this.input.get(breakIndex);
+ final TokenType2 previousBreakType = breakIndex == 0 ? preSequenceBreakType :
+ this.input.get(breakIndex - 1);
+ final TokenType2 nextBreakType = breakIndex == this.input.size() - 1 ? postSequenceBreakType :
+ this.input.get(breakIndex + 1);
switch (currentBreakType) {
case TRAILING_PUNCTUATION: {
switch (nextBreakType) {
@@ -661,7 +702,7 @@
switch (previousBreakType) {
case BREAK: {
/* Surrounded by breaks. Treat this as a word. */
- breakTypes[breakIndex] = TokenType2.WORD;
+ this.input.set(breakIndex, TokenType2.WORD);
break;
}
default: {
@@ -687,8 +728,8 @@
* Create the token list.
*/
protected void createTokens() {
- for (int index = 0; index < input.size(); index ++) {
- final Input inputItem = input.get(index);
+ for (int index = 0; index < input.items.size(); index ++) {
+ final InputItem inputItem = input.items.get(index);
if (inputItem.isWordToken) {
final Token4a token = new Token4a();
token.text = inputItem.text;
@@ -706,7 +747,7 @@
* Creates and adds the tokens for an item of untokenized input.
* @param inputItem The untokenized input for which tokens are to be created.
*/
- private void createComputedTokens(final Input inputItem) {
+ private void createComputedTokens(final InputItem inputItem) {
TokenType2 lastBreakType = TokenType2.START;
int nextTokenOffset = 0;
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-27 11:31:06
|
Revision: 13279
http://sourceforge.net/p/foray/code/13279
Author: victormote
Date: 2023-09-27 11:31:03 +0000 (Wed, 27 Sep 2023)
Log Message:
-----------
Prevent empty content from being added to the Lexer.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-27 10:49:15 UTC (rev 13278)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-27 11:31:03 UTC (rev 13279)
@@ -267,16 +267,16 @@
}
@Override
- public void addUntokenized(final CharSequence sequence, final WritingSystem writingSystem) {
+ public void addUntokenized(final CharSequence text, final WritingSystem writingSystem) {
if (this.isLocked) {
throw new IllegalStateException("This lexer is locked.");
}
- if (sequence.length() < 1) {
- return;
+ if (text.length() < 1) {
+ throw new IllegalArgumentException("Cannot add empty content.");
}
final Input inputItem = new Input();
- inputItem.text = sequence;
+ inputItem.text = text;
inputItem.writingSystem = writingSystem;
inputItem.isWordToken = false;
this.input.add(inputItem);
@@ -284,14 +284,15 @@
}
@Override
- public void addWordToken(final CharSequence sequence, final WritingSystem writingSystem) {
+ public void addWordToken(final CharSequence text, final WritingSystem writingSystem) {
if (this.isLocked) {
throw new IllegalStateException("This lexer is locked.");
}
- if (sequence.length() < 1) {
- return;
+ if (text.length() < 1) {
+ throw new IllegalArgumentException("Cannot add empty content.");
}
- final Matcher matcher = XML_WHITESPACE_TO_NORMALIZE.matcher(sequence);
+
+ final Matcher matcher = XML_WHITESPACE_TO_NORMALIZE.matcher(text);
final String normalizedSequence = matcher.replaceAll(StringUtils.SINGLE_SPACE);
final Input inputItem = new Input();
inputItem.text = normalizedSequence;
@@ -362,16 +363,13 @@
return;
}
- /* The outer loop handles the pre-processed tokens passed in addWordToken(CharSequence). */
for (int index = 0; index < this.input.size(); index ++) {
final Input inputItem = this.input.get(index);
- final CharSequence sequence = inputItem.text;
- if (sequence.length() > 0) {
- if (inputItem.isWordToken) {
- } else {
- /* This is normal untokenized content. */
- tokenizeImplicit(inputItem);
- }
+ if (inputItem.isWordToken) {
+ /* Word input is already tokenized. */
+ } else {
+ /* This is normal untokenized content. */
+ tokenizeImplicit(inputItem);
}
}
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java 2023-09-27 10:49:15 UTC (rev 13278)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java 2023-09-27 11:31:03 UTC (rev 13279)
@@ -332,7 +332,10 @@
textAccumulator.deleteCharAt(0);
}
- lexer.addUntokenized(getAndClearText(), writingSystem);
+ final String text = getAndClearText();
+ if (text.length() > 0) {
+ lexer.addUntokenized(text, writingSystem);
+ }
lexer.lock();
while (lexer.hasNext()) {
final Lexer.Token token = lexer.next();
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-27 10:49:17
|
Revision: 13278
http://sourceforge.net/p/foray/code/13278
Author: victormote
Date: 2023-09-27 10:49:15 +0000 (Wed, 27 Sep 2023)
Log Message:
-----------
Defer token creation until all breaks have been computed.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-27 00:22:29 UTC (rev 13277)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-27 10:49:15 UTC (rev 13278)
@@ -368,28 +368,22 @@
final CharSequence sequence = inputItem.text;
if (sequence.length() > 0) {
if (inputItem.isWordToken) {
- final Token4a token = new Token4a();
- token.text = sequence;
- token.type = TokenType.WORD;
- token.writingSystem = inputItem.writingSystem;
- this.output.add(token);
} else {
/* This is normal untokenized content. */
- tokenizeImplicit(index);
+ tokenizeImplicit(inputItem);
}
}
}
+
+ createTokens();
}
/**
* After handling explicit tokens, tokenizes the remaining chunk(s) of text using normal implicit tokenization.
- * @param index The index into {@link #input} that is being processed.
- * By passing the index instead of the item itself, we can more easily see what, if anything, comes before or
- * after this input item.
+ * @param inputItem The input item to be tokenized.
*/
- private void tokenizeImplicit(final int index) {
- final Input inputItem = this.input.get(index);
+ private void tokenizeImplicit(final Input inputItem) {
/* First pass is to find all of the breaks that the BreakIterator can find. */
inputItem.breakOffsets = findRawBreaks(inputItem.text, inputItem.writingSystem);
@@ -407,9 +401,6 @@
/* The conceptual token immediately after the last actual token is the end char. */
final TokenType2 postSequenceBreakType = TokenType2.END;
filterBreakTypes(inputItem.breakTypes, preSequenceBreakType, postSequenceBreakType);
-
- /* The fourth step iterates over the resolved break types and turns them into tokens. */
- createImplicitTokens(inputItem);
}
/**
@@ -696,9 +687,28 @@
/**
* Create the token list.
- * @param inputItem The input item whose raw break information should be converted to tokens.
*/
- protected void createImplicitTokens(final Input inputItem) {
+ protected void createTokens() {
+ for (int index = 0; index < input.size(); index ++) {
+ final Input inputItem = input.get(index);
+ if (inputItem.isWordToken) {
+ final Token4a token = new Token4a();
+ token.text = inputItem.text;
+ token.type = TokenType.WORD;
+ token.writingSystem = inputItem.writingSystem;
+ this.output.add(token);
+ } else {
+ createComputedTokens(inputItem);
+ }
+ }
+ }
+
+
+ /**
+ * Creates and adds the tokens for an item of untokenized input.
+ * @param inputItem The untokenized input for which tokens are to be created.
+ */
+ private void createComputedTokens(final Input inputItem) {
TokenType2 lastBreakType = TokenType2.START;
int nextTokenOffset = 0;
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-27 00:22:31
|
Revision: 13277
http://sourceforge.net/p/foray/code/13277
Author: victormote
Date: 2023-09-27 00:22:29 +0000 (Wed, 27 Sep 2023)
Log Message:
-----------
Minor cleanup.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-27 00:09:41 UTC (rev 13276)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-27 00:22:29 UTC (rev 13277)
@@ -398,7 +398,7 @@
* Our purpose is to find where words start and end and to treat all other content as non-word or interword
* content.
* So our second pass is to find out the type of each character that is at a break. */
- inputItem.breakTypes = findBreakTypes(inputItem.text, inputItem.breakOffsets);
+ inputItem.breakTypes = findBreakTypes(inputItem);
/* Third pass. Simplify the breakTypes array. */
/* For normal case (no explicit tokens), the conceptual token immediately previous to the first one is a break
@@ -422,28 +422,28 @@
/**
* Determines the type of character that triggered each raw break.
- * @param sequence The characters being tokenized.
- * @param rawBreaks The raw breaks.
- * @return An array with a one-to-one correspondence with {@code rawBreaks}, containing the type of character at
- * that break.
+ * @param inputItem The input item being tokenized.
+ * @return An array with a one-to-one correspondence with {@link Input#breakOffsets}, containing the type of
+ * character at that break.
*/
- protected TokenType2[] findBreakTypes(final CharSequence sequence, final IntSequence rawBreaks) {
- final TokenType2[] breakTypes = new TokenType2[rawBreaks.length()];
- for (int breakIndex = 0; breakIndex < rawBreaks.length(); breakIndex ++) {
- if (breakIndex >= rawBreaks.length() - 1) {
+ protected TokenType2[] findBreakTypes(final Input inputItem) {
+ final IntSequence breakOffsets = inputItem.breakOffsets;
+ final TokenType2[] breakTypes = new TokenType2[breakOffsets.length()];
+ for (int breakIndex = 0; breakIndex < breakOffsets.length(); breakIndex ++) {
+ if (breakIndex >= breakOffsets.length() - 1) {
breakTypes[breakIndex] = TokenType2.END;
} else {
- final int sequenceIndex = rawBreaks.intAt(breakIndex);
- final int end = rawBreaks.intAt(breakIndex + 1);
+ final int sequenceIndex = breakOffsets.intAt(breakIndex);
+ final int end = breakOffsets.intAt(breakIndex + 1);
/* Special cases where the first char alone does not tell the whole story. */
- if (NumberUtils.isArabicNumber(sequence, sequenceIndex, end)) {
+ if (NumberUtils.isArabicNumber(inputItem.text, sequenceIndex, end)) {
breakTypes[breakIndex] = TokenType2.WORD;
continue;
}
/* Interpret the sequence from the first char only. */
- final int testChar = sequence.charAt(sequenceIndex);
+ final int testChar = inputItem.text.charAt(sequenceIndex);
breakTypes[breakIndex] = computeCharType(testChar);
}
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|