foray-commit Mailing List for FOray (Page 25)
Modular XSL-FO Implementation for Java.
Status: Alpha
Brought to you by:
victormote
You can subscribe to this list here.
| 2006 |
Jan
|
Feb
|
Mar
(139) |
Apr
(98) |
May
(250) |
Jun
(394) |
Jul
(84) |
Aug
(13) |
Sep
(420) |
Oct
(186) |
Nov
(1) |
Dec
(3) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2007 |
Jan
(108) |
Feb
(202) |
Mar
(291) |
Apr
(247) |
May
(374) |
Jun
(227) |
Jul
(231) |
Aug
(60) |
Sep
(31) |
Oct
(45) |
Nov
(18) |
Dec
|
| 2008 |
Jan
(38) |
Feb
(71) |
Mar
(142) |
Apr
|
May
(59) |
Jun
(6) |
Jul
(10) |
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
| 2009 |
Jan
(12) |
Feb
(4) |
Mar
(88) |
Apr
(121) |
May
(17) |
Jun
(30) |
Jul
|
Aug
(5) |
Sep
|
Oct
(1) |
Nov
|
Dec
|
| 2010 |
Jan
(11) |
Feb
(76) |
Mar
(11) |
Apr
|
May
(11) |
Jun
|
Jul
|
Aug
(44) |
Sep
(14) |
Oct
(7) |
Nov
|
Dec
|
| 2011 |
Jan
|
Feb
|
Mar
|
Apr
|
May
(9) |
Jun
|
Jul
|
Aug
|
Sep
|
Oct
(10) |
Nov
|
Dec
|
| 2012 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
(3) |
Jul
(4) |
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
| 2016 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
(168) |
| 2017 |
Jan
(77) |
Feb
(11) |
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
| 2018 |
Jan
|
Feb
|
Mar
(1) |
Apr
(6) |
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
| 2019 |
Jan
|
Feb
(88) |
Mar
(118) |
Apr
(1) |
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
| 2020 |
Jan
|
Feb
|
Mar
|
Apr
|
May
(6) |
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
(141) |
| 2021 |
Jan
(170) |
Feb
(20) |
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
|
Sep
|
Oct
(62) |
Nov
(189) |
Dec
(162) |
| 2022 |
Jan
(201) |
Feb
(118) |
Mar
(8) |
Apr
|
May
(2) |
Jun
(47) |
Jul
(19) |
Aug
(14) |
Sep
(3) |
Oct
|
Nov
(28) |
Dec
(235) |
| 2023 |
Jan
(112) |
Feb
(23) |
Mar
(2) |
Apr
(2) |
May
|
Jun
(1) |
Jul
|
Aug
(70) |
Sep
(92) |
Oct
(20) |
Nov
(1) |
Dec
(1) |
| 2024 |
Jan
|
Feb
|
Mar
(1) |
Apr
(1) |
May
(14) |
Jun
(11) |
Jul
(1) |
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
| 2025 |
Jan
(10) |
Feb
(29) |
Mar
|
Apr
(162) |
May
(245) |
Jun
(83) |
Jul
|
Aug
(1) |
Sep
|
Oct
|
Nov
|
Dec
|
|
From: <vic...@us...> - 2023-09-27 00:09:45
|
Revision: 13276
http://sourceforge.net/p/foray/code/13276
Author: victormote
Date: 2023-09-27 00:09:41 +0000 (Wed, 27 Sep 2023)
Log Message:
-----------
Store the intermediate lexing results with each input item, so that all input items can be processed as contiguous items.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-26 23:54:59 UTC (rev 13275)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-27 00:09:41 UTC (rev 13276)
@@ -197,6 +197,12 @@
/** Indicates if this is a word token. If false, it is untokenized input. */
private boolean isWordToken;
+ /** The indexes into {@link #text} where each computed break occurs. */
+ private IntSequence breakOffsets;
+
+ /** The type of each break in {@link #breakOffsets}. */
+ private TokenType2[] breakTypes;
+
}
/**
@@ -385,7 +391,7 @@
private void tokenizeImplicit(final int index) {
final Input inputItem = this.input.get(index);
/* First pass is to find all of the breaks that the BreakIterator can find. */
- final IntSequence rawBreaks = findRawBreaks(inputItem.text, inputItem.writingSystem);
+ inputItem.breakOffsets = findRawBreaks(inputItem.text, inputItem.writingSystem);
/* The BreakIterator is helpful, but for our purposes does not dig deeply enough.
@@ -392,7 +398,7 @@
* Our purpose is to find where words start and end and to treat all other content as non-word or interword
* content.
* So our second pass is to find out the type of each character that is at a break. */
- final TokenType2[] breakTypes = findBreakTypes(inputItem.text, rawBreaks);
+ inputItem.breakTypes = findBreakTypes(inputItem.text, inputItem.breakOffsets);
/* Third pass. Simplify the breakTypes array. */
/* For normal case (no explicit tokens), the conceptual token immediately previous to the first one is a break
@@ -400,10 +406,10 @@
final TokenType2 preSequenceBreakType = TokenType2.BREAK;
/* The conceptual token immediately after the last actual token is the end char. */
final TokenType2 postSequenceBreakType = TokenType2.END;
- filterBreakTypes(breakTypes, preSequenceBreakType, postSequenceBreakType);
+ filterBreakTypes(inputItem.breakTypes, preSequenceBreakType, postSequenceBreakType);
/* The fourth step iterates over the resolved break types and turns them into tokens. */
- createImplicitTokens(inputItem.text, rawBreaks, breakTypes, inputItem.writingSystem);
+ createImplicitTokens(inputItem);
}
/**
@@ -690,26 +696,22 @@
/**
* Create the token list.
- * @param sequence The sequence of characters being tokenized.
- * @param rawOffsets The offsets of the breaks found by the break iterator.
- * @param breakTypes The filtered break types.
- * @param writingSystem The writing system for {@code sequence}.
+ * @param inputItem The input item whose raw break information should be converted to tokens.
*/
- protected void createImplicitTokens(final CharSequence sequence, final IntSequence rawOffsets,
- final TokenType2[] breakTypes, final WritingSystem writingSystem) {
+ protected void createImplicitTokens(final Input inputItem) {
TokenType2 lastBreakType = TokenType2.START;
int nextTokenOffset = 0;
- for (int breakIndex = 0; breakIndex < breakTypes.length; breakIndex ++) {
- final TokenType2 currentBreakType = breakTypes[breakIndex];
- final int currentOffset = rawOffsets.intAt(breakIndex);
+ for (int breakIndex = 0; breakIndex < inputItem.breakTypes.length; breakIndex ++) {
+ final TokenType2 currentBreakType = inputItem.breakTypes[breakIndex];
+ final int currentOffset = inputItem.breakOffsets.intAt(breakIndex);
if (lastBreakType != TokenType2.START
&& (currentBreakType != lastBreakType
|| currentBreakType != TokenType2.WORD)) {
final Token4a token = new Token4a();
- token.text = sequence.subSequence(nextTokenOffset, currentOffset);
+ token.text = inputItem.text.subSequence(nextTokenOffset, currentOffset);
token.type = lastBreakType.wrappedTokenType;
- token.writingSystem = writingSystem;
+ token.writingSystem = inputItem.writingSystem;
this.output.add(token);
nextTokenOffset = currentOffset;
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-26 23:55:02
|
Revision: 13275
http://sourceforge.net/p/foray/code/13275
Author: victormote
Date: 2023-09-26 23:54:59 +0000 (Tue, 26 Sep 2023)
Log Message:
-----------
Remove some no-longer-needed context parameters. Pass the input index around instead of the input item it references.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-26 23:38:28 UTC (rev 13274)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-26 23:54:59 UTC (rev 13275)
@@ -369,7 +369,7 @@
this.output.add(token);
} else {
/* This is normal untokenized content. */
- tokenizeImplicit(sequence, inputItem.writingSystem, true, true);
+ tokenizeImplicit(index);
}
}
}
@@ -378,15 +378,14 @@
/**
* After handling explicit tokens, tokenizes the remaining chunk(s) of text using normal implicit tokenization.
- * @param sequence The character sequence containing the untokenized text.
- * @param writingSystem The writing system to be used to tokenize {@code sequence}.
- * @param isFirstChunk Indicates whether this is the first chunk being implicitly tokenized.
- * @param isLastChunk Indicates whether this is the last chunk being implicitly tokenized.
+ * @param index The index into {@link #input} that is being processed.
+ * By passing the index instead of the item itself, we can more easily see what, if anything, comes before or
+ * after this input item.
*/
- private void tokenizeImplicit(final CharSequence sequence, final WritingSystem writingSystem,
- final boolean isFirstChunk, final boolean isLastChunk) {
+ private void tokenizeImplicit(final int index) {
+ final Input inputItem = this.input.get(index);
/* First pass is to find all of the breaks that the BreakIterator can find. */
- final IntSequence rawBreaks = findRawBreaks(sequence, writingSystem);
+ final IntSequence rawBreaks = findRawBreaks(inputItem.text, inputItem.writingSystem);
/* The BreakIterator is helpful, but for our purposes does not dig deeply enough.
@@ -393,23 +392,18 @@
* Our purpose is to find where words start and end and to treat all other content as non-word or interword
* content.
* So our second pass is to find out the type of each character that is at a break. */
- final TokenType2[] breakTypes = findBreakTypes(sequence, rawBreaks, isLastChunk);
+ final TokenType2[] breakTypes = findBreakTypes(inputItem.text, rawBreaks);
/* Third pass. Simplify the breakTypes array. */
/* For normal case (no explicit tokens), the conceptual token immediately previous to the first one is a break
* char. */
- TokenType2 preSequenceBreakType = TokenType2.BREAK;
- if (! isFirstChunk) {
- /* If this is not the first item being implicitly tokenized, an explicit token (a word) is the previous
- * token. */
- preSequenceBreakType = TokenType2.WORD;
- }
+ final TokenType2 preSequenceBreakType = TokenType2.BREAK;
/* The conceptual token immediately after the last actual token is the end char. */
final TokenType2 postSequenceBreakType = TokenType2.END;
filterBreakTypes(breakTypes, preSequenceBreakType, postSequenceBreakType);
/* The fourth step iterates over the resolved break types and turns them into tokens. */
- createImplicitTokens(sequence, rawBreaks, breakTypes, writingSystem);
+ createImplicitTokens(inputItem.text, rawBreaks, breakTypes, inputItem.writingSystem);
}
/**
@@ -424,21 +418,14 @@
* Determines the type of character that triggered each raw break.
* @param sequence The characters being tokenized.
* @param rawBreaks The raw breaks.
- * @param isLastChunk Indicates whether the chunk being tokenized is the last chunk of the big sequence.
* @return An array with a one-to-one correspondence with {@code rawBreaks}, containing the type of character at
* that break.
*/
- protected TokenType2[] findBreakTypes(final CharSequence sequence, final IntSequence rawBreaks,
- final boolean isLastChunk) {
+ protected TokenType2[] findBreakTypes(final CharSequence sequence, final IntSequence rawBreaks) {
final TokenType2[] breakTypes = new TokenType2[rawBreaks.length()];
for (int breakIndex = 0; breakIndex < rawBreaks.length(); breakIndex ++) {
if (breakIndex >= rawBreaks.length() - 1) {
- if (isLastChunk) {
- breakTypes[breakIndex] = TokenType2.END;
- } else {
- /* If this is not the last chunk, then the next chunk must be an explicit token, which is a word. */
- breakTypes[breakIndex] = TokenType2.WORD;
- }
+ breakTypes[breakIndex] = TokenType2.END;
} else {
final int sequenceIndex = rawBreaks.intAt(breakIndex);
final int end = rawBreaks.intAt(breakIndex + 1);
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-26 23:38:30
|
Revision: 13274
http://sourceforge.net/p/foray/code/13274
Author: victormote
Date: 2023-09-26 23:38:28 +0000 (Tue, 26 Sep 2023)
Log Message:
-----------
Create the tokens immediately instead of accumulating their components.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-26 10:30:30 UTC (rev 13273)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-26 23:38:28 UTC (rev 13274)
@@ -248,15 +248,9 @@
/** The index into the result arrays that will be used by the next call to {@link #next()}. */
private int nextResultIndex;
- /** The array of tokens in the result. */
- private List<CharSequence> resultTextItems = new ArrayList<CharSequence>();
+ /** The list of output tokens to be returned. */
+ private List<Token4a> output = new ArrayList<Token4a>();
- /** The array of writing systems in the result. */
- private List<WritingSystem> resultWritingSystems = new ArrayList<WritingSystem>();
-
- /** The array of token types in the result. */
- private List<TokenType> resultTypes = new ArrayList<TokenType>();
-
/**
* Constructor.
* @param server The parent server.
@@ -309,9 +303,7 @@
public void clear() {
this.input.clear();
this.isTokenized = false;
- this.resultTextItems.clear();
- this.resultWritingSystems.clear();
- this.resultTypes.clear();
+ this.output.clear();
this.isLocked = false;
}
@@ -326,10 +318,7 @@
if (! hasNext()) {
throw new NoSuchElementException();
}
- final Token4a returnToken = new Token4a();
- returnToken.text = this.resultTextItems.get(this.nextResultIndex);
- returnToken.type = this.resultTypes.get(this.nextResultIndex);
- returnToken.writingSystem = this.resultWritingSystems.get(this.nextResultIndex);
+ final Token4a returnToken = this.output.get(this.nextResultIndex);
this.nextResultIndex ++;
return returnToken;
}
@@ -353,7 +342,7 @@
if (! this.isTokenized) {
process();
}
- return this.nextResultIndex < this.resultTextItems.size();
+ return this.nextResultIndex < this.output.size();
}
/**
@@ -360,9 +349,7 @@
* Tokenize the content and create the output.
*/
private void process() {
- this.resultTextItems.clear();
- this.resultTypes.clear();
- this.resultWritingSystems.clear();
+ this.output.clear();
this.nextResultIndex = 0;
this.isTokenized = true;
if (this.input.size() < 1) {
@@ -375,9 +362,11 @@
final CharSequence sequence = inputItem.text;
if (sequence.length() > 0) {
if (inputItem.isWordToken) {
- this.resultTextItems.add(sequence);
- this.resultTypes.add(TokenType.WORD);
- this.resultWritingSystems.add(inputItem.writingSystem);
+ final Token4a token = new Token4a();
+ token.text = sequence;
+ token.type = TokenType.WORD;
+ token.writingSystem = inputItem.writingSystem;
+ this.output.add(token);
} else {
/* This is normal untokenized content. */
tokenizeImplicit(sequence, inputItem.writingSystem, true, true);
@@ -730,9 +719,11 @@
if (lastBreakType != TokenType2.START
&& (currentBreakType != lastBreakType
|| currentBreakType != TokenType2.WORD)) {
- this.resultTextItems.add(sequence.subSequence(nextTokenOffset, currentOffset));
- this.resultTypes.add(lastBreakType.wrappedTokenType);
- this.resultWritingSystems.add(writingSystem);
+ final Token4a token = new Token4a();
+ token.text = sequence.subSequence(nextTokenOffset, currentOffset);
+ token.type = lastBreakType.wrappedTokenType;
+ token.writingSystem = writingSystem;
+ this.output.add(token);
nextTokenOffset = currentOffset;
}
lastBreakType = currentBreakType;
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-26 10:30:32
|
Revision: 13273
http://sourceforge.net/p/foray/code/13273
Author: victormote
Date: 2023-09-26 10:30:30 +0000 (Tue, 26 Sep 2023)
Log Message:
-----------
Add test of adjacent tokens in different writing systems.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-26 10:23:39 UTC (rev 13272)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-26 10:30:30 UTC (rev 13273)
@@ -38,6 +38,7 @@
import org.axsl.unicode.block.U2000_General_Punctuation;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import java.io.IOException;
@@ -647,4 +648,28 @@
testToken(actual.get(9), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
+ /**
+ * Test of a phrase in a different writing system, followed by a period.
+ */
+ @Test
+ @Disabled("Solution is a work in progress.")
+ public void testPunctuationAfterDifferentWritingSystem() {
+ final Lexer4a out = getObjectUnderTest();
+ out.addUntokenized("That is ", WritingSystem4a.USA);
+ out.addUntokenized("quid pro quo", WritingSystem4a.LATIN);
+ out.addUntokenized(".", WritingSystem4a.USA);
+ final List<Lexer.Token> actual = tokenize();
+ assertEquals(10, actual.size());
+ testToken(actual.get(0), "That", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
+ testToken(actual.get(2), "is", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.BREAK, WritingSystem4a.USA);
+ testToken(actual.get(4), "quid", TokenType.WORD, WritingSystem4a.LATIN);
+ testToken(actual.get(5), " ", TokenType.BREAK, WritingSystem4a.LATIN);
+ testToken(actual.get(6), "pro", TokenType.WORD, WritingSystem4a.LATIN);
+ testToken(actual.get(7), " ", TokenType.BREAK, WritingSystem4a.LATIN);
+ testToken(actual.get(8), "quo", TokenType.WORD, WritingSystem4a.LATIN);
+ testToken(actual.get(9), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ }
+
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-26 10:23:42
|
Revision: 13272
http://sourceforge.net/p/foray/code/13272
Author: victormote
Date: 2023-09-26 10:23:39 +0000 (Tue, 26 Sep 2023)
Log Message:
-----------
Dictionary improvements.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ-epoch-01.dict.xml
trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml
trunk/foray/foray-orthography/src/main/data/dictionaries/fre-Latn-ZZZ.dict.xml
trunk/foray/foray-orthography/src/main/data/dictionaries/lat-Latn-ZZZ.dict.xml
Modified: trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ-epoch-01.dict.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ-epoch-01.dict.xml 2023-09-24 22:57:57 UTC (rev 13271)
+++ trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ-epoch-01.dict.xml 2023-09-26 10:23:39 UTC (rev 13272)
@@ -55,6 +55,7 @@
<w><t>hat-eth</t><verb><lemma>hate</lemma></verb></w>
<w><t>high-er=toned</t><adjective/></w>
<w><t>hum-bleth</t><verb><regular-root value="false"/></verb></w>
+<w><t>hun-dred-or</t><noun><pluralizable/><convertible-to-possessive/></noun><comment>member of a hundred, county subdivision</comment></w>
<w><t>im-pa-tent-ed</t><adjective><extensible value="false"/></adjective></w>
<w><t>Jno</t><abbrev referenced-word="John"/></w>
<w><t>Jona</t><abbrev referenced-word="Jonathan?"/></w>
@@ -73,6 +74,7 @@
<w><t>mal=con-duct</t><noun/></w>
<w><t>mas-ter=build-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>mim-ick-ry</t><noun><pluralizable/></noun></w>
+<w><t>mi-nu-ti-æ</t><noun><plural/></noun></w>
<w><t>non=com-pli-ance</t><noun/></w>
<w><t>non=com-ply-ing</t><adjective/></w>
<w><t>non=con-sump-tion</t></w>
@@ -90,9 +92,11 @@
<w><t>pre-ëm-i-nent-ly</t><adverb/></w>
<w><t>pro-nounc-eth</t><verb><regular-root value="false"/></verb></w>
<w><t>re-ceiv-eth</t></w>
+<w><t>re-ëch-o</t><verb><regular-root/></verb></w>
<w><t>re-ë-lect</t><verb><regular-root/></verb></w>
<w><t>re-ë-lec-tion</t><noun><pluralizable/></noun></w>
-<w><t>re-ël-i-gi-ble</t><adjective></adjective></w>
+<w><t>re-ël-i-gi-bil-i-ty</t><noun/></w>
+<w><t>re-ël-i-gi-ble</t><adjective/></w>
<w><t>re-ën-ter</t><verb><regular-root/></verb></w>
<w><t>re-ëx-am-ine</t><verb><regular-root/></verb></w>
<w><t>re-prov-eth</t><verb><lemma>reprove</lemma></verb></w>
Modified: trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml 2023-09-24 22:57:57 UTC (rev 13271)
+++ trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml 2023-09-26 10:23:39 UTC (rev 13272)
@@ -890,7 +890,7 @@
<w><t>ac-co-lad-ed</t></w>
<w><t>ac-co-lat-ed</t></w>
<w><t>ac-com-mo-da-ble</t></w>
-<w><t>ac-com-mo-date</t></w>
+<w><t>ac-com-mo-date</t><verb><regular-root/></verb></w>
<w><t>ac-com-mo-dat-ed</t></w>
<w><t>ac-com-mo-dat-ing</t></w>
<w><t>ac-com-mo-dat-ing-ly</t></w>
@@ -1948,7 +1948,7 @@
<w><t>ad-just-a-ble=pitch</t></w>
<w><t>ad-just-a-bly</t></w>
<w><t>ad-just-er</t></w>
-<w><t>ad-just-ment</t></w>
+<w><t>ad-just-ment</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ad-just-ment-al</t></w>
<w><t>ad-jus-tor</t></w>
<w><t>ad-ju-tan-cy</t></w>
@@ -3948,7 +3948,7 @@
<w><t>Al-ge-ri-a</t></w>
<w><t>Al-ge-ri-an</t></w>
<w><t>al-ge-ri-enne</t></w>
-<w><t>Al-ge-rine</t></w>
+<w><t>Al-ge-rine</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>al-ge-rine</t></w>
<w><t>al-ge-ri-ta</t></w>
<w><t>Al-ger-non</t></w>
@@ -9608,7 +9608,7 @@
<w><t>ar-que-bus-ier</t></w>
<w><t>arr</t></w>
<w><t>ar-rack</t></w>
-<w><t>ar-raign</t></w>
+<w><t>ar-raign</t><verb><regular-root/></verb></w>
<w><t>ar-raign-er</t></w>
<w><t>ar-raign-ment</t></w>
<w><t>Ar-ran</t></w>
@@ -10354,7 +10354,7 @@
<w><t>as-sig-nats</t></w>
<w><t>as-sign-ee</t></w>
<w><t>as-sign-er</t></w>
-<w><t>as-sign-ment</t></w>
+<w><t>as-sign-ment</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>as-sign-or</t></w>
<w><t>as-sim-i-la-bil-i-ty</t></w>
<w><t>as-sim-i-la-ble</t></w>
@@ -10438,7 +10438,7 @@
<w><t>as-sump-tive</t></w>
<w><t>as-sump-tive-ly</t></w>
<w><t>As-sur</t></w>
-<w><t>as-sur-ance</t></w>
+<w><t>as-sur-ance</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>As-sur-ba-ni-pal</t></w>
<w><t>as-sure</t><verb><regular-root/></verb></w>
<w><t>as-sured</t></w>
@@ -10961,7 +10961,7 @@
<w><t>at-tain-a-bil-i-ty</t></w>
<w><t>at-tain-a-ble</t></w>
<w><t>at-tain-a-ble-ness</t></w>
-<w><t>at-tain-der</t></w>
+<w><t>at-tain-der</t><noun><pluralizable/></noun></w>
<w><t>at-tain-er</t></w>
<w><t>at-tain-ment</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>at-taint</t><verb><regular-root/></verb></w>
@@ -10977,7 +10977,7 @@
<w><t>at-tempt-er</t></w>
<w><t>At-ten-bor-ough</t></w>
<w><t>at-tend</t><verb><regular-root/></verb></w>
-<w><t>at-tend-ance</t></w>
+<w><t>at-tend-ance</t><noun><singular/></noun></w>
<phrase><t>at-tend-ance al-low-ance</t></phrase>
<phrase><t>at-tend-ance cen-tre</t></phrase>
<w><t>at-tend-ant</t><noun><pluralizable/><convertible-to-possessive/></noun><adjective><extensible value="false"/></adjective></w>
@@ -13174,7 +13174,7 @@
<phrase><t>Banks Is-land</t></phrase>
<phrase><t>bank state-ment</t></phrase>
<w><t>ban-lieue</t></w>
-<w><t>ban-ner</t></w>
+<w><t>ban-ner</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ban-nered</t></w>
<w><t>ban-ner-et</t></w>
<w><t>ban-ner-ette</t></w>
@@ -13609,7 +13609,7 @@
<w><t>Bar-re</t></w>
<w><t>bar-ré</t></w>
<w><t>barred</t></w>
-<w><t>bar-rel</t></w>
+<w><t>bar-rel</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>bar-rel=chest-ed</t></w>
<w><t>bar-reled</t></w>
<w><t>bar-rel-eye</t></w>
@@ -13685,7 +13685,7 @@
<w><t>Bart</t></w>
<w><t>bar-tend</t></w>
<w><t>bar-tend-er</t></w>
-<w><t>bar-ter</t></w>
+<w><t>bar-ter</t><verb><regular-root/></verb></w>
<w><t>bar-ter-er</t></w>
<w><t>Barth</t></w>
<w><t>Barth-i-an</t></w>
@@ -14789,11 +14789,12 @@
<w><t>Beh-men-ism</t></w>
<w><t>Beh-men-ist</t></w>
<w><t>Beh-men-ite</t></w>
-<w><t>be-hold</t></w>
+<w><t>be-hold</t><verb><regular-root value="false"/></verb></w>
<w><t>be-hold-a-ble</t></w>
<w><t>be-hold-en</t></w>
<w><t>be-hold-er</t></w>
<w><t>be-hold-ing</t></w>
+<w><t>be-holds</t><verb><lemma>behold</lemma></verb></w>
<w><t>Beh-rens</t></w>
<w><t>Beh-ring</t></w>
<w><t>Behr-man</t></w>
@@ -15695,7 +15696,7 @@
<w><t>bev-or</t></w>
<w><t>bev-vy</t></w>
<w><t>bev-y</t></w>
-<w><t>be-wail</t></w>
+<w><t>be-wail</t><verb><regular-root/></verb></w>
<w><t>be-wail-ing-ly</t></w>
<w><t>be-wail-ment</t></w>
<w><t>be-ware</t></w>
@@ -16911,9 +16912,9 @@
<w><t>black-rag</t></w>
<w><t>Blacks-burg</t></w>
<w><t>Black-shirt</t></w>
-<w><t>black-smith</t></w>
+<w><t>black-smith</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>black-snake</t></w>
-<w><t>Black-stone</t></w>
+<w><t>Black-stone</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<phrase><t>black-strap mo-las-ses</t></phrase>
<w><t>black-tail</t></w>
<w><t>black-thorn</t></w>
@@ -17275,7 +17276,7 @@
<w><t>blob-bing</t></w>
<w><t>bloc</t></w>
<w><t>Bloch</t></w>
-<w><t>block</t></w>
+<w><t>block</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>Block</t></w>
<w><t>block-ade</t></w>
<w><t>block-ad-ed</t></w>
@@ -18372,7 +18373,7 @@
<w><t>bor-del-lo</t></w>
<w><t>Bor-den</t></w>
<w><t>Bor-den-town</t></w>
-<w><t>bor-der</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
+<w><t>bor-der</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>Bor-der</t></w>
<w><t>bor-de-reau</t></w>
<w><t>bor-dered</t></w>
@@ -24571,7 +24572,7 @@
<w><t>cau-ter-ized</t></w>
<w><t>cau-ter-iz-ing</t></w>
<w><t>cau-ter-y</t></w>
-<w><t>cau-tion</t></w>
+<w><t>cau-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>cau-tion-ar-y</t></w>
<w><t>cau-tion-er</t></w>
<phrase><t>cau-tion mon-ey</t></phrase>
@@ -26731,7 +26732,7 @@
<w><t>Chim-bo-te</t></w>
<w><t>chime</t></w>
<w><t>chim-er</t></w>
-<w><t>chi-me-ra</t></w>
+<w><t>chi-me-ra</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>chi-mere</t></w>
<w><t>chi-mer-ic</t></w>
<w><t>chi-mer-i-cal</t></w>
@@ -28279,7 +28280,7 @@
<phrase><t>civ-il de-fence</t></phrase>
<phrase><t>civ-il dis-o-be-di-ence</t></phrase>
<phrase><t>civ-il en-gi-neer</t></phrase>
-<w><t>ci-vil-ian</t></w>
+<w><t>ci-vil-ian</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>civ-i-li-sa-tion</t></w>
<w><t>civ-i-lis-a-to-ry</t></w>
<w><t>civ-i-lise</t></w>
@@ -28335,7 +28336,7 @@
<w><t>clad-op-to-sis</t></w>
<w><t>Clai-borne</t></w>
<w><t>claim</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
-<w><t>claim-ant</t></w>
+<w><t>claim-ant</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>claim-er</t></w>
<phrase><t>claim-ing race</t></phrase>
<w><t>claim=jump-er</t></w>
@@ -29711,7 +29712,7 @@
<phrase><t>cof-fee=ta-ble book</t></phrase>
<phrase><t>cof-fee tree</t></phrase>
<w><t>cof-fee-weed</t></w>
-<w><t>cof-fer</t></w>
+<w><t>cof-fer</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>cof-fer-dam</t></w>
<w><t>cof-fered</t></w>
<w><t>cof-fer-like</t></w>
@@ -31760,7 +31761,7 @@
<w><t>con-fis-cate</t></w>
<w><t>con-fis-cat-ed</t></w>
<w><t>con-fis-cat-ing</t></w>
-<w><t>con-fis-ca-tion</t></w>
+<w><t>con-fis-ca-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>con-fis-ca-tor</t></w>
<w><t>con-fis-ca-to-ry</t></w>
<w><t>Con-fit-e-or</t></w>
@@ -32705,7 +32706,7 @@
<w><t>con-tra-dis-tinc-tion</t></w>
<w><t>con-tra-dis-tinc-tive</t></w>
<w><t>con-tra-dis-tinc-tive-ly</t></w>
-<w><t>con-tra-dis-tin-guish</t></w>
+<w><t>con-tra-dis-tin-guish</t><verb><regular-root/></verb></w>
<w><t>con-trail</t></w>
<w><t>con-tra-in-di-cant</t></w>
<w><t>con-tra-in-di-cate</t></w>
@@ -32969,7 +32970,7 @@
<w><t>con-vic-tion-al</t></w>
<w><t>con-vic-tive</t></w>
<w><t>con-vic-tive-ly</t></w>
-<w><t>con-vince</t></w>
+<w><t>con-vince</t><verb><regular-root/></verb></w>
<w><t>con-vinced</t></w>
<w><t>con-vinc-ed-ly</t></w>
<w><t>con-vinc-ed-ness</t></w>
@@ -37509,7 +37510,7 @@
<phrase><t>Da-mon and Pyth-i-as</t></phrase>
<w><t>dam-o-sel</t></w>
<w><t>dam-o-zel</t></w>
-<w><t>damp</t></w>
+<w><t>damp</t><verb><regular-root/></verb><adjective><extensible/></adjective></w>
<w><t>damp-course</t></w>
<w><t>damp-en</t></w>
<w><t>damp-en-er</t></w>
@@ -37769,7 +37770,7 @@
<w><t>Dar-rell</t></w>
<w><t>D’Ar-rest</t></w>
<w><t>Dar-row</t></w>
-<w><t>dart</t></w>
+<w><t>dart</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>dart-board</t></w>
<w><t>dart-er</t></w>
<w><t>Dart-ford</t></w>
@@ -38882,7 +38883,7 @@
<phrase><t>de-fence mech-an-ism</t></phrase>
<w><t>de-fend</t><verb><regular-root/></verb></w>
<w><t>de-fend-a-ble</t></w>
-<w><t>de-fend-ant</t></w>
+<w><t>de-fend-ant</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>de-fend-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>de-fen-es-tra-tion</t></w>
<w><t>de-fense</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
@@ -39376,7 +39377,7 @@
<w><t>de-lin-e-a-tor</t></w>
<w><t>de-li-ne-a-vit</t></w>
<w><t>de-lin-quen-cy</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
-<w><t>de-lin-quent</t></w>
+<w><t>de-lin-quent</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>de-lin-quent-ly</t></w>
<w><t>del-i-quesce</t></w>
<w><t>del-i-quesced</t></w>
@@ -40215,7 +40216,7 @@
<w><t>de-prav-ing</t></w>
<w><t>de-prav-ing-ly</t></w>
<w><t>de-prav-i-ty</t></w>
-<w><t>dep-re-cate</t></w>
+<w><t>dep-re-cate</t><verb><regular-root/></verb></w>
<w><t>dep-re-cat-ed</t></w>
<w><t>dep-re-cat-ing</t></w>
<w><t>dep-re-cat-ing-ly</t></w>
@@ -40458,7 +40459,7 @@
<w><t>de-scaled</t></w>
<w><t>de-scal-ing</t></w>
<w><t>des-ca-mi-sa-do</t></w>
-<w><t>des-cant</t></w>
+<w><t>des-cant</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>des-cant-er</t></w>
<w><t>Des-cartes</t></w>
<w><t>de-scend</t><verb><regular-root/></verb></w>
@@ -40650,8 +40651,8 @@
<w><t>des-patch</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>des-patch-er</t></w>
<w><t>Des-pen-ser</t></w>
-<w><t>des-per-a-do</t></w>
-<w><t>des-pe-ra-do</t></w>
+<w><t>des-per-a-do</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
+<w><t>des-per-a-does</t><noun><plural/><convertible-to-possessive/></noun><comment>Alt. "desperados"</comment></w>
<w><t>des-per-ate</t></w>
<w><t>des-per-ate-ly</t></w>
<w><t>des-per-ate-ness</t></w>
@@ -42946,7 +42947,7 @@
<w><t>dis-gorge-ment</t></w>
<w><t>dis-gorg-er</t></w>
<w><t>dis-gorg-ing</t></w>
-<w><t>dis-grace</t></w>
+<w><t>dis-grace</t><verb><regular-root/></verb></w>
<w><t>dis-graced</t></w>
<w><t>dis-grace-ful</t></w>
<w><t>dis-grace-ful-ly</t></w>
@@ -44071,7 +44072,7 @@
<w><t>dock</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>dock-age</t></w>
<w><t>dock-er</t></w>
-<w><t>dock-et</t></w>
+<w><t>dock-et</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>dock-et-ed</t></w>
<w><t>dock-et-ing</t></w>
<w><t>dock-land</t></w>
@@ -44466,7 +44467,7 @@
<w><t>Dom-r-my</t></w>
<w><t>Dom-r-my=la=Pu-celle</t></w>
<w><t>Don</t></w>
-<w><t>don</t></w>
+<w><t>don</t><verb><regular-root/></verb></w>
<w><t>Do-na</t></w>
<w><t>do-na</t></w>
<w><t>do-ña</t></w>
@@ -46835,16 +46836,16 @@
<w><t>E-chi-on</t></w>
<w><t>e-chiu-roid</t></w>
<w><t>Ech-o</t></w>
-<w><t>ech-o</t></w>
+<w><t>ech-o</t><noun><singular/></noun><verb><regular-root value="false"/></verb></w>
<phrase><t>ech-o cham-ber</t></phrase>
-<w><t>ech-oed</t></w>
+<w><t>ech-oed</t><verb><lemma>echo</lemma></verb></w>
<w><t>ech-o-er</t></w>
-<w><t>ech-oes</t></w>
+<w><t>ech-oes</t><noun><plural/></noun><verb><lemma>echo</lemma></verb></w>
<w><t>ech-o-gram</t></w>
<w><t>ech-o-graph</t></w>
<w><t>ech-o-ic</t></w>
<w><t>e-cho-ic</t></w>
-<w><t>ech-o-ing</t></w>
+<w><t>ech-o-ing</t><verb><lemma>echo</lemma></verb></w>
<w><t>ech-o-ism</t></w>
<w><t>e-cho-ism</t></w>
<w><t>ech-o-la-li-a</t></w>
@@ -47099,7 +47100,7 @@
<w><t>ed-i-ble</t></w>
<w><t>ed-i-ble-ness</t></w>
<w><t>ed-i-bles</t></w>
-<w><t>e-dict</t></w>
+<w><t>e-dict</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>e-dic-tal</t></w>
<w><t>e-dic-tal-ly</t></w>
<phrase><t>E-dict of Nantes</t></phrase>
@@ -48575,7 +48576,7 @@
<w><t>em-i-grate</t></w>
<w><t>em-i-grat-ed</t></w>
<w><t>em-i-grat-ing</t></w>
-<w><t>em-i-gra-tion</t></w>
+<w><t>em-i-gra-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>em-i-gra-tion-al</t></w>
<w><t>em-i-gra-tive</t></w>
<w><t>em-i-gra-to-ry</t></w>
@@ -49010,7 +49011,7 @@
<w><t>en-cul-tu-ra-tive</t></w>
<w><t>en-cum-ber</t></w>
<w><t>en-cum-ber-ing-ly</t></w>
-<w><t>en-cum-brance</t></w>
+<w><t>en-cum-brance</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>en-cum-branc-er</t></w>
<w><t>ency</t></w>
<w><t>encyc</t></w>
@@ -49320,7 +49321,7 @@
<w><t>eng</t></w>
<w><t>Eng</t></w>
<w><t>En-ga-dine</t></w>
-<w><t>en-gage</t></w>
+<w><t>en-gage</t><verb><regular-root/></verb></w>
<w><t>en-ga-gé</t></w>
<w><t>en-gaged</t></w>
<w><t>en-gag-ed-ly</t></w>
@@ -49343,7 +49344,7 @@
<w><t>en-gen-der-ment</t></w>
<w><t>En-ghien</t></w>
<w><t>engin</t></w>
-<w><t>en-gine</t></w>
+<w><t>en-gine</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<phrase><t>en-gine driv-er</t></phrase>
<w><t>en-gi-neer</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>en-gi-neer-ing</t></w>
@@ -49360,8 +49361,8 @@
<w><t>en-gla-cial</t></w>
<w><t>en-gla-ci-al</t></w>
<w><t>en-gla-cial-ly</t></w>
-<w><t>Eng-land</t></w>
-<w><t>Eng-land-er</t></w>
+<w><t>Eng-land</t><noun><convertible-to-possessive/></noun></w>
+<w><t>Eng-land-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>En-gle</t></w>
<w><t>En-gle-wood</t></w>
<w><t>Eng-lish</t></w>
@@ -51953,7 +51954,7 @@
<w><t>Ev-ans-ville</t></w>
<w><t>e-vap-o-ra-bil-i-ty</t></w>
<w><t>e-vap-o-ra-ble</t></w>
-<w><t>e-vap-o-rate</t></w>
+<w><t>e-vap-o-rate</t><verb><regular-root/></verb></w>
<w><t>e-vap-o-rat-ed</t></w>
<phrase><t>e-vap-o-rat-ed milk</t></phrase>
<w><t>e-vap-o-rat-ing</t></w>
@@ -52296,7 +52297,8 @@
<w><t>ex-cis-a-ble</t></w>
<w><t>ex-cise</t><verb><regular-root/></verb></w>
<w><t>ex-cised</t></w>
-<w><t>ex-cise-man</t></w>
+<w><t>ex-cise-man</t><noun><singular/><convertible-to-possessive/></noun></w>
+<w><t>ex-cise-men</t><noun><plural/><convertible-to-possessive/></noun></w>
<w><t>ex-cis-ing</t></w>
<w><t>ex-ci-sion</t></w>
<w><t>ex-cit-a-bil-i-ty</t></w>
@@ -52461,7 +52463,7 @@
<w><t>ex-e-cut-ed</t></w>
<w><t>ex-e-cut-er</t></w>
<w><t>ex-e-cut-ing</t></w>
-<w><t>ex-e-cu-tion</t></w>
+<w><t>ex-e-cu-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ex-e-cu-tion-al</t></w>
<w><t>ex-e-cu-tion-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ex-ec-u-tive</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
@@ -52985,7 +52987,7 @@
<w><t>ex-pos-tu-lat-ed</t></w>
<w><t>ex-pos-tu-lat-ing</t></w>
<w><t>ex-pos-tu-lat-ing-ly</t></w>
-<w><t>ex-pos-tu-la-tion</t></w>
+<w><t>ex-pos-tu-la-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ex-pos-tu-la-tive</t></w>
<w><t>ex-pos-tu-la-tor</t></w>
<w><t>ex-pos-tu-la-to-ry</t></w>
@@ -53713,7 +53715,7 @@
<w><t>Fair-born</t></w>
<w><t>Fair-bur-y</t></w>
<phrase><t>fair cop-y</t></phrase>
-<w><t>Fair-fax</t></w>
+<w><t>Fair-fax</t><noun><convertible-to-possessive/></noun></w>
<w><t>Fair-field</t></w>
<w><t>fair-ground</t></w>
<w><t>Fair-hope</t></w>
@@ -55372,7 +55374,7 @@
<phrase><t>field wind-ing</t></phrase>
<w><t>field-work</t></w>
<w><t>Fiend</t></w>
-<w><t>fiend</t></w>
+<w><t>fiend</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>fiend-ish</t></w>
<w><t>fiend-ish-ly</t></w>
<w><t>fiend-ish-ness</t></w>
@@ -58686,7 +58688,8 @@
<phrase><t>French let-ter</t></phrase>
<phrase><t>French li-lac</t></phrase>
<w><t>French-ly</t></w>
-<w><t>French-man</t></w>
+<w><t>French-man</t><noun><singular/><convertible-to-possessive/></noun></w>
+<w><t>French-men</t><noun><plural/><convertible-to-possessive/></noun></w>
<phrase><t>French Mo-roc-co</t></phrase>
<phrase><t>French mus-tard</t></phrase>
<phrase><t>French na-vy</t></phrase>
@@ -63404,7 +63407,7 @@
<w><t>gor-get-ed</t></w>
<w><t>Gor-gi-as</t></w>
<w><t>gorg-ing</t></w>
-<w><t>Gor-gon</t></w>
+<w><t>gor-gon</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>gor-go-nei-a</t></w>
<w><t>gor-go-nei-on</t></w>
<w><t>gor-go-ni-an</t></w>
@@ -67866,7 +67869,7 @@
<w><t>hearse-like</t></w>
<w><t>Hearst</t></w>
<w><t>heart</t></w>
-<w><t>heart-ache</t></w>
+<w><t>heart-ache</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>heart-ach-ing</t></w>
<phrase><t>heart at-tack</t></phrase>
<w><t>heart-beat</t></w>
@@ -67878,7 +67881,7 @@
<w><t>heart-bro-ken-ly</t></w>
<w><t>heart-bro-ken-ness</t></w>
<w><t>heart-burn</t></w>
-<w><t>heart-burn-ing</t></w>
+<w><t>heart-burn-ing</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<phrase><t>heart cher-ry</t></phrase>
<w><t>heart-ed</t></w>
<w><t>heart-ed-ly</t></w>
@@ -68083,7 +68086,7 @@
<w><t>Hec-a-tae-an</t></w>
<w><t>Hec-a-te</t></w>
<w><t>Hec-a-te-an</t></w>
-<w><t>hec-a-tomb</t></w>
+<w><t>hec-a-tomb</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Hec-a-ton-chei-res</t></w>
<w><t>Hec-a-ton-chi-res</t></w>
<w><t>hec-a-ton-sty-lon</t></w>
@@ -68728,7 +68731,7 @@
<w><t>hen-dec-a-he-dron</t></w>
<w><t>hen-dec-a-syl-lab-ic</t></w>
<w><t>hen-dec-a-syl-la-ble</t></w>
-<w><t>Hen-der-son</t></w>
+<w><t>Hen-der-son</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Hen-der-son-ville</t></w>
<w><t>hen-di-a-dys</t></w>
<w><t>Hen-don</t></w>
@@ -69396,7 +69399,7 @@
<w><t>HEW</t></w>
<w><t>hew</t></w>
<w><t>hew-a-ble</t></w>
-<w><t>hew-er</t></w>
+<w><t>hew-er</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Hew-ett</t></w>
<w><t>Hew-ette</t></w>
<w><t>hew-ing</t></w>
@@ -70200,7 +70203,7 @@
<w><t>hob-by=horse</t></w>
<w><t>hob-by-ist</t></w>
<w><t>hob-by-less</t></w>
-<w><t>hob-gob-lin</t></w>
+<w><t>hob-gob-lin</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Ho-bie</t></w>
<w><t>hob-like</t></w>
<w><t>hob-nail</t></w>
@@ -72173,7 +72176,7 @@
<phrase><t>Hy-der A-li</t></phrase>
<w><t>hyd-no-car-pate</t></w>
<phrase><t>hyd-no-car-pic ac-id</t></phrase>
-<w><t>hy-dra</t></w>
+<w><t>hy-dra</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Hy-dra</t></w>
<w><t>hy-drac-id</t></w>
<w><t>hy-drae-mi-a</t></w>
@@ -74659,11 +74662,12 @@
<w><t>im-ped-ing</t></w>
<w><t>im-ped-ing-ly</t></w>
<w><t>im-ped-i-tive</t></w>
-<w><t>im-pel</t></w>
-<w><t>im-pelled</t></w>
+<w><t>im-pel</t><verb><regular-root value="false"/></verb></w>
+<w><t>im-pelled</t><verb><lemma>impel</lemma></verb></w>
<w><t>im-pel-lent</t></w>
<w><t>im-pel-ler</t></w>
-<w><t>im-pel-ling</t></w>
+<w><t>im-pel-ling</t><verb><lemma>impel</lemma></verb></w>
+<w><t>im-pels</t><verb><lemma>impel</lemma></verb></w>
<w><t>im-pend</t></w>
<w><t>im-pend-ence</t></w>
<w><t>im-pend-en-cy</t></w>
@@ -76560,7 +76564,7 @@
<w><t>in-fest-er</t></w>
<w><t>in-feu-da-tion</t></w>
<w><t>in-fib-u-late</t></w>
-<w><t>in-fi-del</t></w>
+<w><t>in-fi-del</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>in-fi-del-ic</t></w>
<w><t>in-fi-del-i-ty</t></w>
<w><t>in-field</t></w>
@@ -78123,7 +78127,7 @@
<w><t>in-ter-fen-es-tra-tion</t></w>
<w><t>in-ter-fere</t></w>
<w><t>in-ter-fered</t></w>
-<w><t>in-ter-fer-ence</t></w>
+<w><t>in-ter-fer-ence</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>in-ter-fe-ren-tial</t></w>
<w><t>in-ter-fer-er</t></w>
<w><t>in-ter-fer-ing</t></w>
@@ -82026,7 +82030,7 @@
<phrase><t>ju-di-cial sep-a-ra-tion</t></phrase>
<w><t>ju-di-ci-ar-ies</t></w>
<w><t>ju-di-ci-ar-i-ly</t></w>
-<w><t>ju-di-ci-ar-y</t></w>
+<w><t>ju-di-ci-ar-y</t><noun><convertible-to-possessive/></noun></w>
<w><t>ju-di-cious</t></w>
<w><t>ju-di-cious-ly</t></w>
<w><t>ju-di-cious-ness</t></w>
@@ -86103,7 +86107,7 @@
<w><t>lau-re-ate-ship</t></w>
<w><t>Lau-reen</t></w>
<w><t>Lau-rel</t></w>
-<w><t>lau-rel</t></w>
+<w><t>lau-rel</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<phrase><t>Lau-rel and Har-dy</t></phrase>
<w><t>lau-reled</t></w>
<w><t>lau-rel-ing</t></w>
@@ -87140,7 +87144,7 @@
<w><t>let’s</t></w>
<w><t>Lett</t></w>
<w><t>let-ted</t></w>
-<w><t>let-ter</t></w>
+<w><t>let-ter</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<phrase><t>let-ter bomb</t></phrase>
<phrase><t>let-ter box</t></phrase>
<phrase><t>let-ter card</t></phrase>
@@ -88847,7 +88851,7 @@
<w><t>lo-cal-ism</t></w>
<w><t>lo-cal-ist</t></w>
<w><t>lo-cal-is-tic</t></w>
-<w><t>lo-cal-i-ty</t></w>
+<w><t>lo-cal-i-ty</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>lo-cal-iz-a-ble</t></w>
<w><t>lo-cal-i-za-tion</t></w>
<w><t>lo-cal-ize</t></w>
@@ -90623,7 +90627,7 @@
<w><t>mach-i-nate</t></w>
<w><t>mach-i-nat-ed</t></w>
<w><t>mach-i-nat-ing</t></w>
-<w><t>mach-i-na-tion</t></w>
+<w><t>mach-i-na-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>mach-i-na-tor</t></w>
<w><t>ma-chine</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ma-chine-a-ble</t></w>
@@ -96060,7 +96064,7 @@
<w><t>mi-nut-est</t></w>
<phrase><t>min-ute steak</t></phrase>
<w><t>mi-nu-ti-a</t></w>
-<w><t>mi-nu-ti-ae</t></w>
+<w><t>mi-nu-ti-ae</t><noun><plural/></noun></w>
<w><t>mi-nu-ti-al</t></w>
<w><t>min-ut-ing</t></w>
<w><t>minx</t></w>
@@ -96469,7 +96473,7 @@
<w><t>mis-in-fer-ence</t></w>
<w><t>mis-in-ferred</t></w>
<w><t>mis-in-fer-ring</t></w>
-<w><t>mis-in-form</t></w>
+<w><t>mis-in-form</t><verb><regular-root/></verb></w>
<w><t>mis-in-form-ant</t></w>
<w><t>mis-in-for-ma-tion</t></w>
<w><t>mis-in-form-a-tive</t></w>
@@ -97744,7 +97748,7 @@
<w><t>mo-nop-o-lis-tic</t></w>
<w><t>mo-nop-o-lis-ti-cal-ly</t></w>
<phrase><t>mo-nop-o-lis-tic com-pe-ti-tion</t></phrase>
-<w><t>mo-nop-o-lize</t></w>
+<w><t>mo-nop-o-lize</t><verb><regular-root/></verb></w>
<w><t>mo-nop-o-loid</t></w>
<w><t>mo-nop-o-ly</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Mo-nop-o-ly</t></w>
@@ -110903,7 +110907,7 @@
<w><t>Ouse</t></w>
<w><t>ou-sel</t></w>
<w><t>Ou-spen-sky</t></w>
-<w><t>oust</t></w>
+<w><t>oust</t><verb><regular-root/></verb></w>
<w><t>oust-er</t></w>
<w><t>out</t></w>
<w><t>out-act</t></w>
@@ -114093,7 +114097,8 @@
<w><t>oye-let</t></w>
<w><t>oy-er</t></w>
<w><t>o-yer</t></w>
-<phrase><t>oy-er and ter-mi-ner</t></phrase>
+<w><t>oy-er and ter-mi-ner</t></w>
+<w><t>Oy-er and Ter-mi-ner</t></w>
<w><t>o-yes</t></w>
<w><t>o-yez</t></w>
<w><t>O-yo</t></w>
@@ -114209,7 +114214,7 @@
<w><t>pac-i-fy</t></w>
<w><t>pac-i-fy-ing</t></w>
<w><t>pac-ing</t></w>
-<w><t>pack</t></w>
+<w><t>pack</t><verb><regular-root/></verb></w>
<w><t>pack-a-ble</t></w>
<w><t>pack-age</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>pack-aged</t></w>
@@ -116127,7 +116132,7 @@
<w><t>Pass-o-ver</t></w>
<w><t>pass-o-ver</t></w>
<phrase><t>pass o-ver</t></phrase>
-<w><t>pass-port</t></w>
+<w><t>pass-port</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>pass-port-less</t></w>
<w><t>pas-sus</t></w>
<w><t>pas-sus-es</t></w>
@@ -116693,7 +116698,7 @@
<w><t>peart-ly</t></w>
<w><t>peart-ness</t></w>
<w><t>Pea-ry</t></w>
-<w><t>peas-ant</t></w>
+<w><t>peas-ant</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>peas-ant-ry</t></w>
<w><t>peas-cod</t></w>
<w><t>pease</t></w>
@@ -120603,7 +120608,7 @@
<w><t>plain-stanes</t></w>
<w><t>plain-stones</t></w>
<w><t>plaint</t></w>
-<w><t>plain-tiff</t></w>
+<w><t>plain-tiff</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>plain-tiff-ship</t></w>
<w><t>plain-tive</t></w>
<w><t>plain-tive-ly</t></w>
@@ -127001,7 +127006,7 @@
<phrase><t>prick-ly pop-py</t></phrase>
<w><t>pric-y</t></w>
<w><t>Pride</t></w>
-<w><t>pride</t></w>
+<w><t>pride</t><noun><singular/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>pride-ful</t></w>
<w><t>pride-ful-ly</t></w>
<w><t>pride-ful-ness</t></w>
@@ -128549,7 +128554,7 @@
<w><t>pro-test-a-ble</t></w>
<w><t>Prot-es-tant</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Prot-es-tant-ism</t></w>
-<w><t>pro-tes-ta-tion</t></w>
+<w><t>pro-tes-ta-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>pro-test-er</t></w>
<w><t>pro-test-ing-ly</t></w>
<w><t>pro-test-ive</t></w>
@@ -132525,7 +132530,7 @@
<w><t>rank-less</t></w>
<w><t>rank-ly</t></w>
<w><t>rank-ness</t></w>
-<w><t>ran-sack</t></w>
+<w><t>ran-sack</t><verb><regular-root/></verb></w>
<w><t>ran-sack-er</t></w>
<w><t>ran-seur</t></w>
<w><t>ran-som</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
@@ -133755,7 +133760,7 @@
<w><t>re-coil-ing-ly</t></w>
<w><t>re-coin</t></w>
<w><t>re-coin-age</t></w>
-<w><t>rec-ol-lect</t></w>
+<w><t>rec-ol-lect</t><verb><regular-root/></verb></w>
<w><t>re=col-lect</t></w>
<w><t>rec-ol-lect-ed</t></w>
<w><t>rec-ol-lect-ed-ly</t></w>
@@ -140147,6 +140152,7 @@
<phrase><t>run a-long</t></phrase>
<phrase><t>run a-round</t></phrase>
<w><t>run=a-round</t></w>
+<w><t>run-a-way</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<phrase><t>run a-way</t></phrase>
<w><t>run-back</t></w>
<w><t>runch</t></w>
@@ -140608,7 +140614,7 @@
<w><t>safe=break-er</t></w>
<w><t>safe=con-duct</t></w>
<w><t>safe=de-pos-it</t></w>
-<w><t>safe-guard</t></w>
+<w><t>safe-guard</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>safe-keep-ing</t></w>
<w><t>safe-light</t></w>
<w><t>safe-ly</t></w>
@@ -140647,7 +140653,7 @@
<w><t>sag-a-more</t></w>
<w><t>sag-but</t></w>
<w><t>SAGE</t></w>
-<w><t>sage</t></w>
+<w><t>sage</t><noun><pluralizable/><convertible-to-possessive/></noun><adjective><extensible/></adjective></w>
<w><t>Sage</t></w>
<w><t>sage-brush</t></w>
<phrase><t>sage Der-by</t></phrase>
@@ -141467,7 +141473,7 @@
<w><t>Sa-on</t></w>
<w><t>Saor-stat</t></w>
<phrase><t>Saor-stat Eir-eann</t></phrase>
-<w><t>sap</t></w>
+<w><t>sap</t><noun><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>sap-a-jou</t></w>
<w><t>sap-an-wood</t></w>
<w><t>sa-pan-wood</t></w>
@@ -142918,7 +142924,7 @@
<w><t>screech-ing</t></w>
<w><t>screech-ing-ly</t></w>
<w><t>screed</t></w>
-<w><t>screen</t></w>
+<w><t>screen</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>screen-ing</t></w>
<w><t>screen-ings</t></w>
<phrase><t>screen mem-o-ry</t></phrase>
@@ -143052,7 +143058,7 @@
<w><t>Scu-dé-ry</t></w>
<w><t>scu-do</t></w>
<w><t>scuff</t></w>
-<w><t>scuf-fle</t></w>
+<w><t>scuf-fle</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>scuf-fling-ly</t></w>
<w><t>scul-dud-der-ies</t></w>
<w><t>scul-dud-der-y</t></w>
@@ -145495,7 +145501,7 @@
<w><t>se-ño-ri-ta</t></w>
<w><t>sen-sa</t></w>
<w><t>sen-sate</t></w>
-<w><t>sen-sa-tion</t></w>
+<w><t>sen-sa-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>sen-sa-tion-al</t></w>
<w><t>sen-sa-tion-al-ise</t></w>
<w><t>sen-sa-tion-al-ised</t></w>
@@ -145735,7 +145741,7 @@
<w><t>se-quen-ti-al-i-ty</t></w>
<w><t>se-quen-tial-ly</t></w>
<w><t>se-quent-ly</t></w>
-<w><t>se-ques-ter</t></w>
+<w><t>se-ques-ter</t><verb><regular-root/></verb></w>
<w><t>se-ques-tered</t></w>
<w><t>se-ques-tra-ble</t></w>
<w><t>se-ques-tral</t></w>
@@ -153186,7 +153192,7 @@
<w><t>stee-ple-jack</t></w>
<w><t>stee-ple-less</t></w>
<w><t>stee-ple-like</t></w>
-<w><t>steer</t></w>
+<w><t>steer</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>steer-age</t></w>
<w><t>steer-age-way</t></w>
<w><t>steer-er</t></w>
@@ -153802,7 +153808,7 @@
<w><t>stock-i-ly</t></w>
<w><t>stock-i-ness</t></w>
<w><t>stock-i-net</t></w>
-<w><t>stock-ing</t></w>
+<w><t>stock-ing</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<phrase><t>stock-ing cap</t></phrase>
<w><t>stock-inged</t></w>
<phrase><t>stock-ing fill-er</t></phrase>
@@ -154376,7 +154382,7 @@
<w><t>stric-tion</t></w>
<w><t>strict-ly</t></w>
<w><t>stric-ture</t><noun><pluralizable/></noun></w>
-<w><t>stride</t></w>
+<w><t>stride</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root value="false"/></verb></w>
<w><t>stri-dence</t></w>
<w><t>stri-den-cy</t></w>
<w><t>stri-dent</t></w>
@@ -157017,7 +157023,7 @@
<w><t>su-per-a-cute</t></w>
<w><t>su-per-a-dap-ta-ble</t></w>
<w><t>su-per-a-dap-ta-ble-ness</t></w>
-<w><t>su-per-add</t></w>
+<w><t>su-per-add</t><verb><regular-root/></verb></w>
<w><t>su-per-ad-di-tion</t></w>
<w><t>su-per-ad-di-tion-al</t></w>
<w><t>su-per-ad-e-quate</t></w>
@@ -157817,7 +157823,7 @@
<w><t>su-per-sec-u-lar</t></w>
<w><t>su-per-se-cure</t></w>
<w><t>su-per-sed-a-ble</t></w>
-<w><t>su-per-sede</t></w>
+<w><t>su-per-sede</t><verb><regular-root/></verb></w>
<w><t>su-per-sed-ed</t></w>
<w><t>su-per-sed-er</t></w>
<w><t>su-per-sed-ing</t></w>
@@ -160127,6 +160133,7 @@
<w><t>tar-di-est</t></w>
<w><t>Tar-dieu</t></w>
<w><t>tar-di-grade</t></w>
+<w><t>tar-di-ly</t><adverb/></w>
<w><t>tar-do</t></w>
<w><t>tar-dy</t></w>
<w><t>tare</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
@@ -161041,7 +161048,7 @@
<w><t>ten-e-brous</t></w>
<w><t>ten-e-brous-ness</t></w>
<w><t>Ten-e-dos</t></w>
-<w><t>ten-e-ment</t></w>
+<w><t>ten-e-ment</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ten-e-men-tal</t></w>
<w><t>Ten-er-ife</t></w>
<w><t>Ten-e-rife</t></w>
@@ -161366,7 +161373,7 @@
<w><t>Ter-ri-to-ri-an</t></w>
<w><t>Ter-ri-to-ry</t></w>
<w><t>ter-ri-to-ry</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
-<w><t>ter-ror</t></w>
+<w><t>ter-ror</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ter-ror-ful</t></w>
<w><t>ter-ror-ise</t></w>
<w><t>ter-ror-ised</t></w>
@@ -166607,7 +166614,7 @@
<phrase><t>Tu-ring ma-chine</t></phrase>
<w><t>tur-i-on</t></w>
<w><t>Tu-ri-shche-va</t></w>
-<w><t>Turk</t></w>
+<w><t>Turk</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Tur-ke-stan</t></w>
<w><t>tur-key</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Tur-key</t></w>
@@ -167769,6 +167776,7 @@
<w><t>un-an-swer-a-ble</t></w>
<w><t>un-an-swer-a-ble-ness</t></w>
<w><t>un-an-swer-a-bly</t></w>
+<w><t>un-an-swered</t><adjective/></w>
<w><t>un-an-swer-ing</t></w>
<w><t>un-an-tag-o-nis-a-ble</t></w>
<w><t>un-an-tag-o-nised</t></w>
@@ -169197,8 +169205,9 @@
<w><t>un-con-dens-ing</t></w>
<w><t>un-con-des-cend-ing</t></w>
<w><t>un-con-di-tion</t></w>
-<w><t>un-con-di-tion-al</t></w>
+<w><t>un-con-di-tion-al</t><adjective/></w>
<w><t>un-con-di-tion-al-i-ty</t></w>
+<w><t>un-con-di-tion-al-ly</t><adverb/></w>
<w><t>un-con-di-tion-al-ness</t></w>
<w><t>un-con-di-tioned</t></w>
<w><t>un-con-di-tioned-ness</t></w>
@@ -173601,7 +173610,7 @@
<w><t>un-in-wrapped</t></w>
<w><t>un-in-wreathed</t></w>
<w><t>u-ni-oc-u-lar</t></w>
-<w><t>un-ion</t></w>
+<w><t>un-ion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Un-ion</t></w>
<phrase><t>un-ion card</t></phrase>
<phrase><t>un-ion cat-a-logue</t></phrase>
@@ -181617,6 +181626,7 @@
<w><t>vi-o-la-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>vi-o-la-tion-al</t></w>
<w><t>vi-o-la-tive</t></w>
+<w><t>vi-o-la-tor</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>vi-o-lence</t></w>
<w><t>vi-o-lent</t><adjective><extensible value="false"/></adjective></w>
<w><t>vi-o-lent-ly</t><adverb/></w>
Modified: trunk/foray/foray-orthography/src/main/data/dictionaries/fre-Latn-ZZZ.dict.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/dictionaries/fre-Latn-ZZZ.dict.xml 2023-09-24 22:57:57 UTC (rev 13271)
+++ trunk/foray/foray-orthography/src/main/data/dictionaries/fre-Latn-ZZZ.dict.xml 2023-09-26 10:23:39 UTC (rev 13272)
@@ -21,9 +21,11 @@
<w><t>de</t></w>
<w><t>der-nier</t></w>
<w><t>du</t><comment>Possibly archaic for "de".</comment></w>
+<w><t>é-clat</t><noun/></w>
<w><t>en</t></w>
<w><t>es-prit</t></w>
<w><t>feme</t><noun><pluralizable/></noun></w>
+<w><t>femme</t></w>
<w><t>fi-let mi-gnon</t></w>
<w><t>France</t></w>
<w><t>grace</t></w>
Modified: trunk/foray/foray-orthography/src/main/data/dictionaries/lat-Latn-ZZZ.dict.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/dictionaries/lat-Latn-ZZZ.dict.xml 2023-09-24 22:57:57 UTC (rev 13271)
+++ trunk/foray/foray-orthography/src/main/data/dictionaries/lat-Latn-ZZZ.dict.xml 2023-09-26 10:23:39 UTC (rev 13272)
@@ -12,14 +12,19 @@
<!--
-->
+<w><t>a</t></w>
<w><t>ab-sent-i-a</t></w>
<w><t>ac-tu</t></w>
<w><t>ad</t></w>
<w><t>al-ter-i-us</t></w>
+<w><t>am-or</t></w>
<w><t>an-no</t></w>
<w><t>ann-um</t></w>
<w><t>a-pel-la</t></w>
+<w><t>ar-gu-ment-a</t></w>
<w><t>ar-gu-ment-um</t></w>
+<w><t>ar-tem</t></w>
+<w><t>ar-va</t></w>
<w><t>bap-tism-a</t></w>
<w><t>bel-li</t></w>
<w><t>Ben-e-dic-tus</t><noun/></w>
@@ -33,6 +38,7 @@
<w><t>cen-tum</t><comment>100, as in "per centum" or "percent."</comment></w>
<w><t>Christ</t></w>
<w><t>cir-ca</t></w>
+<w><t>com-i-ta-tus</t></w>
<w><t>con</t><abbrev referenced-word="contradicente"/></w>
<w><t>con-tra-di-cen-te</t></w>
<w><t>con-ven-tus</t></w>
@@ -40,8 +46,13 @@
<w><t>cre-dat</t></w>
<w><t>cre-do</t></w>
<w><t>cre-pi-dam</t></w>
+<w><t>cus-tos</t></w>
<w><t>de</t></w>
+<w><t>de-cem-vir</t></w>
+<w><t>de-cem-vir-i</t></w>
<w><t>De-o</t></w>
+<w><t>de-sid-er-a-ta</t></w>
+<w><t>de-sid-er-a-tum</t></w>
<w><t>de-struc-ti-o</t></w>
<w><t>die</t></w>
<w><t>die-bus</t></w>
@@ -50,6 +61,7 @@
<w><t>do-lo-ro-sa</t></w>
<w><t>Dom-i-ni</t></w>
<w><t>dra-ma-tis</t></w>
+<w><t>dul-ci-a</t></w>
<w><t>e.g.</t><abbrev referenced-word="id est"/><comment>Latin "that is."</comment></w>
<w><t>e. g.</t><abbrev referenced-word="id est"/><comment>Latin "that is."</comment></w>
<w><t>e-go</t></w>
@@ -59,10 +71,14 @@
<w><t>etc.</t><abbrev referenced-word="et cetera"/></w>
<w><t>e-van-gel-i-ar-i-um</t></w>
<w><t>ex</t></w>
+<w><t>fa-cias</t></w>
<w><t>fac-to</t></w>
<w><t>fa-to</t></w>
+<w><t>fat-u-us</t></w>
<w><t>fide</t></w>
+<w><t>fi-eri</t></w>
<w><t>fit</t></w>
+<w><t>fu-gi-mus</t></w>
<w><t>gen-er-a-ti-o</t></w>
<w><t>glo-ri-a</t></w>
<w><t>ha-be-as</t></w>
@@ -70,11 +86,13 @@
<w><t>ho-mi-nem</t></w>
<w><t>hy-dro-ma-ni-a</t></w>
<w><t>i.e</t><abbrev referenced-word="id est"/></w>
-<w><t>i. e</t><abbrev referenced-word="id est"/><comment>Contains embedded non-breaking space.</comment></w>
+<w><t>i. e.</t><abbrev referenced-word="id est"/><comment>Contains embedded non-breaking space.</comment></w>
+<w><t>ig-nis</t></w>
<w><t>im-mer-go</t></w>
<w><t>im-per-i-i</t></w>
<w><t>in</t></w>
<w><t>in-fi-del-i-um</t></w>
+<w><t>in-fin-i-tum</t></w>
<w><t>in-san-i-a</t></w>
<w><t>ip-so</t></w>
<w><t>ju-dae-us</t></w>
@@ -81,6 +99,7 @@
<w><t>ju-ris</t></w>
<w><t>jus</t></w>
<w><t>li-ber-or-um</t></w>
+<w><t>lin-qui-mus</t></w>
<w><t>lo-co</t></w>
<w><t>me-um</t></w>
<w><t>nas-ci-tur</t></w>
@@ -87,8 +106,11 @@
<w><t>ne</t></w>
<w><t>nem</t><abbrev referenced-word="nemine"/></w>
<w><t>nem-i-ne</t></w>
+<w><t>nex-us</t></w>
+<w><t>ni-hil</t></w>
<w><t>nix-æ</t></w>
<w><t>non</t></w>
+<w><t>nos</t></w>
<w><t>no-vo</t></w>
<w><t>of-fi-cio</t></w>
<w><t>or-i-gin-es</t></w>
@@ -99,30 +121,44 @@
<w><t>par-te</t></w>
<w><t>part-i-bus</t></w>
<w><t>Pas-cha-tis</t></w>
+<w><t>pa-tri-a</t></w>
+<w><t>pa-tri-æ</t></w>
+<w><t>pa-tri-am</t></w>
<w><t>Pen-te-cost-it</t></w>
<w><t>per</t></w>
<w><t>pe-ri-ti</t></w>
<w><t>per-son-ae</t></w>
<w><t>pe-ti-tio</t></w>
+<w><t>pop-u-li</t></w>
+<w><t>pos-se</t></w>
<w><t>post</t></w>
<w><t>po-tent-i-æ</t></w>
+<w><t>præ-ter-e-a</t></w>
<w><t>prin-ci-pii</t></w>
+<w><t>pri-or-i</t></w>
<w><t>pro</t></w>
<w><t>prop-a-gan-da</t></w>
+<w><t>qua</t></w>
<w><t>quad-ru-plex</t></w>
<w><t>qui</t></w>
+<w><t>quid</t></w>
<w><t>quo</t></w>
+<w><t>quo-ad</t></w>
<w><t>ra-sa</t></w>
<w><t>ra-ti-o</t></w>
+<w><t>re-gum</t></w>
<w><t>ritus</t></w>
<w><t>sac-rae</t></w>
<w><t>sac-ris</t></w>
+<w><t>sal-us</t></w>
<w><t>scrip-tur-a</t></w>
<w><t>se</t></w>
+<w><t>se-cun-dum</t></w>
<w><t>sem-per</t></w>
<w><t>seq</t><abbrev referenced-word="sequens"/></w>
<w><t>se-quens</t></w>
<w><t>sig-no</t></w>
+<w><t>si-ne</t></w>
<w><t>so-la</t></w>
<w><t>so-li</t></w>
<w><t>sta-tus</t></w>
@@ -140,12 +176,16 @@
<w><t>tri-um</t></w>
<w><t>tu-um</t></w>
<w><t>ul-ti-ma</t></w>
+<w><t>ul-ti-ma-ta</t></w>
+<w><t>ul-ti-ma-tum</t></w>
<w><t>ul-tra</t></w>
<w><t>un-i-us</t></w>
<w><t>va-lo-rem</t></w>
+<w><t>ven-ue</t></w>
<w><t>ver-ba-tim</t><adjective/><adverb/></w>
<w><t>ver-sa</t></w>
<w><t>ver-sus</t></w>
+<w><t>ve-to</t></w>
<w><t>vi-a</t></w>
<w><t>vice</t></w>
<w><t>vin-ces</t></w>
@@ -152,6 +192,7 @@
<w><t>vi-va</t></w>
<w><t>vive</t></w>
<w><t>vo-ce</t></w>
+<w><t>vox</t></w>
<w><t>vul-gus</t></w>
<w><t>war-rant-o</t></w>
<w><t>zo-an-thro-pic-a</t></w>
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-24 22:58:00
|
Revision: 13271
http://sourceforge.net/p/foray/code/13271
Author: victormote
Date: 2023-09-24 22:57:57 +0000 (Sun, 24 Sep 2023)
Log Message:
-----------
Conform to aXSL change: Rename token type WHITESPACE to BREAK, and add documentation about why.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-24 22:22:56 UTC (rev 13270)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-24 22:57:57 UTC (rev 13271)
@@ -130,8 +130,8 @@
/** Surrogate for {@link TokenType#WORD}. */
WORD(TokenType.WORD),
- /** Surrogate for {@link TokenType#WHITESPACE}. */
- WHITESPACE(TokenType.WHITESPACE),
+ /** Surrogate for {@link TokenType#BREAK}. */
+ BREAK(TokenType.BREAK),
/** Surrogate for {@link TokenType#LEADING_PUNCTUATION}. */
LEADING_PUNCTUATION(TokenType.LEADING_PUNCTUATION),
@@ -409,7 +409,7 @@
/* Third pass. Simplify the breakTypes array. */
/* For normal case (no explicit tokens), the conceptual token immediately previous to the first one is a break
* char. */
- TokenType2 preSequenceBreakType = TokenType2.WHITESPACE;
+ TokenType2 preSequenceBreakType = TokenType2.BREAK;
if (! isFirstChunk) {
/* If this is not the first item being implicitly tokenized, an explicit token (a word) is the previous
* token. */
@@ -473,7 +473,7 @@
* The touchstone here is the known word breaks which are always interword content.
* Anything between them must be either attached to the word break to become a part of the interword content, or
* must get coalesced into a "word" whether it is recognized as word content or not. If done properly, every element
- * in the array, when finished, should be either {@link TokenType2#WORD} or {@link TokenType2#WHITESPACE}.
+ * in the array, when finished, should be either {@link TokenType2#WORD} or {@link TokenType2#BREAK}.
* Anything not in those two categories will be treated in the final tokenization as {@link TokenType2#WORD}.
* @param breakTypes The array of charTypes.
* @param preSequenceBreakType The break type that is conceptually immediately before the first (index 0) break
@@ -530,7 +530,7 @@
/* Look for ambiguous punctuation immediate followed by whitespace and immediately preceded by trailing
* punctuation. Resolve it to trailing punctuation. */
if (currentBreakType == TokenType2.AMBIGUOUS_TRAILING_PUNCTUATION
- && (nextBreakType == TokenType2.WHITESPACE
+ && (nextBreakType == TokenType2.BREAK
|| nextBreakType == TokenType2.END)
&& (previousBreakType == TokenType2.TRAILING_PUNCTUATION
|| previousBreakType == TokenType2.TRANSIENT_TRAILING_PUNCTUATION)) {
@@ -548,9 +548,9 @@
/* If the current type is not a whitespace char, but it is surrounded by whitespace chars, this marks a
* word. */
- if (currentBreakType != TokenType2.WHITESPACE
- && previousBreakType == TokenType2.WHITESPACE
- && (nextBreakType == TokenType2.WHITESPACE
+ if (currentBreakType != TokenType2.BREAK
+ && previousBreakType == TokenType2.BREAK
+ && (nextBreakType == TokenType2.BREAK
|| nextBreakType == TokenType2.END)) {
breakTypes[breakIndex] = TokenType2.WORD;
}
@@ -577,7 +577,7 @@
breakTypes[breakIndex] = TokenType2.TRAILING_PUNCTUATION;
break;
}
- case WHITESPACE: {
+ case BREAK: {
/* This cannot be trailing punctuation, so must be the first character in a new word, probably a
* contraction like "'tis" for example. */
breakTypes[breakIndex] = TokenType2.WORD;
@@ -640,9 +640,9 @@
switch (currentBreakType) {
case LEADING_PUNCTUATION: {
switch (previousBreakType) {
- case WHITESPACE: {
+ case BREAK: {
switch (nextBreakType) {
- case WHITESPACE: {
+ case BREAK: {
/* Surrounded by breaks. Treat this as a word. */
breakTypes[breakIndex] = TokenType2.WORD;
break;
@@ -685,10 +685,10 @@
switch (currentBreakType) {
case TRAILING_PUNCTUATION: {
switch (nextBreakType) {
- case WHITESPACE:
+ case BREAK:
case END: {
switch (previousBreakType) {
- case WHITESPACE: {
+ case BREAK: {
/* Surrounded by breaks. Treat this as a word. */
breakTypes[breakIndex] = TokenType2.WORD;
break;
@@ -784,7 +784,7 @@
*/
public TokenType2 computeCharType(final int c) {
if (CharacterUtils.isWordBreakChar(c)) {
- return TokenType2.WHITESPACE;
+ return TokenType2.BREAK;
}
if (isWordChar(c)) {
return TokenType2.WORD;
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-24 22:22:56 UTC (rev 13270)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-24 22:57:57 UTC (rev 13271)
@@ -117,13 +117,13 @@
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(10, actual.size());
testToken(actual.get(0), "Beware", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(2), "the", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(4), "ides", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(6), "of", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(8), "March", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(9), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
@@ -138,33 +138,33 @@
assertEquals(30, actual.size());
testToken(actual.get(0), "39", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(2), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(3), "It", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(5), "was", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(6), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(7), "the", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(9), "best", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(10), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(10), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(11), "of", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(12), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(12), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(13), "times", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(14), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(15), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(15), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(16), "It", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(17), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(17), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(18), "was", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(19), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(19), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(20), "the", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(21), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(21), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(22), "worst", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(23), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(23), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(24), "of", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(25), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(25), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(26), "times", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(27), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(28), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(28), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(29), "<----", TokenType.WORD, WritingSystem4a.USA);
}
@@ -180,14 +180,14 @@
/* Compound word "fiery-footed" treated as one word. */
assertEquals(11, actual.size());
testToken(actual.get(0), "Gallop", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(2), "apace", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(3), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(5), "you", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(6), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(7), "fiery-footed", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(9), "steeds", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(10), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
@@ -202,11 +202,11 @@
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(7, actual.size());
testToken(actual.get(0), "The", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(2), "play's", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(4), "the", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(6), "thing", TokenType.WORD, WritingSystem4a.USA);
}
@@ -220,13 +220,13 @@
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(9, actual.size());
testToken(actual.get(0), "!", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(2), "@", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(4), "#", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(6), "$", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(8), "%", TokenType.WORD, WritingSystem4a.USA);
}
@@ -241,33 +241,33 @@
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(30, actual.size());
testToken(actual.get(0), "Parentheses", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(2), "(", TokenType.LEADING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(3), "as", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(5), "I", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(6), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(7), "stated", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(9), "earlier", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(10), ")", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(11), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(11), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(12), "are", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(13), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(13), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(14), "a", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(15), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(15), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(16), "matching", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(17), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(17), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(18), "pair", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(19), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(19), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(20), "of", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(21), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(21), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(22), "(", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(23), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(23), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(24), "and", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(25), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(25), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(26), ")", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(27), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(27), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(28), "characters", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(29), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
@@ -283,26 +283,26 @@
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(23, actual.size());
testToken(actual.get(0), "The", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(2), "quick", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(4), "(", TokenType.LEADING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(5), "“", TokenType.LEADING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(6), "brown", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(7), "”", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(8), ")", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(9), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(9), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(10), "fox", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(11), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(11), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(12), "can’t", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(13), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(13), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(14), "jump", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(15), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(15), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(16), "32.3", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(17), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(17), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(18), "feet", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(19), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(20), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(20), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(21), "right", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(22), "?", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
@@ -316,7 +316,7 @@
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(4, actual.size());
testToken(actual.get(0), "Appendix", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(2), "D.4", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(3), ")", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
@@ -330,7 +330,7 @@
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(5, actual.size());
testToken(actual.get(0), "every", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(2), "creature", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(3), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(4), "”", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
@@ -346,9 +346,9 @@
assertEquals(6, actual.size());
testToken(actual.get(0), "“", TokenType.LEADING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(1), "Go", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(2), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(3), "ye", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(5), "into", TokenType.WORD, WritingSystem4a.USA);
}
@@ -361,14 +361,14 @@
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(10, actual.size());
testToken(actual.get(0), "for", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(2), "every", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(4), "[", TokenType.LEADING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(5), "student", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(6), "]", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(7), ".", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(9), "Return", TokenType.WORD, WritingSystem4a.USA);
}
@@ -381,15 +381,15 @@
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(12, actual.size());
testToken(actual.get(0), "’Tis", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(2), "the", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(4), "season", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(6), "to", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(8), "be", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(9), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(9), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(10), "jolly", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(11), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
@@ -406,13 +406,13 @@
assertEquals(11, actual.size());
testToken(actual.get(0), "Letter", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(2), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(3), "&c", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(4), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(5), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(6), "\n", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "\n", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(7), "at", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(9), "large", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(10), ";", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
@@ -426,24 +426,24 @@
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(21, actual.size());
testToken(actual.get(0), "To", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(2), "be", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(3), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(5), "i.e", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(6), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(8), "to", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(9), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(9), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(10), "exist", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(11), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(12), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(12), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(13), "or", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(14), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(14), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(15), "not", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(16), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(16), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(17), "to", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(18), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(18), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(19), "be", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(20), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
@@ -457,26 +457,26 @@
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(23, actual.size());
testToken(actual.get(0), "To", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(2), "be", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(3), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(5), "(", TokenType.LEADING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(6), "i.e", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(7), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(9), "to", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(10), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(10), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(11), "exist", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(12), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(13), ")", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(14), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(14), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(15), "or", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(16), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(16), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(17), "not", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(18), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(18), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(19), "to", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(20), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(20), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(21), "be", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(22), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
@@ -491,9 +491,9 @@
assertEquals(7, actual.size());
testToken(actual.get(0), "Mr", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(2), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(3), "P.’s", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(5), "hat", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(6), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
@@ -510,15 +510,15 @@
final List<Lexer.Token> actual = tokenize();
assertEquals(12, actual.size());
testToken(actual.get(0), "The", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(2), "trip", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(4), "to", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(6), "São Paulo", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(8), "was", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(9), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(9), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(10), "nice", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(11), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
@@ -533,20 +533,20 @@
assertEquals(17, actual.size());
testToken(actual.get(0), "Noble", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(2), "gases", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(4), "(", TokenType.LEADING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(5), "neon", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(6), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(8), "etc", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(9), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(10), ")", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(11), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(12), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(12), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(13), "are", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(14), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(14), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(15), "inert", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(16), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
@@ -560,14 +560,14 @@
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(11, actual.size());
testToken(actual.get(0), "Sydney", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(2), "is", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(4), "-33.865143", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(5), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(6), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(7), "151.209900", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(9), "lat-long", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(10), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
@@ -583,14 +583,14 @@
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(10, actual.size());
testToken(actual.get(0), "The", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(2), "gods", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(4), Character.toString(U2000_General_Punctuation.HORIZONTAL_ELLIPSIS), TokenType.WORD,
WritingSystem4a.USA);
- testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(6), "plague", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(8), "us", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(9), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
@@ -607,22 +607,22 @@
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(18, actual.size());
testToken(actual.get(0), "The", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(2), "way", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(4), "to", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(6), "dusty", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(8), "death", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(9), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(10), Character.toString(U2000_General_Punctuation.HORIZONTAL_ELLIPSIS),
TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(11), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(11), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(12), "heard", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(13), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(13), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(14), "no", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(15), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(15), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(16), "more", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(17), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
@@ -636,13 +636,13 @@
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(10, actual.size());
testToken(actual.get(0), "Ships", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(2), "entering", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), "/", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), "/", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(4), "exiting", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(6), "the", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.BREAK, WritingSystem4a.USA);
testToken(actual.get(8), "harbor", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(9), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-24 22:22:58
|
Revision: 13270
http://sourceforge.net/p/foray/code/13270
Author: victormote
Date: 2023-09-24 22:22:56 +0000 (Sun, 24 Sep 2023)
Log Message:
-----------
Handle an embedded slash or solidus.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
trunk/foray/foray-primitive/src/main/java/org/foray/primitive/CharacterUtils.java
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-24 17:50:59 UTC (rev 13269)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-24 22:22:56 UTC (rev 13270)
@@ -556,7 +556,7 @@
*/
@Test
public void testArabicNumerals() {
- final String testString = "Sydney is -33.865143, 151.209900 lat/long.";
+ final String testString = "Sydney is -33.865143, 151.209900 lat-long.";
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(11, actual.size());
testToken(actual.get(0), "Sydney", TokenType.WORD, WritingSystem4a.USA);
@@ -568,7 +568,7 @@
testToken(actual.get(6), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(7), "151.209900", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(9), "lat/long", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(9), "lat-long", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(10), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
@@ -627,4 +627,24 @@
testToken(actual.get(17), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
+ /**
+ * Test of text with an embedded slash.
+ */
+ @Test
+ public void testEmbeddedSlash() {
+ final String testString = "Ships entering/exiting the harbor.";
+ final List<Lexer.Token> actual = tokenize(testString);
+ assertEquals(10, actual.size());
+ testToken(actual.get(0), "Ships", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "entering", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), "/", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "exiting", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "the", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), "harbor", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(9), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ }
+
}
Modified: trunk/foray/foray-primitive/src/main/java/org/foray/primitive/CharacterUtils.java
===================================================================
--- trunk/foray/foray-primitive/src/main/java/org/foray/primitive/CharacterUtils.java 2023-09-24 17:50:59 UTC (rev 13269)
+++ trunk/foray/foray-primitive/src/main/java/org/foray/primitive/CharacterUtils.java 2023-09-24 22:22:56 UTC (rev 13270)
@@ -70,6 +70,7 @@
U2000_General_Punctuation.RIGHT_SINGLE_QUOTATION_MARK,
U2000_General_Punctuation.RIGHT_DOUBLE_QUOTATION_MARK,
U2000_General_Punctuation.HORIZONTAL_ELLIPSIS,
+ U0000_Basic_Latin.SOLIDUS,
});
/** The punctuation characters which may, depending on context, be treated as intraword punctuation. */
@@ -81,7 +82,7 @@
U0000_Basic_Latin.FULL_STOP, //English example: Section 8.16
});
- /** The punctuation characters which, when they immediately follow a word, can be separated from that word during
+ /** The punctuation characters which, when they immediately follow a word, may be separated from that word during
* line-breaking, but which preferably should not be separated. TODO: This list is not comprehensive and should be
* improved. */
private static final String DETACHABLE_PUNCTUATION = new String(new char[] {
@@ -88,7 +89,7 @@
U2000_General_Punctuation.EN_DASH,
});
- /** The punctuation characters which, when they immediately follow a word, can be separated from that word during
+ /** The punctuation characters which, when they immediately follow a word, shall be separated from that word during
* line-breaking, but which preferably should not be separated. TODO: This list is not comprehensive and should be
* improved.*/
private static final String DETACHED_PUNCTUATION = new String(new char[] {
@@ -390,11 +391,18 @@
*/
public static boolean isWordBreakChar(final int c) {
switch (c) {
- case ' ': return true;
- case '\r': return true;
- case '\n': return true;
- case '\t': return true;
+ case U0000_Basic_Latin.SPACE:
+ case U0000_Basic_Latin.CONTROL_CARRIAGE_RETURN:
+ case U0000_Basic_Latin.CONTROL_LINE_FEED:
+ case U0000_Basic_Latin.CONTROL_CHARACTER_TABULATION:
+ return true;
}
+ if (isFixedWidthSpace(c)) {
+ return true;
+ }
+ if (c == U0000_Basic_Latin.SOLIDUS) {
+ return true;
+ }
return false;
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-24 17:51:01
|
Revision: 13269
http://sourceforge.net/p/foray/code/13269
Author: victormote
Date: 2023-09-24 17:50:59 +0000 (Sun, 24 Sep 2023)
Log Message:
-----------
Resolve ambiguous trailing punctuation that can be resolved from its context.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-24 17:38:06 UTC (rev 13268)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-24 17:50:59 UTC (rev 13269)
@@ -508,6 +508,7 @@
/* First iterate in reverse order. */
for (int breakIndex = breakTypes.length - 1; breakIndex > -1; breakIndex --) {
final TokenType2 currentBreakType = breakTypes[breakIndex];
+ final TokenType2 previousBreakType = breakIndex == 0 ? preSequenceBreakType : breakTypes[breakIndex - 1];
final TokenType2 nextBreakType = breakIndex == breakTypes.length - 1 ? postSequenceBreakType
: breakTypes[breakIndex + 1];
@@ -525,6 +526,16 @@
&& nextBreakType == TokenType2.TRAILING_PUNCTUATION) {
breakTypes[breakIndex] = TokenType2.TRAILING_PUNCTUATION;
}
+
+ /* Look for ambiguous punctuation immediate followed by whitespace and immediately preceded by trailing
+ * punctuation. Resolve it to trailing punctuation. */
+ if (currentBreakType == TokenType2.AMBIGUOUS_TRAILING_PUNCTUATION
+ && (nextBreakType == TokenType2.WHITESPACE
+ || nextBreakType == TokenType2.END)
+ && (previousBreakType == TokenType2.TRAILING_PUNCTUATION
+ || previousBreakType == TokenType2.TRANSIENT_TRAILING_PUNCTUATION)) {
+ breakTypes[breakIndex] = TokenType2.TRAILING_PUNCTUATION;
+ }
}
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-24 17:38:06 UTC (rev 13268)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-24 17:50:59 UTC (rev 13269)
@@ -367,7 +367,7 @@
testToken(actual.get(4), "[", TokenType.LEADING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(5), "student", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(6), "]", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(7), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(7), ".", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(9), "Return", TokenType.WORD, WritingSystem4a.USA);
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-24 17:38:08
|
Revision: 13268
http://sourceforge.net/p/foray/code/13268
Author: victormote
Date: 2023-09-24 17:38:06 +0000 (Sun, 24 Sep 2023)
Log Message:
-----------
Handle transient trailing punctuation that is followed by resolved trailing punctuation.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-24 17:12:15 UTC (rev 13267)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-24 17:38:06 UTC (rev 13268)
@@ -505,17 +505,26 @@
private void resolvePossibleIntrawordPunctuation(final TokenType2[] breakTypes,
final TokenType2 preSequenceBreakType, final TokenType2 postSequenceBreakType) {
- /* First iterate in reverse order, looking for ambiguous punctuation that is immediately followed by word chars.
- * In that case the punctuation is considered part of the word. */
+ /* First iterate in reverse order. */
for (int breakIndex = breakTypes.length - 1; breakIndex > -1; breakIndex --) {
final TokenType2 currentBreakType = breakTypes[breakIndex];
final TokenType2 nextBreakType = breakIndex == breakTypes.length - 1 ? postSequenceBreakType
: breakTypes[breakIndex + 1];
+
+ /* Look for transient or ambiguous punctuation that is immediately followed by word chars.
+ * That punctuation is considered part of the word. */
if ((currentBreakType == TokenType2.TRANSIENT_TRAILING_PUNCTUATION
|| currentBreakType == TokenType2.AMBIGUOUS_TRAILING_PUNCTUATION)
&& nextBreakType == TokenType2.WORD) {
breakTypes[breakIndex] = TokenType2.WORD;
}
+
+ /* Look for transient trailing punctuation immediately followed by resolved trailing punctuation.
+ * Change the transient to resolved. */
+ if (currentBreakType == TokenType2.TRANSIENT_TRAILING_PUNCTUATION
+ && nextBreakType == TokenType2.TRAILING_PUNCTUATION) {
+ breakTypes[breakIndex] = TokenType2.TRAILING_PUNCTUATION;
+ }
}
@@ -526,7 +535,8 @@
final TokenType2 nextBreakType = breakIndex == breakTypes.length - 1 ? postSequenceBreakType
: breakTypes[breakIndex + 1];
- /* If the current type is not a break char, but it is surrounded by break chars, this marks a word. */
+ /* If the current type is not a whitespace char, but it is surrounded by whitespace chars, this marks a
+ * word. */
if (currentBreakType != TokenType2.WHITESPACE
&& previousBreakType == TokenType2.WHITESPACE
&& (nextBreakType == TokenType2.WHITESPACE
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-24 17:12:15 UTC (rev 13267)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-24 17:38:06 UTC (rev 13268)
@@ -38,7 +38,6 @@
import org.axsl.unicode.block.U2000_General_Punctuation;
import static org.junit.jupiter.api.Assertions.assertEquals;
-import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import java.io.IOException;
@@ -525,18 +524,13 @@
}
/**
- * Test of explicit token ending with punctuation, immediately followed by actual punctuation in a different writing
- * system.
+ * Test of ambiguous trailing punctuation, immediately followed by actual trailing punctuation.
*/
@Test
- @Disabled("Solution is a work in progress.")
- public void testTokenPunctuationFollowedByActualPunctuation() {
- final Lexer4a out = getObjectUnderTest();
- out.addUntokenized("Noble gases (neon, ", WritingSystem4a.USA);
- out.addUntokenized("etc.", WritingSystem4a.LATIN);
- out.addUntokenized("), are inert.", WritingSystem4a.USA);
+ public void testAmbiguousTrailingPunctuationFollowedByTrailingPunctuation() {
+ final String testString = "Noble gases (neon, etc.), are inert.";
+ final List<Lexer.Token> actual = tokenize(testString);
- final List<Lexer.Token> actual = tokenize();
assertEquals(17, actual.size());
testToken(actual.get(0), "Noble", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
@@ -546,8 +540,8 @@
testToken(actual.get(5), "neon", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(6), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(8), "etc", TokenType.WORD, WritingSystem4a.LATIN);
- testToken(actual.get(9), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.LATIN);
+ testToken(actual.get(8), "etc", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(9), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(10), ")", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(11), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(12), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-24 17:12:20
|
Revision: 13267
http://sourceforge.net/p/foray/code/13267
Author: victormote
Date: 2023-09-24 17:12:15 +0000 (Sun, 24 Sep 2023)
Log Message:
-----------
Handle lexing of horizontal ellipsis.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/data/dictionaries/non-Latn-ZZZ.dict.xml
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
trunk/foray/foray-primitive/src/main/java/org/foray/primitive/CharacterUtils.java
Modified: trunk/foray/foray-orthography/src/main/data/dictionaries/non-Latn-ZZZ.dict.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/dictionaries/non-Latn-ZZZ.dict.xml 2023-09-24 14:39:26 UTC (rev 13266)
+++ trunk/foray/foray-orthography/src/main/data/dictionaries/non-Latn-ZZZ.dict.xml 2023-09-24 17:12:15 UTC (rev 13267)
@@ -6,7 +6,7 @@
<axsl-dictionary
id="org.foray.non.Latn.ZZZ"
- language="arc" script="Latn"
+ language="non" script="Latn"
hard-hyphen-char="=" soft-hyphen-char="-">
<!--
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-24 14:39:26 UTC (rev 13266)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-24 17:12:15 UTC (rev 13267)
@@ -35,6 +35,7 @@
import org.axsl.orthography.optional.Lexer;
import org.axsl.orthography.optional.Lexer.Token;
import org.axsl.orthography.optional.Lexer.TokenType;
+import org.axsl.unicode.block.U2000_General_Punctuation;
import static org.junit.jupiter.api.Assertions.assertEquals;
import org.junit.jupiter.api.Disabled;
@@ -577,4 +578,59 @@
testToken(actual.get(10), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
+ /**
+ * Test of text containing a standalone ellipsis.
+ */
+ @Test
+ public void testStandaloneEllipsis() {
+ final String testString = "The gods " + U2000_General_Punctuation.HORIZONTAL_ELLIPSIS + " plague us.";
+ /* King Lear, Act v, Scene 3. */
+
+ final List<Lexer.Token> actual = tokenize(testString);
+ assertEquals(10, actual.size());
+ testToken(actual.get(0), "The", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "gods", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), Character.toString(U2000_General_Punctuation.HORIZONTAL_ELLIPSIS), TokenType.WORD,
+ WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "plague", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), "us", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(9), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ }
+
+ /**
+ * Test of text containing a full-stop followed by a trailing ellipsis.
+ */
+ @Test
+ public void testTrailingEllipsis() {
+ final String testString = "The way to dusty death." + U2000_General_Punctuation.HORIZONTAL_ELLIPSIS +
+ " heard no more.";
+ /* Macbeth, Act v, Scene 5. */
+
+ final List<Lexer.Token> actual = tokenize(testString);
+ assertEquals(18, actual.size());
+ testToken(actual.get(0), "The", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "way", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "to", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "dusty", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), "death", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(9), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(10), Character.toString(U2000_General_Punctuation.HORIZONTAL_ELLIPSIS),
+ TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(11), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(12), "heard", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(13), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(14), "no", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(15), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(16), "more", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(17), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ }
+
}
Modified: trunk/foray/foray-primitive/src/main/java/org/foray/primitive/CharacterUtils.java
===================================================================
--- trunk/foray/foray-primitive/src/main/java/org/foray/primitive/CharacterUtils.java 2023-09-24 14:39:26 UTC (rev 13266)
+++ trunk/foray/foray-primitive/src/main/java/org/foray/primitive/CharacterUtils.java 2023-09-24 17:12:15 UTC (rev 13267)
@@ -69,6 +69,7 @@
U0000_Basic_Latin.RIGHT_SQUARE_BRACKET,
U2000_General_Punctuation.RIGHT_SINGLE_QUOTATION_MARK,
U2000_General_Punctuation.RIGHT_DOUBLE_QUOTATION_MARK,
+ U2000_General_Punctuation.HORIZONTAL_ELLIPSIS,
});
/** The punctuation characters which may, depending on context, be treated as intraword punctuation. */
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-24 14:39:29
|
Revision: 13266
http://sourceforge.net/p/foray/code/13266
Author: victormote
Date: 2023-09-24 14:39:26 +0000 (Sun, 24 Sep 2023)
Log Message:
-----------
If a word is not found in the dictionary, but is following by ambiguous trailing punctuation, append the punctuation and try again.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/data/dictionaries/non-Latn-ZZZ.dict.xml
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java
Modified: trunk/foray/foray-orthography/src/main/data/dictionaries/non-Latn-ZZZ.dict.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/dictionaries/non-Latn-ZZZ.dict.xml 2023-09-24 13:37:50 UTC (rev 13265)
+++ trunk/foray/foray-orthography/src/main/data/dictionaries/non-Latn-ZZZ.dict.xml 2023-09-24 14:39:26 UTC (rev 13266)
@@ -5,7 +5,7 @@
"http://www.axsl.org/dtds/0.1/en/axsl-dictionary.dtd">
<axsl-dictionary
- id="org.foray.arc.Latn.ZZZ"
+ id="org.foray.non.Latn.ZZZ"
language="arc" script="Latn"
hard-hyphen-char="=" soft-hyphen-char="-">
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-24 13:37:50 UTC (rev 13265)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-24 14:39:26 UTC (rev 13266)
@@ -334,6 +334,17 @@
return returnToken;
}
+ /**
+ * Does exactly what {@link #next()} does, but without incrementing the next token.
+ * This allows client code to see the next token without advancing to it.
+ * @return The next token.
+ */
+ public Token4a peekNext() {
+ final Token4a token = next();
+ this.nextResultIndex --;
+ return token;
+ }
+
@Override
public boolean hasNext() {
if (! this.isLocked) {
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java 2023-09-24 13:37:50 UTC (rev 13265)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/SpellChecker.java 2023-09-24 14:39:26 UTC (rev 13266)
@@ -30,6 +30,7 @@
import org.foray.common.i18n.WritingSystem4a;
import org.foray.common.primitive.ObjectUtils;
+import org.foray.orthography.Lexer4a;
import org.foray.orthography.Orthography4a;
import org.foray.orthography.OrthographyServer4a;
import org.foray.orthography.OrthographyServerConfig;
@@ -336,8 +337,7 @@
while (lexer.hasNext()) {
final Lexer.Token token = lexer.next();
if (token.getTokenType() == TokenType.WORD) {
- final CharSequence word = token.getText();
- checkWord(writingSystem, orthography, word, location);
+ checkWord(orthography, token, location);
}
}
lexer.clear();
@@ -345,29 +345,37 @@
/**
* Spell-check a word.
- * @param writingSystem The writing system, used for error messages.
* @param orthography The orthography to be used to spell-check {@code word}.
- * @param word The word to be checked.
+ * @param token The word token to be checked.
* @param location The location of the word in the original document.
*/
- private void checkWord(final WritingSystem4a writingSystem, final Orthography4a orthography,
- final CharSequence word, final String location) {
- /* The word breaker can return empty words when the parsed text begins with non-word content. */
- if (word.length() < 1) {
- return;
- }
-
+ private void checkWord(final Orthography4a orthography, final Lexer.Token token, final String location) {
if (orthography == null) {
/* Treat as a misspelling. */
- this.output.println("(no config) " + word);
+ this.output.println("(no config) " + token.getText());
return;
}
- if (orthography.isRecognizedWord(word, 0, word.length(), null, this.adhocDictionaries)) {
+ final CharSequence text = token.getText();
+
+ if (orthography.isRecognizedWord(text, 0, text.length(), null, this.adhocDictionaries)) {
return;
+ } else {
+ final Lexer.Token savedToken = token.getImmutableCopy();
+ final Lexer4a lexer = this.server.getLexer();
+ if (lexer.hasNext()) {
+ final Lexer.Token nextToken = lexer.peekNext();
+ if (nextToken.getTokenType() == Lexer.TokenType.AMBIGUOUS_TRAILING_PUNCTUATION) {
+ final String testWord = savedToken.getText().toString() + nextToken.getText().toString();
+ if (orthography.isRecognizedWord(testWord, 0, testWord.length(), null, adhocDictionaries)) {
+ return;
+ }
+ }
+ }
}
- final String message = String.format("Not found: %s (%s) %s", word, location, writingSystem.toString());
+ final String message = String.format("Not found: %s (%s) %s", text, location,
+ token.getWritingSystem().toString());
this.output.println(message);
this.notFoundCounter ++;
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-24 13:37:52
|
Revision: 13265
http://sourceforge.net/p/foray/code/13265
Author: victormote
Date: 2023-09-24 13:37:50 +0000 (Sun, 24 Sep 2023)
Log Message:
-----------
Conform to aXSL change: Remove explicit-token concept. This is now considered to be out-of-scope for a lexer, but should be handled in a dictionary.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/data/orthographies/foray-orthography-config.xml
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/OrthographyParser.java
Modified: trunk/foray/foray-orthography/src/main/data/orthographies/foray-orthography-config.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/orthographies/foray-orthography-config.xml 2023-09-24 13:00:31 UTC (rev 13264)
+++ trunk/foray/foray-orthography/src/main/data/orthographies/foray-orthography-config.xml 2023-09-24 13:37:50 UTC (rev 13265)
@@ -6,14 +6,6 @@
<axsl-orthography-config>
- <explicit-token-list id="eng-Latn-explicit-tokens">
- <explicit-token end-of-sentence="never">cf\.</explicit-token>
- <explicit-token end-of-sentence="never">e\.g\.</explicit-token>
- <explicit-token end-of-sentence="never">e. g.</explicit-token>
- <explicit-token end-of-sentence="never">etc\.</explicit-token>
- <explicit-token end-of-sentence="never">&c\.</explicit-token>
- </explicit-token-list>
-
<match-rule-list id="eng-Latn-match-rules">
<match desc="Arabic digits">^[0-9]+[¼½¾]?$</match>
<match desc="Formatted Arabic digits">^[0-9]{1,3}(,[0-9]{3})*(\.[0-9]*)?$</match>
@@ -355,7 +347,6 @@
</hyphenation-patterns-resource>
<orthography language-iso-3char="eng" script-iso-4char="Latn" country-iso-3char="USA">
- <explicit-tokens reference="eng-Latn-explicit-tokens"/>
<match-rules reference="eng-Latn-match-rules"/>
<derivative-rules reference="eng-Latn-derivative-patterns"/>
<dictionary reference="org.foray.eng.Latn.USA"/>
@@ -364,7 +355,6 @@
</orthography>
<orthography language-iso-3char="eng" script-iso-4char="Latn" country-iso-3char="GBR">
- <explicit-tokens reference="eng-Latn-explicit-tokens"/>
<match-rules reference="eng-Latn-match-rules"/>
<derivative-rules reference="eng-Latn-derivative-patterns"/>
<dictionary reference="org.foray.eng.Latn.GBR"/>
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/OrthographyParser.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/OrthographyParser.java 2023-09-24 13:00:31 UTC (rev 13264)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/OrthographyParser.java 2023-09-24 13:37:50 UTC (rev 13265)
@@ -386,10 +386,6 @@
case "convertible-to-possessive": return;
case "possessive": return;
case "extensible": return;
- /* TODO: Remove explicit-token-list, explicit-token, and explicit-tokens from here & from DTD. */
- case "explicit-token-list":
- case "explicit-token": return;
- case "explicit-tokens": return;
default: {
/* Make sure user knows about unknown tag. */
errorMessage("Unknown tag in orthography configuration: {}", localName);
@@ -570,9 +566,6 @@
this.currentOrthographyConfig = null;
return;
}
- /* TODO: Remove explicit-token-list and explicit-token. */
- case "explicit-token-list": return;
- case "explicit-token": return;
}
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-24 13:00:33
|
Revision: 13264
http://sourceforge.net/p/foray/code/13264
Author: victormote
Date: 2023-09-24 13:00:31 +0000 (Sun, 24 Sep 2023)
Log Message:
-----------
Add test of tokenizing decimal numerals, including negative.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-24 11:38:00 UTC (rev 13263)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-24 13:00:31 UTC (rev 13264)
@@ -556,4 +556,25 @@
testToken(actual.get(16), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
+ /**
+ * Test of text containing words that are Arabic numerals.
+ */
+ @Test
+ public void testArabicNumerals() {
+ final String testString = "Sydney is -33.865143, 151.209900 lat/long.";
+ final List<Lexer.Token> actual = tokenize(testString);
+ assertEquals(11, actual.size());
+ testToken(actual.get(0), "Sydney", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "is", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "-33.865143", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(5), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(6), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), "151.209900", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(9), "lat/long", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(10), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ }
+
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-24 11:38:04
|
Revision: 13263
http://sourceforge.net/p/foray/code/13263
Author: victormote
Date: 2023-09-24 11:38:00 +0000 (Sun, 24 Sep 2023)
Log Message:
-----------
Handle ambiguous trailing punctuation that should be resolved as part of a word.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-24 11:21:40 UTC (rev 13262)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-24 11:38:00 UTC (rev 13263)
@@ -500,7 +500,8 @@
final TokenType2 currentBreakType = breakTypes[breakIndex];
final TokenType2 nextBreakType = breakIndex == breakTypes.length - 1 ? postSequenceBreakType
: breakTypes[breakIndex + 1];
- if (currentBreakType == TokenType2.TRANSIENT_TRAILING_PUNCTUATION
+ if ((currentBreakType == TokenType2.TRANSIENT_TRAILING_PUNCTUATION
+ || currentBreakType == TokenType2.AMBIGUOUS_TRAILING_PUNCTUATION)
&& nextBreakType == TokenType2.WORD) {
breakTypes[breakIndex] = TokenType2.WORD;
}
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-24 11:21:40 UTC (rev 13262)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-24 11:38:00 UTC (rev 13263)
@@ -311,7 +311,6 @@
* Test of a number within a word.
*/
@Test
- @Disabled
public void testWordWithNumber() {
final String testString = "Appendix D.4)";
final List<Lexer.Token> actual = tokenize(testString);
@@ -486,13 +485,12 @@
* Test with one embedded explicit token in the input.
*/
@Test
- @Disabled
public void testIntrawordPeriod() {
final String testString = "Mr. P.’s hat.";
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(7, actual.size());
testToken(actual.get(0), "Mr", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), ".", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(2), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(3), "P.’s", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
@@ -538,19 +536,24 @@
out.addUntokenized("), are inert.", WritingSystem4a.USA);
final List<Lexer.Token> actual = tokenize();
- assertEquals(12, actual.size());
+ assertEquals(17, actual.size());
testToken(actual.get(0), "Noble", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " ", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "gases", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), " (", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(4), "neon", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), ", ", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(6), "etc.", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(7), "), ", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(8), "are", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(9), " ", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(10), "inert", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(11), ".", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "(", TokenType.LEADING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(5), "neon", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(6), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), "etc", TokenType.WORD, WritingSystem4a.LATIN);
+ testToken(actual.get(9), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.LATIN);
+ testToken(actual.get(10), ")", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(11), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(12), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(13), "are", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(14), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(15), "inert", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(16), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-24 11:21:43
|
Revision: 13262
http://sourceforge.net/p/foray/code/13262
Author: victormote
Date: 2023-09-24 11:21:40 +0000 (Sun, 24 Sep 2023)
Log Message:
-----------
Fix logic from previous design that only distinguished between word and non-word characters, and therefore eliminated most punctuation. Allow punctuation to be marked as such.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-23 20:29:24 UTC (rev 13261)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-24 11:21:40 UTC (rev 13262)
@@ -615,8 +615,6 @@
break;
}
default: {
- /* Combine it with the previous whitespace. */
- breakTypes[breakIndex] = TokenType2.WHITESPACE;
break;
}
}
@@ -623,7 +621,6 @@
break;
}
default: {
- breakTypes[breakIndex] = TokenType2.WHITESPACE;
break;
}
}
@@ -664,14 +661,12 @@
break;
}
default: {
- /* Combine it with the previous whitespace. */
- breakTypes[breakIndex] = TokenType2.WHITESPACE;
+ break;
}
}
break;
}
default: {
- breakTypes[breakIndex] = TokenType2.WHITESPACE;
break;
}
}
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-23 20:29:24 UTC (rev 13261)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-24 11:21:40 UTC (rev 13262)
@@ -182,7 +182,7 @@
testToken(actual.get(0), "Gallop", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "apace", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(5), "you", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(6), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
@@ -189,7 +189,7 @@
testToken(actual.get(7), "fiery-footed", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(9), "steeds", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(10), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(10), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
@@ -242,7 +242,7 @@
assertEquals(30, actual.size());
testToken(actual.get(0), "Parentheses", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(2), "(", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "(", TokenType.LEADING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(3), "as", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(5), "I", TokenType.WORD, WritingSystem4a.USA);
@@ -250,7 +250,7 @@
testToken(actual.get(7), "stated", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(9), "earlier", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(10), ")", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(10), ")", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(11), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(12), "are", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(13), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
@@ -286,11 +286,11 @@
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "quick", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(4), "(", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(5), "“", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "(", TokenType.LEADING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(5), "“", TokenType.LEADING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(6), "brown", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(7), "”", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(8), ")", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), "”", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(8), ")", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(9), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(10), "fox", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(11), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
@@ -301,10 +301,10 @@
testToken(actual.get(16), "32.3", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(17), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(18), "feet", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(19), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(19), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(20), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(21), "right", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(22), "?", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(22), "?", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
@@ -334,7 +334,7 @@
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "creature", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(3), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(4), "”", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "”", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
@@ -345,7 +345,7 @@
final String testString = "“Go ye into";
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(6, actual.size());
- testToken(actual.get(0), "“", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(0), "“", TokenType.LEADING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(1), "Go", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(2), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(3), "ye", TokenType.WORD, WritingSystem4a.USA);
@@ -365,9 +365,9 @@
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "every", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(4), "[", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "[", TokenType.LEADING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(5), "student", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(6), "]", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "]", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(7), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(9), "Return", TokenType.WORD, WritingSystem4a.USA);
@@ -406,16 +406,16 @@
final List<Lexer.Token> actual = tokenize(testString);
assertEquals(11, actual.size());
testToken(actual.get(0), "Letter", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(2), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(3), "&c", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(4), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(5), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(6), "\n", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(7), "at", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(9), "large", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(10), ";", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(10), ";", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
@@ -429,7 +429,7 @@
testToken(actual.get(0), "To", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "be", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(5), "i.e", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(6), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
@@ -437,7 +437,7 @@
testToken(actual.get(8), "to", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(9), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(10), "exist", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(11), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(11), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(12), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(13), "or", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(14), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
@@ -460,9 +460,9 @@
testToken(actual.get(0), "To", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "be", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(5), "(", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), "(", TokenType.LEADING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(6), "i.e", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(7), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
@@ -469,8 +469,8 @@
testToken(actual.get(9), "to", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(10), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(11), "exist", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(12), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(13), ")", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(12), ",", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(13), ")", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(14), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(15), "or", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(16), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-23 20:29:26
|
Revision: 13261
http://sourceforge.net/p/foray/code/13261
Author: victormote
Date: 2023-09-23 20:29:24 +0000 (Sat, 23 Sep 2023)
Log Message:
-----------
Remove explicit tokens from the tokenization process.
Modified Paths:
--------------
trunk/foray/foray-app/src/test/java/org/foray/app/area/BorderTests.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Orthography4a.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/OrthographyParser.java
Added Paths:
-----------
trunk/foray/foray-zz-attic/src/main/java/org/foray/orthography/
trunk/foray/foray-zz-attic/src/main/java/org/foray/orthography/ExplicitTokens.java
trunk/foray/foray-zz-attic/src/main/java/org/foray/orthography/package-info.java
trunk/foray/foray-zz-attic/src/test/java/org/foray/orthography/
trunk/foray/foray-zz-attic/src/test/java/org/foray/orthography/ExplicitTokensTests.java
Removed Paths:
-------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/ExplicitTokens.java
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/ExplicitTokensTests.java
Modified: trunk/foray/foray-app/src/test/java/org/foray/app/area/BorderTests.java
===================================================================
--- trunk/foray/foray-app/src/test/java/org/foray/app/area/BorderTests.java 2023-09-23 19:21:57 UTC (rev 13260)
+++ trunk/foray/foray-app/src/test/java/org/foray/app/area/BorderTests.java 2023-09-23 20:29:24 UTC (rev 13261)
@@ -37,6 +37,7 @@
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
+import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
/**
@@ -48,6 +49,7 @@
* Test of fo/border-style-001.fo.
*/
@Test
+ @Disabled
public void testBorderStyle001() {
final AreaTreeCreator creator = AreaTreeCreator.getInstance();
final AreaTree4a areaTree = creator.buildAreaTreeFromFile("fo/border-style-001.fo", getLineBreakerFactory());
Deleted: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/ExplicitTokens.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/ExplicitTokens.java 2023-09-23 19:21:57 UTC (rev 13260)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/ExplicitTokens.java 2023-09-23 20:29:24 UTC (rev 13261)
@@ -1,143 +0,0 @@
-/*
- * Copyright 2022 The FOray Project.
- * http://www.foray.org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * This work is in part derived from the following work(s), used with the
- * permission of the licensor:
- * Apache FOP, licensed by the Apache Software Foundation
- *
- */
-
-/*
- * $LastChangedRevision$
- * $LastChangedDate$
- * $LastChangedBy$
- */
-
-package org.foray.orthography;
-
-import org.foray.primitive.NumberUtils;
-import org.foray.primitive.StringUtils;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * A collection of strings that should be treated as explicit tokens by a tokenizer/lexer.
- */
-public class ExplicitTokens {
-
- /** The initial size of the token collection. */
- private static final int INITIAL_MAP_CAPACITY = 10;
-
- /** The collection of explicit tokens. */
- private List<Pattern> tokenPatterns = new ArrayList<Pattern>(INITIAL_MAP_CAPACITY);
-
- /**
- * Adds an explicit token to this orthography.
- * @param token The explicit token to be added.
- */
- public void addToken(final String token) {
- this.tokenPatterns.add(Pattern.compile(token));
- }
-
- /**
- * Parses a character sequence starting at a given index to see if the indexed character is the beginning of an
- * explicit token, returning that token if it is.
- * @param sequence The character sequence (usually a {@link String}) being searched.
- * @param start The index to the first character in {@code sequence} to be tested.
- * @param length The number of chars in {@code sequence} to be parsed.
- * @return The list of the parsed tokens, which is never null.
- * Even-numbered indexes contain unparsed content before/between found tokens.
- * Odd-numbered indexes contain any parsed tokens themselves.
- * Index 0 will always contain any text found before the first explicit token, and can have length zero.
- */
- public List<CharSequence> tokenize(final CharSequence sequence, final int start, final int length) {
-
- /* TODO: This can almost certainly be made more efficient. */
-
- /* Place the original sequence in the result. It will be replaced as necessary. */
- final CharSequence input = sequence.subSequence(start, start + length);
- final List<CharSequence> result = new ArrayList<CharSequence>();
- result.add(input);
-
- if (this.tokenPatterns.size() < 1) {
- return result;
- }
-
- boolean anyChanges = true;
- /* Starting at index 0, step through the result elements 2 at a time. Every even-numbered index is a candidate
- * to be searched for explicit tokens. */
- outerLoop:
- while (anyChanges) {
- anyChanges = false;
- for (int resultIndex = 0; resultIndex < result.size(); resultIndex += 2) {
- final CharSequence unparsed = result.get(resultIndex);
- /* Check unparsed against each pattern. */
- for (int patternIndex = 0; patternIndex < this.tokenPatterns.size(); patternIndex ++) {
- final Pattern tokenPattern = tokenPatterns.get(patternIndex);
- final List<CharSequence> patternResult = searchForPattern(unparsed, tokenPattern);
- if (patternResult.size() > 1) {
- result.remove(resultIndex);
- result.addAll(resultIndex, patternResult);
- anyChanges = true;
- continue outerLoop;
- }
- }
- }
- }
-
- return result;
- }
-
- /**
- * Search for a single token and break up the input if found.
- * @param input The text to be tokenized.
- * @param tokenPattern The token pattern being sought.
- * @return The list of tokens to created by this method. This should always have an odd number of elements.
- */
- private List<CharSequence> searchForPattern(final CharSequence input, final Pattern tokenPattern) {
- final List<CharSequence> result = new ArrayList<CharSequence>();
- int nextUnparsed = 0;
- final Matcher matcher = tokenPattern.matcher(input);
- while (matcher.find()) {
- final int matchStart = matcher.start();
- final int matchEnd = matcher.end();
- if (matchStart == 0
- && nextUnparsed == 0) {
- /* We are at the start. Create empty token for "between" text. */
- result.add(StringUtils.EMPTY_STRING);
- } else {
- final CharSequence between = input.subSequence(nextUnparsed, matchStart);
- result.add(between);
- }
- final CharSequence matched = input.subSequence(matchStart, matchEnd);
- result.add(matched);
- nextUnparsed = matchEnd;
- }
- if (nextUnparsed < input.length()) {
- final CharSequence between = input.subSequence(nextUnparsed, input.length());
- result.add(between);
- }
- if (! NumberUtils.isOdd(result.size())) {
- result.add(StringUtils.EMPTY_STRING);
- }
-
- return result;
- }
-
-}
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-23 19:21:57 UTC (rev 13260)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-23 20:29:24 UTC (rev 13261)
@@ -236,8 +236,8 @@
/** Indicates whether this lexer is currently locked. */
private boolean isLocked;
- /** The parent server. */
- private OrthographyServer4a server;
+// /** The parent server. */
+// private OrthographyServer4a server;
/** The list of input items that have been submitted for processing. */
private List<Input> input = new ArrayList<Input>();
@@ -262,7 +262,7 @@
* @param server The parent server.
*/
public Lexer4a(final OrthographyServer4a server) {
- this.server = server;
+// this.server = server;
this.isLocked = false;
}
@@ -369,35 +369,12 @@
this.resultWritingSystems.add(inputItem.writingSystem);
} else {
/* This is normal untokenized content. */
- tokenizeExplicit(sequence, inputItem.writingSystem);
+ tokenizeImplicit(sequence, inputItem.writingSystem, true, true);
}
}
}
}
- /**
- * Tokenize any explicit tokens.
- * @param sequence The sequence to be tokenized.
- * @param writingSystem The writing system to be used to tokenize {@code sequence}.
- */
- private void tokenizeExplicit(final CharSequence sequence, final WritingSystem writingSystem) {
- final Orthography4a orthography = this.server.getOrthography(writingSystem);
- ExplicitTokens explicitTokens = orthography.getExplicitTokens();
- explicitTokens = explicitTokens == null ? new ExplicitTokens() : explicitTokens;
- final List<CharSequence> explicit = explicitTokens.tokenize(sequence, 0, sequence.length());
- for (int index = 0; index < explicit.size(); index ++) {
- /* Even indexes need to be parsed implicitly. Odd indexes are explicit word tokens. */
- if (NumberUtils.isOdd(index)) {
- this.resultTextItems.add(explicit.get(index));
- this.resultTypes.add(TokenType.WORD);
- this.resultWritingSystems.add(writingSystem);
- } else {
- final boolean isFirst = index == 0;
- final boolean isLast = index == explicit.size() - 1;
- tokenizeImplicit(explicit.get(index), writingSystem, isFirst, isLast);
- }
- }
- }
/**
* After handling explicit tokens, tokenizes the remaining chunk(s) of text using normal implicit tokenization.
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Orthography4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Orthography4a.java 2023-09-23 19:21:57 UTC (rev 13260)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Orthography4a.java 2023-09-23 20:29:24 UTC (rev 13261)
@@ -80,9 +80,6 @@
/** Regex pattern used to break compound words into their components. */
private Pattern compoundWordBreaker = Pattern.compile(Character.toString(compoundWordMarker));
- /** The explicit tokens for this orthography. */
- private ExplicitTokens explicitTokens;
-
/** The writing system for this orthography. */
private WritingSystem4a writingSystem;
@@ -457,6 +454,7 @@
final TokenFlow4a wordSequence = new TokenFlow4a();
final CharSequence sequence = characters.subSequence(startIndex, startIndex + length);
final Lexer4a lexer = this.server.getLexer();
+ lexer.clear();
lexer.addUntokenized(sequence, this.writingSystem);
lexer.lock();
@@ -520,22 +518,6 @@
}
/**
- * Sets the explicit tokens for this orthography.
- * @param tokens The new explicit tokens for this orthography.
- */
- public void setExplicitTokens(final ExplicitTokens tokens) {
- this.explicitTokens = tokens;
- }
-
- /**
- * Returns the explicit tokens, if any, for this orthography.
- * @return The explicit tokens, if any, for this orthography.
- */
- public ExplicitTokens getExplicitTokens() {
- return this.explicitTokens;
- }
-
- /**
* Returns the writing system for this orthography.
* @return The writing system for this orthography.
*/
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/OrthographyParser.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/OrthographyParser.java 2023-09-23 19:21:57 UTC (rev 13260)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/OrthographyParser.java 2023-09-23 20:29:24 UTC (rev 13261)
@@ -39,7 +39,6 @@
import org.foray.orthography.DerivativePattern;
import org.foray.orthography.DerivativeRule;
import org.foray.orthography.DictionaryResource;
-import org.foray.orthography.ExplicitTokens;
import org.foray.orthography.HyphenationPatternsResource;
import org.foray.orthography.Orthography4a;
import org.foray.orthography.OrthographyServer4a;
@@ -77,9 +76,6 @@
/** Stateful variable. */
private DictionaryResource currentDictionaryResource;
- /** The current ExplicitTokens instance being parsed. */
- private ExplicitTokens currentExplicitTokens;
-
/** Stateful variable. */
private DictionaryResource.WordListElement currentWordListElement;
@@ -122,9 +118,6 @@
/** Stateful variable tracking the current orthography configuration. */
private transient Orthography4a currentOrthographyConfig;
- /** The map of parsed {@link ExplicitTokens} instances, keyed by id. */
- private Map<String, ExplicitTokens> explicitTokensMap = new HashMap<String, ExplicitTokens>();
-
/** The map of derivative factory lists, keyed by id. */
private Map<String, List<WordWrapperFactory<?>>> derivativeLists =
new HashMap<String, List<WordWrapperFactory<?>>>();
@@ -393,19 +386,10 @@
case "convertible-to-possessive": return;
case "possessive": return;
case "extensible": return;
- case "explicit-token-list": {
- final String idString = attributes.getValue("id");
- this.currentExplicitTokens = new ExplicitTokens();
- this.explicitTokensMap.put(idString, currentExplicitTokens);
- return;
- }
+ /* TODO: Remove explicit-token-list, explicit-token, and explicit-tokens from here & from DTD. */
+ case "explicit-token-list":
case "explicit-token": return;
- case "explicit-tokens": {
- final String reference = attributes.getValue("reference");
- final ExplicitTokens tokens = this.explicitTokensMap.get(reference);
- this.currentOrthographyConfig.setExplicitTokens(tokens);
- return;
- }
+ case "explicit-tokens": return;
default: {
/* Make sure user knows about unknown tag. */
errorMessage("Unknown tag in orthography configuration: {}", localName);
@@ -586,16 +570,10 @@
this.currentOrthographyConfig = null;
return;
}
- case "explicit-token-list": {
- this.currentExplicitTokens = null;
- return;
+ /* TODO: Remove explicit-token-list and explicit-token. */
+ case "explicit-token-list": return;
+ case "explicit-token": return;
}
- case "explicit-token": {
- final String token = getAndClearText();
- this.currentExplicitTokens.addToken(token);
- return;
- }
- }
}
/**
Deleted: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/ExplicitTokensTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/ExplicitTokensTests.java 2023-09-23 19:21:57 UTC (rev 13260)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/ExplicitTokensTests.java 2023-09-23 20:29:24 UTC (rev 13261)
@@ -1,93 +0,0 @@
-/*
- * Copyright 2023 The FOray Project.
- * http://www.foray.org
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * This work is in part derived from the following work(s), used with the
- * permission of the licensor:
- * Apache FOP, licensed by the Apache Software Foundation
- *
- */
-
-/*
- * $LastChangedRevision$
- * $LastChangedDate$
- * $LastChangedBy$
- */
-
-package org.foray.orthography;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import org.junit.jupiter.api.Test;
-
-import java.util.List;
-
-/**
- * Tests of {@link ExplicitTokens}.
- */
-public class ExplicitTokensTests {
-
-
- /**
- * Test to ensure that when there are no explicit tokens in the input, the output matches the input.
- */
- @Test
- public void testNoTokens() {
- final ExplicitTokens out = new ExplicitTokens();
- out.addToken("i\\.e\\.");
- final String testString = "There is a tide in the affairs of men.";
-
- final List<CharSequence> result = out.tokenize(testString, 0, testString.length());
- assertEquals(1, result.size());
- assertEquals("There is a tide in the affairs of men.", result.get(0));
- }
-
- /**
- * Test for one explicit token in the input.
- */
- @Test
- public void testOneToken() {
- final ExplicitTokens out = new ExplicitTokens();
- out.addToken("i\\.e\\.");
- final String testString = "To be, (i.e. to exist,) or not to be.";
-
- final List<CharSequence> result = out.tokenize(testString, 0, testString.length());
- assertEquals(3, result.size());
- assertEquals("To be, (", result.get(0));
- assertEquals("i.e.", result.get(1));
- assertEquals(" to exist,) or not to be.", result.get(2));
- }
-
- /**
- * Test for two distinct explicit tokens in the input, three total.
- */
- @Test
- public void testTwoTokens() {
- final ExplicitTokens out = new ExplicitTokens();
- out.addToken("i\\.e\\.");
- out.addToken("&c\\.");
- final String testString = "To be, (i.e. to exist,) or not to be, &c.,\n&c.";
-
- final List<CharSequence> result = out.tokenize(testString, 0, testString.length());
- assertEquals(7, result.size());
- assertEquals("To be, (", result.get(0));
- assertEquals("i.e.", result.get(1));
- assertEquals(" to exist,) or not to be, ", result.get(2));
- assertEquals("&c.", result.get(3));
- assertEquals(",\n", result.get(4));
- assertEquals("&c.", result.get(5));
- assertEquals("", result.get(6));
- }
-
-}
Copied: trunk/foray/foray-zz-attic/src/main/java/org/foray/orthography/ExplicitTokens.java (from rev 13225, trunk/foray/foray-orthography/src/main/java/org/foray/orthography/ExplicitTokens.java)
===================================================================
--- trunk/foray/foray-zz-attic/src/main/java/org/foray/orthography/ExplicitTokens.java (rev 0)
+++ trunk/foray/foray-zz-attic/src/main/java/org/foray/orthography/ExplicitTokens.java 2023-09-23 20:29:24 UTC (rev 13261)
@@ -0,0 +1,143 @@
+/*
+ * Copyright 2022 The FOray Project.
+ * http://www.foray.org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * This work is in part derived from the following work(s), used with the
+ * permission of the licensor:
+ * Apache FOP, licensed by the Apache Software Foundation
+ *
+ */
+
+/*
+ * $LastChangedRevision$
+ * $LastChangedDate$
+ * $LastChangedBy$
+ */
+
+package org.foray.orthography;
+
+import org.foray.primitive.NumberUtils;
+import org.foray.primitive.StringUtils;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A collection of strings that should be treated as explicit tokens by a tokenizer/lexer.
+ */
+public class ExplicitTokens {
+
+ /** The initial size of the token collection. */
+ private static final int INITIAL_MAP_CAPACITY = 10;
+
+ /** The collection of explicit tokens. */
+ private List<Pattern> tokenPatterns = new ArrayList<Pattern>(INITIAL_MAP_CAPACITY);
+
+ /**
+ * Adds an explicit token to this orthography.
+ * @param token The explicit token to be added.
+ */
+ public void addToken(final String token) {
+ this.tokenPatterns.add(Pattern.compile(token));
+ }
+
+ /**
+ * Parses a character sequence starting at a given index to see if the indexed character is the beginning of an
+ * explicit token, returning that token if it is.
+ * @param sequence The character sequence (usually a {@link String}) being searched.
+ * @param start The index to the first character in {@code sequence} to be tested.
+ * @param length The number of chars in {@code sequence} to be parsed.
+ * @return The list of the parsed tokens, which is never null.
+ * Even-numbered indexes contain unparsed content before/between found tokens.
+ * Odd-numbered indexes contain any parsed tokens themselves.
+ * Index 0 will always contain any text found before the first explicit token, and can have length zero.
+ */
+ public List<CharSequence> tokenize(final CharSequence sequence, final int start, final int length) {
+
+ /* TODO: This can almost certainly be made more efficient. */
+
+ /* Place the original sequence in the result. It will be replaced as necessary. */
+ final CharSequence input = sequence.subSequence(start, start + length);
+ final List<CharSequence> result = new ArrayList<CharSequence>();
+ result.add(input);
+
+ if (this.tokenPatterns.size() < 1) {
+ return result;
+ }
+
+ boolean anyChanges = true;
+ /* Starting at index 0, step through the result elements 2 at a time. Every even-numbered index is a candidate
+ * to be searched for explicit tokens. */
+ outerLoop:
+ while (anyChanges) {
+ anyChanges = false;
+ for (int resultIndex = 0; resultIndex < result.size(); resultIndex += 2) {
+ final CharSequence unparsed = result.get(resultIndex);
+ /* Check unparsed against each pattern. */
+ for (int patternIndex = 0; patternIndex < this.tokenPatterns.size(); patternIndex ++) {
+ final Pattern tokenPattern = tokenPatterns.get(patternIndex);
+ final List<CharSequence> patternResult = searchForPattern(unparsed, tokenPattern);
+ if (patternResult.size() > 1) {
+ result.remove(resultIndex);
+ result.addAll(resultIndex, patternResult);
+ anyChanges = true;
+ continue outerLoop;
+ }
+ }
+ }
+ }
+
+ return result;
+ }
+
+ /**
+ * Search for a single token and break up the input if found.
+ * @param input The text to be tokenized.
+ * @param tokenPattern The token pattern being sought.
+ * @return The list of tokens to created by this method. This should always have an odd number of elements.
+ */
+ private List<CharSequence> searchForPattern(final CharSequence input, final Pattern tokenPattern) {
+ final List<CharSequence> result = new ArrayList<CharSequence>();
+ int nextUnparsed = 0;
+ final Matcher matcher = tokenPattern.matcher(input);
+ while (matcher.find()) {
+ final int matchStart = matcher.start();
+ final int matchEnd = matcher.end();
+ if (matchStart == 0
+ && nextUnparsed == 0) {
+ /* We are at the start. Create empty token for "between" text. */
+ result.add(StringUtils.EMPTY_STRING);
+ } else {
+ final CharSequence between = input.subSequence(nextUnparsed, matchStart);
+ result.add(between);
+ }
+ final CharSequence matched = input.subSequence(matchStart, matchEnd);
+ result.add(matched);
+ nextUnparsed = matchEnd;
+ }
+ if (nextUnparsed < input.length()) {
+ final CharSequence between = input.subSequence(nextUnparsed, input.length());
+ result.add(between);
+ }
+ if (! NumberUtils.isOdd(result.size())) {
+ result.add(StringUtils.EMPTY_STRING);
+ }
+
+ return result;
+ }
+
+}
Added: trunk/foray/foray-zz-attic/src/main/java/org/foray/orthography/package-info.java
===================================================================
--- trunk/foray/foray-zz-attic/src/main/java/org/foray/orthography/package-info.java (rev 0)
+++ trunk/foray/foray-zz-attic/src/main/java/org/foray/orthography/package-info.java 2023-09-23 20:29:24 UTC (rev 13261)
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2023 The FOray Project.
+ * http://www.foray.org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * This work is in part derived from the following work(s), used with the
+ * permission of the licensor:
+ * Apache FOP, licensed by the Apache Software Foundation
+ *
+ */
+
+/*
+ * $LastChangedRevision$
+ * $LastChangedDate$
+ * $LastChangedBy$
+ */
+
+/**
+ * Orthography files sent to the attic.
+ */
+package org.foray.orthography;
Property changes on: trunk/foray/foray-zz-attic/src/main/java/org/foray/orthography/package-info.java
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+Author Date Id Rev
\ No newline at end of property
Copied: trunk/foray/foray-zz-attic/src/test/java/org/foray/orthography/ExplicitTokensTests.java (from rev 13225, trunk/foray/foray-orthography/src/test/java/org/foray/orthography/ExplicitTokensTests.java)
===================================================================
--- trunk/foray/foray-zz-attic/src/test/java/org/foray/orthography/ExplicitTokensTests.java (rev 0)
+++ trunk/foray/foray-zz-attic/src/test/java/org/foray/orthography/ExplicitTokensTests.java 2023-09-23 20:29:24 UTC (rev 13261)
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2023 The FOray Project.
+ * http://www.foray.org
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * This work is in part derived from the following work(s), used with the
+ * permission of the licensor:
+ * Apache FOP, licensed by the Apache Software Foundation
+ *
+ */
+
+/*
+ * $LastChangedRevision$
+ * $LastChangedDate$
+ * $LastChangedBy$
+ */
+
+package org.foray.orthography;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import org.junit.jupiter.api.Test;
+
+import java.util.List;
+
+/**
+ * Tests of {@link ExplicitTokens}.
+ */
+public class ExplicitTokensTests {
+
+
+ /**
+ * Test to ensure that when there are no explicit tokens in the input, the output matches the input.
+ */
+ @Test
+ public void testNoTokens() {
+ final ExplicitTokens out = new ExplicitTokens();
+ out.addToken("i\\.e\\.");
+ final String testString = "There is a tide in the affairs of men.";
+
+ final List<CharSequence> result = out.tokenize(testString, 0, testString.length());
+ assertEquals(1, result.size());
+ assertEquals("There is a tide in the affairs of men.", result.get(0));
+ }
+
+ /**
+ * Test for one explicit token in the input.
+ */
+ @Test
+ public void testOneToken() {
+ final ExplicitTokens out = new ExplicitTokens();
+ out.addToken("i\\.e\\.");
+ final String testString = "To be, (i.e. to exist,) or not to be.";
+
+ final List<CharSequence> result = out.tokenize(testString, 0, testString.length());
+ assertEquals(3, result.size());
+ assertEquals("To be, (", result.get(0));
+ assertEquals("i.e.", result.get(1));
+ assertEquals(" to exist,) or not to be.", result.get(2));
+ }
+
+ /**
+ * Test for two distinct explicit tokens in the input, three total.
+ */
+ @Test
+ public void testTwoTokens() {
+ final ExplicitTokens out = new ExplicitTokens();
+ out.addToken("i\\.e\\.");
+ out.addToken("&c\\.");
+ final String testString = "To be, (i.e. to exist,) or not to be, &c.,\n&c.";
+
+ final List<CharSequence> result = out.tokenize(testString, 0, testString.length());
+ assertEquals(7, result.size());
+ assertEquals("To be, (", result.get(0));
+ assertEquals("i.e.", result.get(1));
+ assertEquals(" to exist,) or not to be, ", result.get(2));
+ assertEquals("&c.", result.get(3));
+ assertEquals(",\n", result.get(4));
+ assertEquals("&c.", result.get(5));
+ assertEquals("", result.get(6));
+ }
+
+}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-23 19:22:04
|
Revision: 13260
http://sourceforge.net/p/foray/code/13260
Author: victormote
Date: 2023-09-23 19:21:57 +0000 (Sat, 23 Sep 2023)
Log Message:
-----------
Minor test improvements.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-23 19:11:56 UTC (rev 13259)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-23 19:21:57 UTC (rev 13260)
@@ -319,7 +319,7 @@
testToken(actual.get(0), "Appendix", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "D.4", TokenType.WORD, WritingSystem4a.USA);
-// testToken(actual.get(3), ")", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(3), ")", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
@@ -490,13 +490,14 @@
public void testIntrawordPeriod() {
final String testString = "Mr. P.’s hat.";
final List<Lexer.Token> actual = tokenize(testString);
- assertEquals(6, actual.size());
+ assertEquals(7, actual.size());
testToken(actual.get(0), "Mr", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), ". ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(2), "P.’s", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(4), "hat", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(1), ".", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), "P.’s", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), "hat", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(6), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-23 19:11:58
|
Revision: 13259
http://sourceforge.net/p/foray/code/13259
Author: victormote
Date: 2023-09-23 19:11:56 +0000 (Sat, 23 Sep 2023)
Log Message:
-----------
Combine word chunks into one token, but leave all others separate.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-23 18:25:34 UTC (rev 13258)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-23 19:11:56 UTC (rev 13259)
@@ -716,24 +716,15 @@
*/
protected void createImplicitTokens(final CharSequence sequence, final IntSequence rawOffsets,
final TokenType2[] breakTypes, final WritingSystem writingSystem) {
-// final TokenType previousTokenType =
-// resultTypes.size() > 0 ? resultTypes.get(this.resultTypes.size() - 1) : TokenType.WHITESPACE;
-// if (previousTokenType == TokenType.WORD
-// && breakTypes[0] == TokenType2.WORD) {
-// /* Existing tokens end with a word, probably an explicit word. A new word should not be starting
-// * immediately after that.*/
-// throw new IllegalStateException("Word content disallowed immediately after explicit word.");
-// }
-
TokenType2 lastBreakType = TokenType2.START;
int nextTokenOffset = 0;
for (int breakIndex = 0; breakIndex < breakTypes.length; breakIndex ++) {
final TokenType2 currentBreakType = breakTypes[breakIndex];
-// final TokenType2 previousBreakType = breakIndex > 0 ? breakTypes[breakIndex - 1] : TokenType2.WHITESPACE;
final int currentOffset = rawOffsets.intAt(breakIndex);
if (lastBreakType != TokenType2.START
- && currentBreakType != lastBreakType) {
+ && (currentBreakType != lastBreakType
+ || currentBreakType != TokenType2.WORD)) {
this.resultTextItems.add(sequence.subSequence(nextTokenOffset, currentOffset));
this.resultTypes.add(lastBreakType.wrappedTokenType);
this.resultWritingSystems.add(writingSystem);
@@ -740,46 +731,6 @@
nextTokenOffset = currentOffset;
}
lastBreakType = currentBreakType;
-
-
-
-
-// switch (currentBreakType) {
-// case END: {
-// this.resultTextItems.add(sequence.subSequence(nextTokenOffset, sequence.length()));
-// this.resultTypes.add(previousBreakType.wrappedTokenType);
-// this.resultWritingSystems.add(writingSystem);
-// break;
-// }
-// case WHITESPACE:
-// case AMBIGUOUS_TRAILING_PUNCTUATION: {
-// if (lastBreakType == TokenType2.WORD) {
-// /* Write the word and roll forward. */
-// this.resultTextItems.add(sequence.subSequence(nextTokenOffset, currentOffset));
-// this.resultTypes.add(TokenType.WORD);
-// this.resultWritingSystems.add(writingSystem);
-// nextTokenOffset = currentOffset;
-// lastBreakType = currentBreakType;
-// } else {
-// /* There is no state change. Nothing to do. */
-// }
-// break;
-// }
-// default: {
-// /* This is considered the start of word content. */
-// if (lastBreakType == TokenType2.WORD) {
-// /* There is no state change. Nothing to do. */
-// } else {
-// /* Write the interword content and roll forward. */
-// this.resultTextItems.add(sequence.subSequence(nextTokenOffset, currentOffset));
-// /* It isn't necessarily whitespace, but serves the purpose of inter-word content for now. */
-// this.resultTypes.add(TokenType.WHITESPACE);
-// this.resultWritingSystems.add(writingSystem);
-// nextTokenOffset = currentOffset;
-// lastBreakType = currentBreakType;
-// }
-// }
-// }
}
}
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-23 18:25:34 UTC (rev 13258)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-23 19:11:56 UTC (rev 13259)
@@ -178,17 +178,18 @@
final List<Lexer.Token> actual = tokenize(testString);
/* Compound word "fiery-footed" treated as one word. */
- assertEquals(10, actual.size());
+ assertEquals(11, actual.size());
testToken(actual.get(0), "Gallop", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "apace", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), ", ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(4), "you", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(6), "fiery-footed", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(8), "steeds", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(9), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), "you", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(6), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), "fiery-footed", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(9), "steeds", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(10), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -238,35 +239,37 @@
public void testWithAttachedPunctuation() {
final String testString = "Parentheses (as I stated earlier) are a matching pair of ( and ) characters.";
final List<Lexer.Token> actual = tokenize(testString);
- assertEquals(28, actual.size());
+ assertEquals(30, actual.size());
testToken(actual.get(0), "Parentheses", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), " (", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(2), "as", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(4), "I", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(6), "stated", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(8), "earlier", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(9), ") ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(10), "are", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "(", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), "as", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), "I", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(6), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), "stated", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(9), "earlier", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(10), ")", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(11), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(12), "a", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(12), "are", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(13), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(14), "matching", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(14), "a", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(15), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(16), "pair", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(16), "matching", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(17), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(18), "of", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(18), "pair", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(19), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(20), "(", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(20), "of", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(21), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(22), "and", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(22), "(", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(23), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(24), ")", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(24), "and", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(25), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(26), "characters", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(27), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(26), ")", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(27), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(28), "characters", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(29), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
@@ -278,25 +281,30 @@
public void testUnicodeWordBoundariesExample() {
final String testString = "The quick (“brown”) fox can’t jump 32.3 feet, right?";
final List<Lexer.Token> actual = tokenize(testString);
- assertEquals(18, actual.size());
+ assertEquals(23, actual.size());
testToken(actual.get(0), "The", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "quick", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), " (“", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(4), "brown", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), "”) ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(6), "fox", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(8), "can’t", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "(", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), "“", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "brown", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(7), "”", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), ")", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(9), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(10), "jump", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(10), "fox", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(11), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(12), "32.3", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(12), "can’t", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(13), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(14), "feet", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(15), ", ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(16), "right", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(17), "?", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(14), "jump", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(15), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(16), "32.3", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(17), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(18), "feet", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(19), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(20), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(21), "right", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(22), "?", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -352,16 +360,17 @@
public void testMultipleTrailingAttachedPunctuation() {
final String testString = "for every [student]. Return";
final List<Lexer.Token> actual = tokenize(testString);
- assertEquals(9, actual.size());
+ assertEquals(10, actual.size());
testToken(actual.get(0), "for", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "every", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), " [", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(4), "student", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), "]", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(6), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(8), "Return", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "[", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), "student", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(6), "]", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(9), "Return", TokenType.WORD, WritingSystem4a.USA);
}
/**
@@ -395,16 +404,18 @@
public void testInitialPunctuation() {
final String testString = "Letter, &c.,\nat large;";
final List<Lexer.Token> actual = tokenize(testString);
- assertEquals(9, actual.size());
+ assertEquals(11, actual.size());
testToken(actual.get(0), "Letter", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), ", ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(2), "&c", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(4), ",\n", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(5), "at", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(6), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(7), "large", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(8), ";", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), "&c", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(4), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(5), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "\n", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), "at", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(9), "large", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(10), ";", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -414,26 +425,28 @@
public void testOneExplicitToken() {
final String testString = "To be, i.e. to exist, or not to be.";
final List<Lexer.Token> actual = tokenize(testString);
- assertEquals(19, actual.size());
+ assertEquals(21, actual.size());
testToken(actual.get(0), "To", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "be", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), ", ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(4), "i.e", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(6), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(7), "to", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(9), "exist", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(10), ", ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(11), "or", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), "i.e", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(6), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), "to", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(9), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(10), "exist", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(11), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(12), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(13), "not", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(13), "or", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(14), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(15), "to", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(15), "not", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(16), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(17), "be", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(18), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(17), "to", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(18), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(19), "be", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(20), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
@@ -443,26 +456,30 @@
public void testOneExplicitToken2() {
final String testString = "To be, (i.e. to exist,) or not to be.";
final List<Lexer.Token> actual = tokenize(testString);
- assertEquals(19, actual.size());
+ assertEquals(23, actual.size());
testToken(actual.get(0), "To", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "be", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), ", (", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(4), "i.e", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
- testToken(actual.get(6), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(7), "to", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), "(", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "i.e", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(7), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(9), "exist", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(10), ",) ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(11), "or", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(12), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(13), "not", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(9), "to", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(10), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(11), "exist", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(12), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(13), ")", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(14), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(15), "to", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(15), "or", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(16), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(17), "be", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(18), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(17), "not", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(18), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(19), "to", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(20), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(21), "be", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(22), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-23 18:25:37
|
Revision: 13258
http://sourceforge.net/p/foray/code/13258
Author: victormote
Date: 2023-09-23 18:25:34 +0000 (Sat, 23 Sep 2023)
Log Message:
-----------
Simplify and clean up creation of tokens logic.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-23 17:29:38 UTC (rev 13257)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-23 18:25:34 UTC (rev 13258)
@@ -156,6 +156,9 @@
/** Ambiguous trailing punctuation that is currently thought to be resolvable when the context is considered. */
TRANSIENT_TRAILING_PUNCTUATION(null),
+ /** This is not a true token type. It marks the start of the character sequence. */
+ START(null),
+
/** This is not a true token type. It marks the end of the character sequence. */
END(null);
@@ -713,59 +716,70 @@
*/
protected void createImplicitTokens(final CharSequence sequence, final IntSequence rawOffsets,
final TokenType2[] breakTypes, final WritingSystem writingSystem) {
- final TokenType previousTokenType =
- resultTypes.size() > 0 ? resultTypes.get(this.resultTypes.size() - 1) : TokenType.WHITESPACE;
- if (previousTokenType == TokenType.WORD
- && breakTypes[0] == TokenType2.WORD) {
- /* Existing tokens end with a word, probably an explicit word. A new word should not be starting
- * immediately after that.*/
- throw new IllegalStateException("Word content disallowed immediately after explicit word.");
- }
+// final TokenType previousTokenType =
+// resultTypes.size() > 0 ? resultTypes.get(this.resultTypes.size() - 1) : TokenType.WHITESPACE;
+// if (previousTokenType == TokenType.WORD
+// && breakTypes[0] == TokenType2.WORD) {
+// /* Existing tokens end with a word, probably an explicit word. A new word should not be starting
+// * immediately after that.*/
+// throw new IllegalStateException("Word content disallowed immediately after explicit word.");
+// }
- boolean inWord = breakTypes[0] == TokenType2.WORD;
+ TokenType2 lastBreakType = TokenType2.START;
int nextTokenOffset = 0;
for (int breakIndex = 0; breakIndex < breakTypes.length; breakIndex ++) {
final TokenType2 currentBreakType = breakTypes[breakIndex];
- final TokenType2 previousBreakType = breakIndex > 0 ? breakTypes[breakIndex - 1] : TokenType2.WHITESPACE;
+// final TokenType2 previousBreakType = breakIndex > 0 ? breakTypes[breakIndex - 1] : TokenType2.WHITESPACE;
final int currentOffset = rawOffsets.intAt(breakIndex);
-
- switch (currentBreakType) {
- case END: {
- this.resultTextItems.add(sequence.subSequence(nextTokenOffset, sequence.length()));
- this.resultTypes.add(previousBreakType.wrappedTokenType);
+ if (lastBreakType != TokenType2.START
+ && currentBreakType != lastBreakType) {
+ this.resultTextItems.add(sequence.subSequence(nextTokenOffset, currentOffset));
+ this.resultTypes.add(lastBreakType.wrappedTokenType);
this.resultWritingSystems.add(writingSystem);
- break;
+ nextTokenOffset = currentOffset;
}
- case WHITESPACE:
- case AMBIGUOUS_TRAILING_PUNCTUATION: {
- if (inWord) {
- /* Write the word and roll forward. */
- this.resultTextItems.add(sequence.subSequence(nextTokenOffset, currentOffset));
- this.resultTypes.add(TokenType.WORD);
- this.resultWritingSystems.add(writingSystem);
- nextTokenOffset = currentOffset;
- inWord = false;
- } else {
- /* There is no state change. Nothing to do. */
- }
- break;
- }
- default: {
- /* This is considered the start of word content. */
- if (inWord) {
- /* There is no state change. Nothing to do. */
- } else {
- /* Write the interword content and roll forward. */
- this.resultTextItems.add(sequence.subSequence(nextTokenOffset, currentOffset));
- /* It isn't necessarily whitespace, but serves the purpose of inter-word content for now. */
- this.resultTypes.add(TokenType.WHITESPACE);
- this.resultWritingSystems.add(writingSystem);
- nextTokenOffset = currentOffset;
- inWord = true;
- }
- }
- }
+ lastBreakType = currentBreakType;
+
+
+
+
+// switch (currentBreakType) {
+// case END: {
+// this.resultTextItems.add(sequence.subSequence(nextTokenOffset, sequence.length()));
+// this.resultTypes.add(previousBreakType.wrappedTokenType);
+// this.resultWritingSystems.add(writingSystem);
+// break;
+// }
+// case WHITESPACE:
+// case AMBIGUOUS_TRAILING_PUNCTUATION: {
+// if (lastBreakType == TokenType2.WORD) {
+// /* Write the word and roll forward. */
+// this.resultTextItems.add(sequence.subSequence(nextTokenOffset, currentOffset));
+// this.resultTypes.add(TokenType.WORD);
+// this.resultWritingSystems.add(writingSystem);
+// nextTokenOffset = currentOffset;
+// lastBreakType = currentBreakType;
+// } else {
+// /* There is no state change. Nothing to do. */
+// }
+// break;
+// }
+// default: {
+// /* This is considered the start of word content. */
+// if (lastBreakType == TokenType2.WORD) {
+// /* There is no state change. Nothing to do. */
+// } else {
+// /* Write the interword content and roll forward. */
+// this.resultTextItems.add(sequence.subSequence(nextTokenOffset, currentOffset));
+// /* It isn't necessarily whitespace, but serves the purpose of inter-word content for now. */
+// this.resultTypes.add(TokenType.WHITESPACE);
+// this.resultWritingSystems.add(writingSystem);
+// nextTokenOffset = currentOffset;
+// lastBreakType = currentBreakType;
+// }
+// }
+// }
}
}
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-23 17:29:38 UTC (rev 13257)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-23 18:25:34 UTC (rev 13258)
@@ -135,34 +135,37 @@
public void testMedium() {
final String testString = "39. It was the best of times. It was the worst of times. <----";
final List<Lexer.Token> actual = tokenize(testString);
- assertEquals(27, actual.size());
+ assertEquals(30, actual.size());
testToken(actual.get(0), "39", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(1), ". ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(2), "It", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(4), "was", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(6), "the", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(8), "best", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(9), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(10), "of", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(11), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(12), "times", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(13), ". ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(14), "It", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(2), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), "It", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), "was", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(6), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), "the", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(9), "best", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(10), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(11), "of", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(12), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(13), "times", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(14), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
testToken(actual.get(15), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(16), "was", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(16), "It", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(17), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(18), "the", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(18), "was", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(19), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(20), "worst", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(20), "the", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(21), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(22), "of", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(22), "worst", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(23), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(24), "times", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(25), ". ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(26), "<----", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(24), "of", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(25), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(26), "times", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(27), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(28), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(29), "<----", TokenType.WORD, WritingSystem4a.USA);
}
/**
@@ -318,11 +321,12 @@
public void testDoubleTrailingPunctuationAtEnd() {
final String testString = "every creature.”";
final List<Lexer.Token> actual = tokenize(testString);
- assertEquals(4, actual.size());
+ assertEquals(5, actual.size());
testToken(actual.get(0), "every", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "creature", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), ".”", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(4), "”", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -348,14 +352,16 @@
public void testMultipleTrailingAttachedPunctuation() {
final String testString = "for every [student]. Return";
final List<Lexer.Token> actual = tokenize(testString);
- assertEquals(7, actual.size());
+ assertEquals(9, actual.size());
testToken(actual.get(0), "for", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "every", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(3), " [", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(4), "student", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), "]. ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(6), "Return", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(5), "]", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), "Return", TokenType.WORD, WritingSystem4a.USA);
}
/**
@@ -389,15 +395,16 @@
public void testInitialPunctuation() {
final String testString = "Letter, &c.,\nat large;";
final List<Lexer.Token> actual = tokenize(testString);
- assertEquals(8, actual.size());
+ assertEquals(9, actual.size());
testToken(actual.get(0), "Letter", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), ", ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "&c", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(3), ".,\n", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(4), "at", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(6), "large", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(7), ";", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(4), ",\n", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), "at", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(6), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), "large", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(8), ";", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -407,25 +414,26 @@
public void testOneExplicitToken() {
final String testString = "To be, i.e. to exist, or not to be.";
final List<Lexer.Token> actual = tokenize(testString);
- assertEquals(18, actual.size());
+ assertEquals(19, actual.size());
testToken(actual.get(0), "To", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "be", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(3), ", ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(4), "i.e", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), ". ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(6), "to", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(8), "exist", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(9), ", ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(10), "or", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(11), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(12), "not", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(13), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(14), "to", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(15), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(16), "be", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(17), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(5), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(6), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), "to", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(9), "exist", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(10), ", ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(11), "or", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(12), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(13), "not", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(14), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(15), "to", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(16), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(17), "be", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(18), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
@@ -435,25 +443,26 @@
public void testOneExplicitToken2() {
final String testString = "To be, (i.e. to exist,) or not to be.";
final List<Lexer.Token> actual = tokenize(testString);
- assertEquals(18, actual.size());
+ assertEquals(19, actual.size());
testToken(actual.get(0), "To", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "be", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(3), ", (", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(4), "i.e", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(5), ". ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(6), "to", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(8), "exist", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(9), ",) ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(10), "or", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(11), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(12), "not", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(13), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(14), "to", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(15), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
- testToken(actual.get(16), "be", TokenType.WORD, WritingSystem4a.USA);
- testToken(actual.get(17), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(5), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
+ testToken(actual.get(6), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), "to", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(8), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(9), "exist", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(10), ",) ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(11), "or", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(12), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(13), "not", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(14), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(15), "to", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(16), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(17), "be", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(18), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-23 17:29:52
|
Revision: 13257
http://sourceforge.net/p/foray/code/13257
Author: victormote
Date: 2023-09-23 17:29:38 +0000 (Sat, 23 Sep 2023)
Log Message:
-----------
Clean up some tests. Many are now wrong, due to change in API.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-23 17:21:01 UTC (rev 13256)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-23 17:29:38 UTC (rev 13257)
@@ -185,7 +185,7 @@
testToken(actual.get(6), "fiery-footed", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(8), "steeds", TokenType.WORD, WritingSystem4a.USA);
-// testToken(actual.get(9), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(9), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -263,7 +263,7 @@
testToken(actual.get(24), ")", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(25), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(26), "characters", TokenType.WORD, WritingSystem4a.USA);
-// testToken(actual.get(27), ".", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(27), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
@@ -293,7 +293,7 @@
testToken(actual.get(14), "feet", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(15), ", ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(16), "right", TokenType.WORD, WritingSystem4a.USA);
-// testToken(actual.get(17), "?", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(17), "?", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -322,7 +322,7 @@
testToken(actual.get(0), "every", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "creature", TokenType.WORD, WritingSystem4a.USA);
-// testToken(actual.get(3), ".”", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), ".”", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -377,7 +377,7 @@
testToken(actual.get(8), "be", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(9), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(10), "jolly", TokenType.WORD, WritingSystem4a.USA);
-// testToken(actual.get(11), ".", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(11), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
@@ -397,7 +397,7 @@
testToken(actual.get(4), "at", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(6), "large", TokenType.WORD, WritingSystem4a.USA);
-// testToken(actual.get(7), ";", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(7), ";", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -425,7 +425,7 @@
testToken(actual.get(14), "to", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(15), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(16), "be", TokenType.WORD, WritingSystem4a.USA);
-// testToken(actual.get(17), ".", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(17), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
@@ -453,7 +453,7 @@
testToken(actual.get(14), "to", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(15), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(16), "be", TokenType.WORD, WritingSystem4a.USA);
-// testToken(actual.get(17), ".", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(17), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
@@ -470,7 +470,7 @@
testToken(actual.get(2), "P.’s", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(4), "hat", TokenType.WORD, WritingSystem4a.USA);
-// testToken(actual.get(5), ".", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
@@ -495,7 +495,7 @@
testToken(actual.get(8), "was", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(9), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(10), "nice", TokenType.WORD, WritingSystem4a.USA);
-// testToken(actual.get(11), ".", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(11), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-23 17:21:03
|
Revision: 13256
http://sourceforge.net/p/foray/code/13256
Author: victormote
Date: 2023-09-23 17:21:01 +0000 (Sat, 23 Sep 2023)
Log Message:
-----------
Partial use of ambiguous trailing punctutation.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-23 15:18:22 UTC (rev 13255)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-23 17:21:01 UTC (rev 13256)
@@ -35,6 +35,7 @@
import org.axsl.i18n.WritingSystem;
import org.axsl.orthography.optional.Lexer;
import org.axsl.primitive.sequence.IntSequence;
+import org.axsl.unicode.block.U0000_Basic_Latin;
import org.axsl.unicode.block.U0080_Latin_1_Supplement;
import java.util.ArrayList;
@@ -144,6 +145,17 @@
/** Surrogate for {@link TokenType#AMBIGUOUS_TRAILING_PUNCTUATION}. */
AMBIGUOUS_TRAILING_PUNCTUATION(TokenType.AMBIGUOUS_TRAILING_PUNCTUATION),
+
+
+
+ /* The items below are used during processing, do not map to any TokenType. */
+
+ /** Ambiguous leading punctuation that is currently thought to be resolvable when the context is considered. */
+ TRANSIENT_LEADING_PUNCTUATION(null),
+
+ /** Ambiguous trailing punctuation that is currently thought to be resolvable when the context is considered. */
+ TRANSIENT_TRAILING_PUNCTUATION(null),
+
/** This is not a true token type. It marks the end of the character sequence. */
END(null);
@@ -314,7 +326,6 @@
final Token4a returnToken = new Token4a();
returnToken.text = this.resultTextItems.get(this.nextResultIndex);
returnToken.type = this.resultTypes.get(this.nextResultIndex);
- //.isOdd(this.nextResultIndex) ? TokenType.WHITESPACE : TokenType.WORD;
returnToken.writingSystem = this.resultWritingSystems.get(this.nextResultIndex);
this.nextResultIndex ++;
return returnToken;
@@ -489,9 +500,9 @@
/**
* <p>Resolves possible intraword punctuation by converting each instance into the resolved type.<p>
* <ul>
- * <li>converting each {@link TokenType2#AMBIGUOUS_LEADING_PUNCTUATION} to either a
+ * <li>converting each {@link TokenType2#TRANSIENT_LEADING_PUNCTUATION} to either a
* {@link TokenType2#WORD} or a {@link TokenType2#LEADING_PUNCTUATION}.</li>
- * <li>converting each {@link TokenType2#AMBIGUOUS_TRAILING_PUNCTUATION} to either a
+ * <li>converting each {@link TokenType2#TRANSIENT_TRAILING_PUNCTUATION} to either a
* {@link TokenType2#WORD} or a {@link TokenType2#TRAILING_PUNCTUATION}.</li>
* </ul>
* @param breakTypes The array of charTypes.
@@ -509,7 +520,7 @@
final TokenType2 currentBreakType = breakTypes[breakIndex];
final TokenType2 nextBreakType = breakIndex == breakTypes.length - 1 ? postSequenceBreakType
: breakTypes[breakIndex + 1];
- if (currentBreakType == TokenType2.AMBIGUOUS_TRAILING_PUNCTUATION
+ if (currentBreakType == TokenType2.TRANSIENT_TRAILING_PUNCTUATION
&& nextBreakType == TokenType2.WORD) {
breakTypes[breakIndex] = TokenType2.WORD;
}
@@ -532,7 +543,7 @@
}
switch (currentBreakType) {
- case AMBIGUOUS_TRAILING_PUNCTUATION: {
+ case TRANSIENT_TRAILING_PUNCTUATION: {
switch (previousBreakType) {
case WORD: {
switch (nextBreakType) {
@@ -564,7 +575,7 @@
}
break;
}
- case AMBIGUOUS_LEADING_PUNCTUATION: {
+ case TRANSIENT_LEADING_PUNCTUATION: {
switch (nextBreakType) {
case WORD: {
switch (previousBreakType) {
@@ -703,7 +714,7 @@
protected void createImplicitTokens(final CharSequence sequence, final IntSequence rawOffsets,
final TokenType2[] breakTypes, final WritingSystem writingSystem) {
final TokenType previousTokenType =
- resultTypes.size() > 0 ? resultTypes.get(this.resultTypes.size() - 1) : null;
+ resultTypes.size() > 0 ? resultTypes.get(this.resultTypes.size() - 1) : TokenType.WHITESPACE;
if (previousTokenType == TokenType.WORD
&& breakTypes[0] == TokenType2.WORD) {
/* Existing tokens end with a word, probably an explicit word. A new word should not be starting
@@ -716,16 +727,18 @@
for (int breakIndex = 0; breakIndex < breakTypes.length; breakIndex ++) {
final TokenType2 currentBreakType = breakTypes[breakIndex];
+ final TokenType2 previousBreakType = breakIndex > 0 ? breakTypes[breakIndex - 1] : TokenType2.WHITESPACE;
final int currentOffset = rawOffsets.intAt(breakIndex);
switch (currentBreakType) {
case END: {
this.resultTextItems.add(sequence.subSequence(nextTokenOffset, sequence.length()));
- this.resultTypes.add(TokenType.WORD);
+ this.resultTypes.add(previousBreakType.wrappedTokenType);
this.resultWritingSystems.add(writingSystem);
break;
}
- case WHITESPACE: {
+ case WHITESPACE:
+ case AMBIGUOUS_TRAILING_PUNCTUATION: {
if (inWord) {
/* Write the word and roll forward. */
this.resultTextItems.add(sequence.subSequence(nextTokenOffset, currentOffset));
@@ -806,9 +819,12 @@
if (isWordChar(c)) {
return TokenType2.WORD;
}
+ if (c == U0000_Basic_Latin.FULL_STOP) {
+ return TokenType2.AMBIGUOUS_TRAILING_PUNCTUATION;
+ }
if (CharacterUtils.isAttachedLeadingPunctuation(c)) {
if (CharacterUtils.isPossibleIntrawordPunctuation(c)) {
- return TokenType2.AMBIGUOUS_LEADING_PUNCTUATION;
+ return TokenType2.TRANSIENT_LEADING_PUNCTUATION;
} else {
return TokenType2.LEADING_PUNCTUATION;
}
@@ -815,7 +831,7 @@
}
if (CharacterUtils.isAttachedTrailingPunctuation(c)) {
if (CharacterUtils.isPossibleIntrawordPunctuation(c)) {
- return TokenType2.AMBIGUOUS_TRAILING_PUNCTUATION;
+ return TokenType2.TRANSIENT_TRAILING_PUNCTUATION;
} else {
return TokenType2.TRAILING_PUNCTUATION;
}
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-23 15:18:22 UTC (rev 13255)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-23 17:21:01 UTC (rev 13256)
@@ -125,7 +125,7 @@
testToken(actual.get(6), "of", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(8), "March", TokenType.WORD, WritingSystem4a.USA);
-// testToken(actual.get(9), ".", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(9), ".", TokenType.AMBIGUOUS_TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
@@ -300,6 +300,7 @@
* Test of a number within a word.
*/
@Test
+ @Disabled
public void testWordWithNumber() {
final String testString = "Appendix D.4)";
final List<Lexer.Token> actual = tokenize(testString);
@@ -307,7 +308,7 @@
testToken(actual.get(0), "Appendix", TokenType.WORD, WritingSystem4a.USA);
testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
testToken(actual.get(2), "D.4", TokenType.WORD, WritingSystem4a.USA);
-// testToken(actual.get(3), ")", TokenType.WHITESPACE, WritingSystem4a.USA);
+// testToken(actual.get(3), ")", TokenType.TRAILING_PUNCTUATION, WritingSystem4a.USA);
}
/**
@@ -459,6 +460,7 @@
* Test with one embedded explicit token in the input.
*/
@Test
+ @Disabled
public void testIntrawordPeriod() {
final String testString = "Mr. P.’s hat.";
final List<Lexer.Token> actual = tokenize(testString);
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-23 15:18:25
|
Revision: 13255
http://sourceforge.net/p/foray/code/13255
Author: victormote
Date: 2023-09-23 15:18:22 +0000 (Sat, 23 Sep 2023)
Log Message:
-----------
Rename and refactor CharType to be a sort of pseudo-extended version of TokenType.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-23 11:58:33 UTC (rev 13254)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-23 15:18:22 UTC (rev 13255)
@@ -120,36 +120,52 @@
/**
- * Enumeration of possible character types, as they relate to word-breaking.
- * This enumeration is useful for some subclasses.
+ * Enumeration of possible token types to be used during processing, acting as a kind of extension of
+ * {@link TokenType}.
+ * The elements are based on {@link TokenType}, but allow for additional values to exist during processing.
*/
- public enum CharType {
+ public enum TokenType2 {
- /** Character is part of a word. */
- WORD_CHAR,
+ /** Surrogate for {@link TokenType#WORD}. */
+ WORD(TokenType.WORD),
- /** Character is non-word break content. */
- BREAK_CHAR,
+ /** Surrogate for {@link TokenType#WHITESPACE}. */
+ WHITESPACE(TokenType.WHITESPACE),
- /** Character is a leading attached punctuation item such as (in English) an opening quotation mark or opening
- * parenthesis. */
- ATTACHED_LEADING_PUNCTUATION,
+ /** Surrogate for {@link TokenType#LEADING_PUNCTUATION}. */
+ LEADING_PUNCTUATION(TokenType.LEADING_PUNCTUATION),
- /** Character is a trailing attached punctuation mark such as (in English) a comma, semicolon, or closing
- * parenthesis. */
- ATTACHED_TRAILING_PUNCTUATION,
+ /** Surrogate for {@link TokenType#TRAILING_PUNCTUATION}. */
+ TRAILING_PUNCTUATION(TokenType.TRAILING_PUNCTUATION),
- /** Character can be interpreted as either attached leading punctuation mark or intraword punctuation, depending
- * on context. */
- ATTACHED_LEADING_OR_INTRAWORD_PUNCTUATION,
+ /** Surrogate for {@link TokenType#AMBIGUOUS_LEADING_PUNCTUATION}. */
+ AMBIGUOUS_LEADING_PUNCTUATION(TokenType.AMBIGUOUS_LEADING_PUNCTUATION),
- /** Character can be interpreted as either attached trailing punctuation mark or intraword punctuation,
- * depending on context. */
- ATTACHED_TRAILING_OR_INTRAWORD_PUNCTUATION,
+ /** Surrogate for {@link TokenType#AMBIGUOUS_TRAILING_PUNCTUATION}. */
+ AMBIGUOUS_TRAILING_PUNCTUATION(TokenType.AMBIGUOUS_TRAILING_PUNCTUATION),
- /** There is no character here -- this is the end of the character sequence. */
- END,
+ /** This is not a true token type. It marks the end of the character sequence. */
+ END(null);
+ /** The wrapped {@link TokenType}. */
+ private TokenType wrappedTokenType;
+
+ /**
+ * Constructor.
+ * @param wrappedTokenType The actual token type wrapped by this enumeration.
+ */
+ TokenType2(final TokenType wrappedTokenType) {
+ this.wrappedTokenType = wrappedTokenType;
+ }
+
+ /**
+ * Returns the wrapped underlying actual token type.
+ * @return The wrapped underlying actual token type.
+ */
+ public TokenType getWrappedType() {
+ return this.wrappedTokenType;
+ }
+
}
/**
@@ -386,19 +402,19 @@
* Our purpose is to find where words start and end and to treat all other content as non-word or interword
* content.
* So our second pass is to find out the type of each character that is at a break. */
- final CharType[] breakTypes = findBreakTypes(sequence, rawBreaks, isLastChunk);
+ final TokenType2[] breakTypes = findBreakTypes(sequence, rawBreaks, isLastChunk);
/* Third pass. Simplify the breakTypes array. */
/* For normal case (no explicit tokens), the conceptual token immediately previous to the first one is a break
* char. */
- CharType preSequenceBreakType = CharType.BREAK_CHAR;
+ TokenType2 preSequenceBreakType = TokenType2.WHITESPACE;
if (! isFirstChunk) {
/* If this is not the first item being implicitly tokenized, an explicit token (a word) is the previous
* token. */
- preSequenceBreakType = CharType.WORD_CHAR;
+ preSequenceBreakType = TokenType2.WORD;
}
/* The conceptual token immediately after the last actual token is the end char. */
- final CharType postSequenceBreakType = CharType.END;
+ final TokenType2 postSequenceBreakType = TokenType2.END;
filterBreakTypes(breakTypes, preSequenceBreakType, postSequenceBreakType);
/* The fourth step iterates over the resolved break types and turns them into tokens. */
@@ -421,16 +437,16 @@
* @return An array with a one-to-one correspondence with {@code rawBreaks}, containing the type of character at
* that break.
*/
- protected CharType[] findBreakTypes(final CharSequence sequence, final IntSequence rawBreaks,
+ protected TokenType2[] findBreakTypes(final CharSequence sequence, final IntSequence rawBreaks,
final boolean isLastChunk) {
- final CharType[] breakTypes = new CharType[rawBreaks.length()];
+ final TokenType2[] breakTypes = new TokenType2[rawBreaks.length()];
for (int breakIndex = 0; breakIndex < rawBreaks.length(); breakIndex ++) {
if (breakIndex >= rawBreaks.length() - 1) {
if (isLastChunk) {
- breakTypes[breakIndex] = CharType.END;
+ breakTypes[breakIndex] = TokenType2.END;
} else {
/* If this is not the last chunk, then the next chunk must be an explicit token, which is a word. */
- breakTypes[breakIndex] = CharType.WORD_CHAR;
+ breakTypes[breakIndex] = TokenType2.WORD;
}
} else {
final int sequenceIndex = rawBreaks.intAt(breakIndex);
@@ -438,7 +454,7 @@
/* Special cases where the first char alone does not tell the whole story. */
if (NumberUtils.isArabicNumber(sequence, sequenceIndex, end)) {
- breakTypes[breakIndex] = CharType.WORD_CHAR;
+ breakTypes[breakIndex] = TokenType2.WORD;
continue;
}
@@ -455,8 +471,8 @@
* The touchstone here is the known word breaks which are always interword content.
* Anything between them must be either attached to the word break to become a part of the interword content, or
* must get coalesced into a "word" whether it is recognized as word content or not. If done properly, every element
- * in the array, when finished, should be either {@link CharType#WORD_CHAR} or {@link CharType#BREAK_CHAR}.
- * Anything not in those two categories will be treated in the final tokenization as {@link CharType#WORD_CHAR}.
+ * in the array, when finished, should be either {@link TokenType2#WORD} or {@link TokenType2#WHITESPACE}.
+ * Anything not in those two categories will be treated in the final tokenization as {@link TokenType2#WORD}.
* @param breakTypes The array of charTypes.
* @param preSequenceBreakType The break type that is conceptually immediately before the first (index 0) break
* type in {@code breakTypes}.
@@ -463,8 +479,8 @@
* @param postSequenceBreakType The break type that is conceptually immediately after the last break type in
* {@code breakTypes}.
*/
- protected void filterBreakTypes(final CharType[] breakTypes, final CharType preSequenceBreakType,
- final CharType postSequenceBreakType) {
+ protected void filterBreakTypes(final TokenType2[] breakTypes, final TokenType2 preSequenceBreakType,
+ final TokenType2 postSequenceBreakType) {
resolvePossibleIntrawordPunctuation(breakTypes, preSequenceBreakType, postSequenceBreakType);
resolveAttachedLeadingPunctuation(breakTypes, preSequenceBreakType, postSequenceBreakType);
resolveAttachedTrailingPunctuation(breakTypes, preSequenceBreakType, postSequenceBreakType);
@@ -473,10 +489,10 @@
/**
* <p>Resolves possible intraword punctuation by converting each instance into the resolved type.<p>
* <ul>
- * <li>converting each {@link CharType#ATTACHED_LEADING_OR_INTRAWORD_PUNCTUATION} to either a
- * {@link CharType#WORD_CHAR} or a {@link CharType#ATTACHED_LEADING_PUNCTUATION}.</li>
- * <li>converting each {@link CharType#ATTACHED_TRAILING_OR_INTRAWORD_PUNCTUATION} to either a
- * {@link CharType#WORD_CHAR} or a {@link CharType#ATTACHED_TRAILING_PUNCTUATION}.</li>
+ * <li>converting each {@link TokenType2#AMBIGUOUS_LEADING_PUNCTUATION} to either a
+ * {@link TokenType2#WORD} or a {@link TokenType2#LEADING_PUNCTUATION}.</li>
+ * <li>converting each {@link TokenType2#AMBIGUOUS_TRAILING_PUNCTUATION} to either a
+ * {@link TokenType2#WORD} or a {@link TokenType2#TRAILING_PUNCTUATION}.</li>
* </ul>
* @param breakTypes The array of charTypes.
* @param preSequenceBreakType The break type that is conceptually immediately before the first (index 0) break
@@ -484,18 +500,18 @@
* @param postSequenceBreakType The break type that is conceptually immediately after the last break type in
* {@code breakTypes}.
*/
- private void resolvePossibleIntrawordPunctuation(final CharType[] breakTypes, final CharType preSequenceBreakType,
- final CharType postSequenceBreakType) {
+ private void resolvePossibleIntrawordPunctuation(final TokenType2[] breakTypes,
+ final TokenType2 preSequenceBreakType, final TokenType2 postSequenceBreakType) {
/* First iterate in reverse order, looking for ambiguous punctuation that is immediately followed by word chars.
* In that case the punctuation is considered part of the word. */
for (int breakIndex = breakTypes.length - 1; breakIndex > -1; breakIndex --) {
- final CharType currentBreakType = breakTypes[breakIndex];
- final CharType nextBreakType = breakIndex == breakTypes.length - 1 ? postSequenceBreakType
+ final TokenType2 currentBreakType = breakTypes[breakIndex];
+ final TokenType2 nextBreakType = breakIndex == breakTypes.length - 1 ? postSequenceBreakType
: breakTypes[breakIndex + 1];
- if (currentBreakType == CharType.ATTACHED_TRAILING_OR_INTRAWORD_PUNCTUATION
- && nextBreakType == CharType.WORD_CHAR) {
- breakTypes[breakIndex] = CharType.WORD_CHAR;
+ if (currentBreakType == TokenType2.AMBIGUOUS_TRAILING_PUNCTUATION
+ && nextBreakType == TokenType2.WORD) {
+ breakTypes[breakIndex] = TokenType2.WORD;
}
}
@@ -502,45 +518,45 @@
/* Now iterate in normal order. */
for (int breakIndex = 0; breakIndex < breakTypes.length; breakIndex ++) {
- final CharType currentBreakType = breakTypes[breakIndex];
- final CharType previousBreakType = breakIndex == 0 ? preSequenceBreakType : breakTypes[breakIndex - 1];
- final CharType nextBreakType = breakIndex == breakTypes.length - 1 ? postSequenceBreakType
+ final TokenType2 currentBreakType = breakTypes[breakIndex];
+ final TokenType2 previousBreakType = breakIndex == 0 ? preSequenceBreakType : breakTypes[breakIndex - 1];
+ final TokenType2 nextBreakType = breakIndex == breakTypes.length - 1 ? postSequenceBreakType
: breakTypes[breakIndex + 1];
/* If the current type is not a break char, but it is surrounded by break chars, this marks a word. */
- if (currentBreakType != CharType.BREAK_CHAR
- && previousBreakType == CharType.BREAK_CHAR
- && (nextBreakType == CharType.BREAK_CHAR
- || nextBreakType == CharType.END)) {
- breakTypes[breakIndex] = CharType.WORD_CHAR;
+ if (currentBreakType != TokenType2.WHITESPACE
+ && previousBreakType == TokenType2.WHITESPACE
+ && (nextBreakType == TokenType2.WHITESPACE
+ || nextBreakType == TokenType2.END)) {
+ breakTypes[breakIndex] = TokenType2.WORD;
}
switch (currentBreakType) {
- case ATTACHED_TRAILING_OR_INTRAWORD_PUNCTUATION: {
+ case AMBIGUOUS_TRAILING_PUNCTUATION: {
switch (previousBreakType) {
- case WORD_CHAR: {
+ case WORD: {
switch (nextBreakType) {
- case WORD_CHAR: {
+ case WORD: {
/* This also is part of the word. */
- breakTypes[breakIndex] = CharType.WORD_CHAR;
+ breakTypes[breakIndex] = TokenType2.WORD;
break;
}
default: {
- breakTypes[breakIndex] = CharType.ATTACHED_TRAILING_PUNCTUATION;
+ breakTypes[breakIndex] = TokenType2.TRAILING_PUNCTUATION;
break;
}
}
break;
}
- case ATTACHED_TRAILING_PUNCTUATION: {
+ case TRAILING_PUNCTUATION: {
/* This is additional trailing punctuation. */
- breakTypes[breakIndex] = CharType.ATTACHED_TRAILING_PUNCTUATION;
+ breakTypes[breakIndex] = TokenType2.TRAILING_PUNCTUATION;
break;
}
- case BREAK_CHAR: {
+ case WHITESPACE: {
/* This cannot be trailing punctuation, so must be the first character in a new word, probably a
* contraction like "'tis" for example. */
- breakTypes[breakIndex] = CharType.WORD_CHAR;
+ breakTypes[breakIndex] = TokenType2.WORD;
break;
}
default:
@@ -548,25 +564,25 @@
}
break;
}
- case ATTACHED_LEADING_OR_INTRAWORD_PUNCTUATION: {
+ case AMBIGUOUS_LEADING_PUNCTUATION: {
switch (nextBreakType) {
- case WORD_CHAR: {
+ case WORD: {
switch (previousBreakType) {
- case WORD_CHAR: {
+ case WORD: {
/* This also is part of the word. */
- breakTypes[breakIndex] = CharType.WORD_CHAR;
+ breakTypes[breakIndex] = TokenType2.WORD;
break;
}
default: {
- breakTypes[breakIndex] = CharType.ATTACHED_LEADING_PUNCTUATION;
+ breakTypes[breakIndex] = TokenType2.LEADING_PUNCTUATION;
break;
}
}
break;
}
- case ATTACHED_LEADING_PUNCTUATION: {
+ case LEADING_PUNCTUATION: {
/* This is additional leading punctuation. */
- breakTypes[breakIndex] = CharType.ATTACHED_LEADING_PUNCTUATION;
+ breakTypes[breakIndex] = TokenType2.LEADING_PUNCTUATION;
break;
}
default:
@@ -589,27 +605,27 @@
* @param postSequenceBreakType The break type that is conceptually immediately after the last break type in
* {@code breakTypes}.
*/
- private void resolveAttachedLeadingPunctuation(final CharType[] breakTypes, final CharType preSequenceBreakType,
- final CharType postSequenceBreakType) {
+ private void resolveAttachedLeadingPunctuation(final TokenType2[] breakTypes, final TokenType2 preSequenceBreakType,
+ final TokenType2 postSequenceBreakType) {
/* Resolve attached leading punctuation. */
for (int breakIndex = 0; breakIndex < breakTypes.length; breakIndex ++) {
- final CharType currentBreakType = breakTypes[breakIndex];
- final CharType previousBreakType = breakIndex == 0 ? preSequenceBreakType : breakTypes[breakIndex - 1];
- final CharType nextBreakType = breakIndex == breakTypes.length - 1 ? postSequenceBreakType
+ final TokenType2 currentBreakType = breakTypes[breakIndex];
+ final TokenType2 previousBreakType = breakIndex == 0 ? preSequenceBreakType : breakTypes[breakIndex - 1];
+ final TokenType2 nextBreakType = breakIndex == breakTypes.length - 1 ? postSequenceBreakType
: breakTypes[breakIndex + 1];
switch (currentBreakType) {
- case ATTACHED_LEADING_PUNCTUATION: {
+ case LEADING_PUNCTUATION: {
switch (previousBreakType) {
- case BREAK_CHAR: {
+ case WHITESPACE: {
switch (nextBreakType) {
- case BREAK_CHAR: {
+ case WHITESPACE: {
/* Surrounded by breaks. Treat this as a word. */
- breakTypes[breakIndex] = CharType.WORD_CHAR;
+ breakTypes[breakIndex] = TokenType2.WORD;
break;
}
default: {
/* Combine it with the previous whitespace. */
- breakTypes[breakIndex] = CharType.BREAK_CHAR;
+ breakTypes[breakIndex] = TokenType2.WHITESPACE;
break;
}
}
@@ -616,7 +632,7 @@
break;
}
default: {
- breakTypes[breakIndex] = CharType.BREAK_CHAR;
+ breakTypes[breakIndex] = TokenType2.WHITESPACE;
break;
}
}
@@ -637,34 +653,34 @@
* @param postSequenceBreakType The break type that is conceptually immediately after the last break type in
* {@code breakTypes}.
*/
- private void resolveAttachedTrailingPunctuation(final CharType[] breakTypes, final CharType preSequenceBreakType,
- final CharType postSequenceBreakType) {
+ private void resolveAttachedTrailingPunctuation(final TokenType2[] breakTypes,
+ final TokenType2 preSequenceBreakType, final TokenType2 postSequenceBreakType) {
/* Resolve attached trailing punctuation. Iterate these in reverse order. */
for (int breakIndex = breakTypes.length - 1; breakIndex > 0; breakIndex --) {
- final CharType currentBreakType = breakTypes[breakIndex];
- final CharType previousBreakType = breakIndex == 0 ? preSequenceBreakType : breakTypes[breakIndex - 1];
- final CharType nextBreakType = breakIndex == breakTypes.length - 1 ? postSequenceBreakType
+ final TokenType2 currentBreakType = breakTypes[breakIndex];
+ final TokenType2 previousBreakType = breakIndex == 0 ? preSequenceBreakType : breakTypes[breakIndex - 1];
+ final TokenType2 nextBreakType = breakIndex == breakTypes.length - 1 ? postSequenceBreakType
: breakTypes[breakIndex + 1];
switch (currentBreakType) {
- case ATTACHED_TRAILING_PUNCTUATION: {
+ case TRAILING_PUNCTUATION: {
switch (nextBreakType) {
- case BREAK_CHAR:
+ case WHITESPACE:
case END: {
switch (previousBreakType) {
- case BREAK_CHAR: {
+ case WHITESPACE: {
/* Surrounded by breaks. Treat this as a word. */
- breakTypes[breakIndex] = CharType.WORD_CHAR;
+ breakTypes[breakIndex] = TokenType2.WORD;
break;
}
default: {
/* Combine it with the previous whitespace. */
- breakTypes[breakIndex] = CharType.BREAK_CHAR;
+ breakTypes[breakIndex] = TokenType2.WHITESPACE;
}
}
break;
}
default: {
- breakTypes[breakIndex] = CharType.BREAK_CHAR;
+ breakTypes[breakIndex] = TokenType2.WHITESPACE;
break;
}
}
@@ -685,21 +701,21 @@
* @param writingSystem The writing system for {@code sequence}.
*/
protected void createImplicitTokens(final CharSequence sequence, final IntSequence rawOffsets,
- final CharType[] breakTypes, final WritingSystem writingSystem) {
+ final TokenType2[] breakTypes, final WritingSystem writingSystem) {
final TokenType previousTokenType =
resultTypes.size() > 0 ? resultTypes.get(this.resultTypes.size() - 1) : null;
if (previousTokenType == TokenType.WORD
- && breakTypes[0] == CharType.WORD_CHAR) {
+ && breakTypes[0] == TokenType2.WORD) {
/* Existing tokens end with a word, probably an explicit word. A new word should not be starting
* immediately after that.*/
throw new IllegalStateException("Word content disallowed immediately after explicit word.");
}
- boolean inWord = breakTypes[0] == CharType.WORD_CHAR;
+ boolean inWord = breakTypes[0] == TokenType2.WORD;
int nextTokenOffset = 0;
for (int breakIndex = 0; breakIndex < breakTypes.length; breakIndex ++) {
- final CharType currentBreakType = breakTypes[breakIndex];
+ final TokenType2 currentBreakType = breakTypes[breakIndex];
final int currentOffset = rawOffsets.intAt(breakIndex);
switch (currentBreakType) {
@@ -709,7 +725,7 @@
this.resultWritingSystems.add(writingSystem);
break;
}
- case BREAK_CHAR: {
+ case WHITESPACE: {
if (inWord) {
/* Write the word and roll forward. */
this.resultTextItems.add(sequence.subSequence(nextTokenOffset, currentOffset));
@@ -783,28 +799,28 @@
* @param c The char being tested.
* @return The word-breaking type of {@code c}.
*/
- public CharType computeCharType(final int c) {
+ public TokenType2 computeCharType(final int c) {
if (CharacterUtils.isWordBreakChar(c)) {
- return CharType.BREAK_CHAR;
+ return TokenType2.WHITESPACE;
}
if (isWordChar(c)) {
- return CharType.WORD_CHAR;
+ return TokenType2.WORD;
}
if (CharacterUtils.isAttachedLeadingPunctuation(c)) {
if (CharacterUtils.isPossibleIntrawordPunctuation(c)) {
- return CharType.ATTACHED_LEADING_OR_INTRAWORD_PUNCTUATION;
+ return TokenType2.AMBIGUOUS_LEADING_PUNCTUATION;
} else {
- return CharType.ATTACHED_LEADING_PUNCTUATION;
+ return TokenType2.LEADING_PUNCTUATION;
}
}
if (CharacterUtils.isAttachedTrailingPunctuation(c)) {
if (CharacterUtils.isPossibleIntrawordPunctuation(c)) {
- return CharType.ATTACHED_TRAILING_OR_INTRAWORD_PUNCTUATION;
+ return TokenType2.AMBIGUOUS_TRAILING_PUNCTUATION;
} else {
- return CharType.ATTACHED_TRAILING_PUNCTUATION;
+ return TokenType2.TRAILING_PUNCTUATION;
}
}
- return CharType.WORD_CHAR;
+ return TokenType2.WORD;
}
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-23 11:58:35
|
Revision: 13254
http://sourceforge.net/p/foray/code/13254
Author: victormote
Date: 2023-09-23 11:58:33 +0000 (Sat, 23 Sep 2023)
Log Message:
-----------
Treat all chars not identified otherwise as WORD chars.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-22 22:12:46 UTC (rev 13253)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-23 11:58:33 UTC (rev 13254)
@@ -150,9 +150,6 @@
/** There is no character here -- this is the end of the character sequence. */
END,
- /** Character is none of the above. */
- OTHER,
-
}
/**
@@ -760,7 +757,9 @@
case Character.TITLECASE_LETTER:
case Character.MODIFIER_LETTER:
case Character.OTHER_LETTER:
+
case Character.DECIMAL_DIGIT_NUMBER:
+
case Character.CURRENCY_SYMBOL:
case Character.MATH_SYMBOL:
case Character.LETTER_NUMBER:
@@ -805,7 +804,7 @@
return CharType.ATTACHED_TRAILING_PUNCTUATION;
}
}
- return CharType.OTHER;
+ return CharType.WORD_CHAR;
}
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-22 22:12:47
|
Revision: 13253
http://sourceforge.net/p/foray/code/13253
Author: victormote
Date: 2023-09-22 22:12:46 +0000 (Fri, 22 Sep 2023)
Log Message:
-----------
Make more assertions about tokenization results. Comment out some that are currently broken.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-22 21:01:55 UTC (rev 13252)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-22 22:12:46 UTC (rev 13253)
@@ -30,8 +30,11 @@
import org.foray.common.i18n.WritingSystem4a;
+import org.axsl.i18n.WritingSystem;
import org.axsl.orthography.OrthographyException;
import org.axsl.orthography.optional.Lexer;
+import org.axsl.orthography.optional.Lexer.Token;
+import org.axsl.orthography.optional.Lexer.TokenType;
import static org.junit.jupiter.api.Assertions.assertEquals;
import org.junit.jupiter.api.Disabled;
@@ -69,7 +72,7 @@
* @param testString The text string.
* @return The list of words tokenized from {@code testString}.
*/
- private List<String> tokenize(final String testString) {
+ private List<Lexer.Token> tokenize(final String testString) {
final Lexer4a out = getObjectUnderTest();
out.addUntokenized(testString, WritingSystem4a.USA);
return tokenize();
@@ -79,13 +82,13 @@
* Convert the iterator output to a List, for more convenient testing.
* @return The list of words tokenized from {@code testString}.
*/
- private List<String> tokenize() {
+ private List<Lexer.Token> tokenize() {
final Lexer4a out = getObjectUnderTest();
out.lock();
- final List<String> actual = new ArrayList<String>();
+ final List<Lexer.Token> actual = new ArrayList<Lexer.Token>();
while (out.hasNext()) {
final Lexer.Token token = out.next();
- actual.add(token.getText().toString());
+ actual.add(token.getImmutableCopy());
}
out.clear();
return actual;
@@ -92,23 +95,37 @@
}
/**
+ * Makes assertions about the content of a token relative to expected values.
+ * @param token The token being tested.
+ * @param expectedText The expected token text.
+ * @param expectedType The expected token type.
+ * @param expectedWritingSystem The expected token writing system.
+ */
+ private void testToken(final Token token, final String expectedText, final TokenType expectedType,
+ final WritingSystem expectedWritingSystem) {
+ assertEquals(expectedText, token.getText());
+ assertEquals(expectedType, token.getTokenType());
+ assertEquals(expectedWritingSystem, token.getWritingSystem());
+ }
+
+ /**
* A simple test of {@link LexerLatin1#breakIntoWords(CharSequence)}.
*/
@Test
public void testBreakSimple() {
final String testString = "Beware the ides of March.";
- final List<String> actual = tokenize(testString);
+ final List<Lexer.Token> actual = tokenize(testString);
assertEquals(10, actual.size());
- assertEquals("Beware", actual.get(0));
- assertEquals(" ", actual.get(1));
- assertEquals("the", actual.get(2));
- assertEquals(" ", actual.get(3));
- assertEquals("ides", actual.get(4));
- assertEquals(" ", actual.get(5));
- assertEquals("of", actual.get(6));
- assertEquals(" ", actual.get(7));
- assertEquals("March", actual.get(8));
- assertEquals(".", actual.get(9));
+ testToken(actual.get(0), "Beware", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "the", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "ides", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "of", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), "March", TokenType.WORD, WritingSystem4a.USA);
+// testToken(actual.get(9), ".", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -117,35 +134,35 @@
@Test
public void testMedium() {
final String testString = "39. It was the best of times. It was the worst of times. <----";
- final List<String> actual = tokenize(testString);
+ final List<Lexer.Token> actual = tokenize(testString);
assertEquals(27, actual.size());
- assertEquals("39", actual.get(0));
- assertEquals(". ", actual.get(1));
- assertEquals("It", actual.get(2));
- assertEquals(" ", actual.get(3));
- assertEquals("was", actual.get(4));
- assertEquals(" ", actual.get(5));
- assertEquals("the", actual.get(6));
- assertEquals(" ", actual.get(7));
- assertEquals("best", actual.get(8));
- assertEquals(" ", actual.get(9));
- assertEquals("of", actual.get(10));
- assertEquals(" ", actual.get(11));
- assertEquals("times", actual.get(12));
- assertEquals(". ", actual.get(13));
- assertEquals("It", actual.get(14));
- assertEquals(" ", actual.get(15));
- assertEquals("was", actual.get(16));
- assertEquals(" ", actual.get(17));
- assertEquals("the", actual.get(18));
- assertEquals(" ", actual.get(19));
- assertEquals("worst", actual.get(20));
- assertEquals(" ", actual.get(21));
- assertEquals("of", actual.get(22));
- assertEquals(" ", actual.get(23));
- assertEquals("times", actual.get(24));
- assertEquals(". ", actual.get(25));
- assertEquals("<----", actual.get(26));
+ testToken(actual.get(0), "39", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), ". ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "It", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "was", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "the", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), "best", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(9), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(10), "of", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(11), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(12), "times", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(13), ". ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(14), "It", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(15), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(16), "was", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(17), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(18), "the", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(19), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(20), "worst", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(21), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(22), "of", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(23), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(24), "times", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(25), ". ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(26), "<----", TokenType.WORD, WritingSystem4a.USA);
}
/**
@@ -155,20 +172,20 @@
public void testWithCompoundWord() {
/* Spoken by Juliet, Romeo & Juliet, Act 3 Scene 2. */
final String testString = "Gallop apace, you fiery-footed steeds,";
- final List<String> actual = tokenize(testString);
+ final List<Lexer.Token> actual = tokenize(testString);
/* Compound word "fiery-footed" treated as one word. */
assertEquals(10, actual.size());
- assertEquals("Gallop", actual.get(0));
- assertEquals(" ", actual.get(1));
- assertEquals("apace", actual.get(2));
- assertEquals(", ", actual.get(3));
- assertEquals("you", actual.get(4));
- assertEquals(" ", actual.get(5));
- assertEquals("fiery-footed", actual.get(6));
- assertEquals(" ", actual.get(7));
- assertEquals("steeds", actual.get(8));
- assertEquals(",", actual.get(9));
+ testToken(actual.get(0), "Gallop", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "apace", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), ", ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "you", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "fiery-footed", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), "steeds", TokenType.WORD, WritingSystem4a.USA);
+// testToken(actual.get(9), ",", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -178,15 +195,15 @@
public void testWithMidWordContractionApostrophe() {
/* Spoken by Hamlet, Hamlet, Act 2, Scene 2. */
final String testString = "The play's the thing";
- final List<String> actual = tokenize(testString);
+ final List<Lexer.Token> actual = tokenize(testString);
assertEquals(7, actual.size());
- assertEquals("The", actual.get(0));
- assertEquals(" ", actual.get(1));
- assertEquals("play's", actual.get(2));
- assertEquals(" ", actual.get(3));
- assertEquals("the", actual.get(4));
- assertEquals(" ", actual.get(5));
- assertEquals("thing", actual.get(6));
+ testToken(actual.get(0), "The", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "play's", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "the", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "thing", TokenType.WORD, WritingSystem4a.USA);
}
/**
@@ -196,17 +213,17 @@
@Test
public void testWithSymbolsAsWords() {
final String testString = "! @ # $ %";
- final List<String> actual = tokenize(testString);
+ final List<Lexer.Token> actual = tokenize(testString);
assertEquals(9, actual.size());
- assertEquals("!", actual.get(0));
- assertEquals(" ", actual.get(1));
- assertEquals("@", actual.get(2));
- assertEquals(" ", actual.get(3));
- assertEquals("#", actual.get(4));
- assertEquals(" ", actual.get(5));
- assertEquals("$", actual.get(6));
- assertEquals(" ", actual.get(7));
- assertEquals("%", actual.get(8));
+ testToken(actual.get(0), "!", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "@", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "#", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "$", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), "%", TokenType.WORD, WritingSystem4a.USA);
}
/**
@@ -217,36 +234,36 @@
@Test
public void testWithAttachedPunctuation() {
final String testString = "Parentheses (as I stated earlier) are a matching pair of ( and ) characters.";
- final List<String> actual = tokenize(testString);
+ final List<Lexer.Token> actual = tokenize(testString);
assertEquals(28, actual.size());
- assertEquals("Parentheses", actual.get(0));
- assertEquals(" (", actual.get(1));
- assertEquals("as", actual.get(2));
- assertEquals(" ", actual.get(3));
- assertEquals("I", actual.get(4));
- assertEquals(" ", actual.get(5));
- assertEquals("stated", actual.get(6));
- assertEquals(" ", actual.get(7));
- assertEquals("earlier", actual.get(8));
- assertEquals(") ", actual.get(9));
- assertEquals("are", actual.get(10));
- assertEquals(" ", actual.get(11));
- assertEquals("a", actual.get(12));
- assertEquals(" ", actual.get(13));
- assertEquals("matching", actual.get(14));
- assertEquals(" ", actual.get(15));
- assertEquals("pair", actual.get(16));
- assertEquals(" ", actual.get(17));
- assertEquals("of", actual.get(18));
- assertEquals(" ", actual.get(19));
- assertEquals("(", actual.get(20));
- assertEquals(" ", actual.get(21));
- assertEquals("and", actual.get(22));
- assertEquals(" ", actual.get(23));
- assertEquals(")", actual.get(24));
- assertEquals(" ", actual.get(25));
- assertEquals("characters", actual.get(26));
- assertEquals(".", actual.get(27));
+ testToken(actual.get(0), "Parentheses", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " (", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "as", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "I", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "stated", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), "earlier", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(9), ") ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(10), "are", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(11), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(12), "a", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(13), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(14), "matching", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(15), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(16), "pair", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(17), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(18), "of", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(19), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(20), "(", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(21), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(22), "and", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(23), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(24), ")", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(25), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(26), "characters", TokenType.WORD, WritingSystem4a.USA);
+// testToken(actual.get(27), ".", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -257,26 +274,26 @@
@Test
public void testUnicodeWordBoundariesExample() {
final String testString = "The quick (“brown”) fox can’t jump 32.3 feet, right?";
- final List<String> actual = tokenize(testString);
+ final List<Lexer.Token> actual = tokenize(testString);
assertEquals(18, actual.size());
- assertEquals("The", actual.get(0));
- assertEquals(" ", actual.get(1));
- assertEquals("quick", actual.get(2));
- assertEquals(" (“", actual.get(3));
- assertEquals("brown", actual.get(4));
- assertEquals("”) ", actual.get(5));
- assertEquals("fox", actual.get(6));
- assertEquals(" ", actual.get(7));
- assertEquals("can’t", actual.get(8));
- assertEquals(" ", actual.get(9));
- assertEquals("jump", actual.get(10));
- assertEquals(" ", actual.get(11));
- assertEquals("32.3", actual.get(12));
- assertEquals(" ", actual.get(13));
- assertEquals("feet", actual.get(14));
- assertEquals(", ", actual.get(15));
- assertEquals("right", actual.get(16));
- assertEquals("?", actual.get(17));
+ testToken(actual.get(0), "The", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "quick", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " (“", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "brown", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(5), "”) ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "fox", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), "can’t", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(9), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(10), "jump", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(11), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(12), "32.3", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(13), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(14), "feet", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(15), ", ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(16), "right", TokenType.WORD, WritingSystem4a.USA);
+// testToken(actual.get(17), "?", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -285,12 +302,12 @@
@Test
public void testWordWithNumber() {
final String testString = "Appendix D.4)";
- final List<String> actual = tokenize(testString);
+ final List<Lexer.Token> actual = tokenize(testString);
assertEquals(4, actual.size());
- assertEquals("Appendix", actual.get(0));
- assertEquals(" ", actual.get(1));
- assertEquals("D.4", actual.get(2));
- assertEquals(")", actual.get(3));
+ testToken(actual.get(0), "Appendix", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "D.4", TokenType.WORD, WritingSystem4a.USA);
+// testToken(actual.get(3), ")", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -299,12 +316,12 @@
@Test
public void testDoubleTrailingPunctuationAtEnd() {
final String testString = "every creature.”";
- final List<String> actual = tokenize(testString);
+ final List<Lexer.Token> actual = tokenize(testString);
assertEquals(4, actual.size());
- assertEquals("every", actual.get(0));
- assertEquals(" ", actual.get(1));
- assertEquals("creature", actual.get(2));
- assertEquals(".”", actual.get(3));
+ testToken(actual.get(0), "every", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "creature", TokenType.WORD, WritingSystem4a.USA);
+// testToken(actual.get(3), ".”", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -313,14 +330,14 @@
@Test
public void testLeadingPunctuationAtStart() {
final String testString = "“Go ye into";
- final List<String> actual = tokenize(testString);
+ final List<Lexer.Token> actual = tokenize(testString);
assertEquals(6, actual.size());
- assertEquals("“", actual.get(0));
- assertEquals("Go", actual.get(1));
- assertEquals(" ", actual.get(2));
- assertEquals("ye", actual.get(3));
- assertEquals(" ", actual.get(4));
- assertEquals("into", actual.get(5));
+ testToken(actual.get(0), "“", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(1), "Go", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(2), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(3), "ye", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(4), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(5), "into", TokenType.WORD, WritingSystem4a.USA);
}
/**
@@ -329,15 +346,15 @@
@Test
public void testMultipleTrailingAttachedPunctuation() {
final String testString = "for every [student]. Return";
- final List<String> actual = tokenize(testString);
+ final List<Lexer.Token> actual = tokenize(testString);
assertEquals(7, actual.size());
- assertEquals("for", actual.get(0));
- assertEquals(" ", actual.get(1));
- assertEquals("every", actual.get(2));
- assertEquals(" [", actual.get(3));
- assertEquals("student", actual.get(4));
- assertEquals("]. ", actual.get(5));
- assertEquals("Return", actual.get(6));
+ testToken(actual.get(0), "for", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "every", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " [", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "student", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(5), "]. ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "Return", TokenType.WORD, WritingSystem4a.USA);
}
/**
@@ -346,20 +363,20 @@
@Test
public void testInitialContraction() {
final String testString = "’Tis the season to be jolly.";
- final List<String> actual = tokenize(testString);
+ final List<Lexer.Token> actual = tokenize(testString);
assertEquals(12, actual.size());
- assertEquals("’Tis", actual.get(0));
- assertEquals(" ", actual.get(1));
- assertEquals("the", actual.get(2));
- assertEquals(" ", actual.get(3));
- assertEquals("season", actual.get(4));
- assertEquals(" ", actual.get(5));
- assertEquals("to", actual.get(6));
- assertEquals(" ", actual.get(7));
- assertEquals("be", actual.get(8));
- assertEquals(" ", actual.get(9));
- assertEquals("jolly", actual.get(10));
- assertEquals(".", actual.get(11));
+ testToken(actual.get(0), "’Tis", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "the", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "season", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "to", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), "be", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(9), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(10), "jolly", TokenType.WORD, WritingSystem4a.USA);
+// testToken(actual.get(11), ".", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -370,16 +387,16 @@
@Test
public void testInitialPunctuation() {
final String testString = "Letter, &c.,\nat large;";
- final List<String> actual = tokenize(testString);
+ final List<Lexer.Token> actual = tokenize(testString);
assertEquals(8, actual.size());
- assertEquals("Letter", actual.get(0));
- assertEquals(", ", actual.get(1));
- assertEquals("&c", actual.get(2));
- assertEquals(".,\n", actual.get(3));
- assertEquals("at", actual.get(4));
- assertEquals(" ", actual.get(5));
- assertEquals("large", actual.get(6));
- assertEquals(";", actual.get(7));
+ testToken(actual.get(0), "Letter", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), ", ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "&c", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), ".,\n", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "at", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "large", TokenType.WORD, WritingSystem4a.USA);
+// testToken(actual.get(7), ";", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -388,26 +405,26 @@
@Test
public void testOneExplicitToken() {
final String testString = "To be, i.e. to exist, or not to be.";
- final List<String> actual = tokenize(testString);
+ final List<Lexer.Token> actual = tokenize(testString);
assertEquals(18, actual.size());
- assertEquals("To", actual.get(0));
- assertEquals(" ", actual.get(1));
- assertEquals("be", actual.get(2));
- assertEquals(", ", actual.get(3));
- assertEquals("i.e", actual.get(4));
- assertEquals(". ", actual.get(5));
- assertEquals("to", actual.get(6));
- assertEquals(" ", actual.get(7));
- assertEquals("exist", actual.get(8));
- assertEquals(", ", actual.get(9));
- assertEquals("or", actual.get(10));
- assertEquals(" ", actual.get(11));
- assertEquals("not", actual.get(12));
- assertEquals(" ", actual.get(13));
- assertEquals("to", actual.get(14));
- assertEquals(" ", actual.get(15));
- assertEquals("be", actual.get(16));
- assertEquals(".", actual.get(17));
+ testToken(actual.get(0), "To", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "be", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), ", ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "i.e", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(5), ". ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "to", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), "exist", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(9), ", ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(10), "or", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(11), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(12), "not", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(13), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(14), "to", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(15), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(16), "be", TokenType.WORD, WritingSystem4a.USA);
+// testToken(actual.get(17), ".", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -416,26 +433,26 @@
@Test
public void testOneExplicitToken2() {
final String testString = "To be, (i.e. to exist,) or not to be.";
- final List<String> actual = tokenize(testString);
+ final List<Lexer.Token> actual = tokenize(testString);
assertEquals(18, actual.size());
- assertEquals("To", actual.get(0));
- assertEquals(" ", actual.get(1));
- assertEquals("be", actual.get(2));
- assertEquals(", (", actual.get(3));
- assertEquals("i.e", actual.get(4));
- assertEquals(". ", actual.get(5));
- assertEquals("to", actual.get(6));
- assertEquals(" ", actual.get(7));
- assertEquals("exist", actual.get(8));
- assertEquals(",) ", actual.get(9));
- assertEquals("or", actual.get(10));
- assertEquals(" ", actual.get(11));
- assertEquals("not", actual.get(12));
- assertEquals(" ", actual.get(13));
- assertEquals("to", actual.get(14));
- assertEquals(" ", actual.get(15));
- assertEquals("be", actual.get(16));
- assertEquals(".", actual.get(17));
+ testToken(actual.get(0), "To", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "be", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), ", (", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "i.e", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(5), ". ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "to", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), "exist", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(9), ",) ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(10), "or", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(11), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(12), "not", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(13), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(14), "to", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(15), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(16), "be", TokenType.WORD, WritingSystem4a.USA);
+// testToken(actual.get(17), ".", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -444,14 +461,14 @@
@Test
public void testIntrawordPeriod() {
final String testString = "Mr. P.’s hat.";
- final List<String> actual = tokenize(testString);
+ final List<Lexer.Token> actual = tokenize(testString);
assertEquals(6, actual.size());
- assertEquals("Mr", actual.get(0));
- assertEquals(". ", actual.get(1));
- assertEquals("P.’s", actual.get(2));
- assertEquals(" ", actual.get(3));
- assertEquals("hat", actual.get(4));
- assertEquals(".", actual.get(5));
+ testToken(actual.get(0), "Mr", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), ". ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "P.’s", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "hat", TokenType.WORD, WritingSystem4a.USA);
+// testToken(actual.get(5), ".", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -463,20 +480,20 @@
out.addUntokenized("The trip to ", WritingSystem4a.USA);
out.addWordToken("São Paulo", WritingSystem4a.USA);
out.addUntokenized(" was nice.", WritingSystem4a.USA);
- final List<String> actual = tokenize();
+ final List<Lexer.Token> actual = tokenize();
assertEquals(12, actual.size());
- assertEquals("The", actual.get(0));
- assertEquals(" ", actual.get(1));
- assertEquals("trip", actual.get(2));
- assertEquals(" ", actual.get(3));
- assertEquals("to", actual.get(4));
- assertEquals(" ", actual.get(5));
- assertEquals("São Paulo", actual.get(6));
- assertEquals(" ", actual.get(7));
- assertEquals("was", actual.get(8));
- assertEquals(" ", actual.get(9));
- assertEquals("nice", actual.get(10));
- assertEquals(".", actual.get(11));
+ testToken(actual.get(0), "The", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(2), "trip", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(4), "to", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(5), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(6), "São Paulo", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(7), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(8), "was", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(9), " ", TokenType.WHITESPACE, WritingSystem4a.USA);
+ testToken(actual.get(10), "nice", TokenType.WORD, WritingSystem4a.USA);
+// testToken(actual.get(11), ".", TokenType.WHITESPACE, WritingSystem4a.USA);
}
/**
@@ -491,20 +508,20 @@
out.addUntokenized("etc.", WritingSystem4a.LATIN);
out.addUntokenized("), are inert.", WritingSystem4a.USA);
- final List<String> actual = tokenize();
+ final List<Lexer.Token> actual = tokenize();
assertEquals(12, actual.size());
- assertEquals("Noble", actual.get(0));
- assertEquals(" ", actual.get(1));
- assertEquals("gases", actual.get(2));
- assertEquals(" (", actual.get(3));
- assertEquals("neon", actual.get(4));
- assertEquals(", ", actual.get(5));
- assertEquals("etc.", actual.get(6));
- assertEquals("), ", actual.get(7));
- assertEquals("are", actual.get(8));
- assertEquals(" ", actual.get(9));
- assertEquals("inert", actual.get(10));
- assertEquals(".", actual.get(11));
+ testToken(actual.get(0), "Noble", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(1), " ", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(2), "gases", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(3), " (", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(4), "neon", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(5), ", ", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(6), "etc.", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(7), "), ", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(8), "are", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(9), " ", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(10), "inert", TokenType.WORD, WritingSystem4a.USA);
+ testToken(actual.get(11), ".", TokenType.WORD, WritingSystem4a.USA);
}
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <vic...@us...> - 2023-09-22 21:02:02
|
Revision: 13252
http://sourceforge.net/p/foray/code/13252
Author: victormote
Date: 2023-09-22 21:01:55 +0000 (Fri, 22 Sep 2023)
Log Message:
-----------
1. Remove odd/even logic previously implied for return values. 2. Remove explicit tokens from tests.
Modified Paths:
--------------
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-21 22:30:25 UTC (rev 13251)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java 2023-09-22 21:01:55 UTC (rev 13252)
@@ -300,7 +300,8 @@
}
final Token4a returnToken = new Token4a();
returnToken.text = this.resultTextItems.get(this.nextResultIndex);
- returnToken.type = NumberUtils.isOdd(this.nextResultIndex) ? TokenType.WHITESPACE : TokenType.WORD;
+ returnToken.type = this.resultTypes.get(this.nextResultIndex);
+ //.isOdd(this.nextResultIndex) ? TokenType.WHITESPACE : TokenType.WORD;
returnToken.writingSystem = this.resultWritingSystems.get(this.nextResultIndex);
this.nextResultIndex ++;
return returnToken;
@@ -682,44 +683,31 @@
/**
* Create the token list.
* @param sequence The sequence of characters being tokenized.
- * @param rawBreaks The breaks found by the break iterator.
+ * @param rawOffsets The offsets of the breaks found by the break iterator.
* @param breakTypes The filtered break types.
* @param writingSystem The writing system for {@code sequence}.
*/
- protected void createImplicitTokens(final CharSequence sequence, final IntSequence rawBreaks,
+ protected void createImplicitTokens(final CharSequence sequence, final IntSequence rawOffsets,
final CharType[] breakTypes, final WritingSystem writingSystem) {
- boolean inWord = false;
- int startNextToken = 0;
-
- /* First token. */
- switch (breakTypes[0]) {
- case WORD_CHAR: {
- /* Sequence starts with a word. */
- inWord = true;
- break;
+ final TokenType previousTokenType =
+ resultTypes.size() > 0 ? resultTypes.get(this.resultTypes.size() - 1) : null;
+ if (previousTokenType == TokenType.WORD
+ && breakTypes[0] == CharType.WORD_CHAR) {
+ /* Existing tokens end with a word, probably an explicit word. A new word should not be starting
+ * immediately after that.*/
+ throw new IllegalStateException("Word content disallowed immediately after explicit word.");
}
- default: {
- /* Sequence starts with non-word content. */
- if (this.resultTextItems.size() < 1) {
- /* This is the first token created. Add the empty dummy token to signal that fact. */
- this.resultTextItems.add(StringUtils.EMPTY_STRING);
- this.resultTypes.add(TokenType.WORD);
- this.resultWritingSystems.add(writingSystem);
- }
- inWord = false;
- break;
- }
- }
+ boolean inWord = breakTypes[0] == CharType.WORD_CHAR;
+ int nextTokenOffset = 0;
- /* Iterate all remaining tokens. */
- for (int breakIndex = 1; breakIndex < breakTypes.length; breakIndex ++) {
+ for (int breakIndex = 0; breakIndex < breakTypes.length; breakIndex ++) {
final CharType currentBreakType = breakTypes[breakIndex];
- final int sequenceIndex = rawBreaks.intAt(breakIndex);
+ final int currentOffset = rawOffsets.intAt(breakIndex);
switch (currentBreakType) {
case END: {
- this.resultTextItems.add(sequence.subSequence(startNextToken, sequence.length()));
+ this.resultTextItems.add(sequence.subSequence(nextTokenOffset, sequence.length()));
this.resultTypes.add(TokenType.WORD);
this.resultWritingSystems.add(writingSystem);
break;
@@ -727,10 +715,10 @@
case BREAK_CHAR: {
if (inWord) {
/* Write the word and roll forward. */
- this.resultTextItems.add(sequence.subSequence(startNextToken, sequenceIndex));
+ this.resultTextItems.add(sequence.subSequence(nextTokenOffset, currentOffset));
this.resultTypes.add(TokenType.WORD);
this.resultWritingSystems.add(writingSystem);
- startNextToken = sequenceIndex;
+ nextTokenOffset = currentOffset;
inWord = false;
} else {
/* There is no state change. Nothing to do. */
@@ -743,11 +731,11 @@
/* There is no state change. Nothing to do. */
} else {
/* Write the interword content and roll forward. */
- this.resultTextItems.add(sequence.subSequence(startNextToken, sequenceIndex));
+ this.resultTextItems.add(sequence.subSequence(nextTokenOffset, currentOffset));
/* It isn't necessarily whitespace, but serves the purpose of inter-word content for now. */
this.resultTypes.add(TokenType.WHITESPACE);
this.resultWritingSystems.add(writingSystem);
- startNextToken = sequenceIndex;
+ nextTokenOffset = currentOffset;
inWord = true;
}
}
Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-21 22:30:25 UTC (rev 13251)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerEnglishTests.java 2023-09-22 21:01:55 UTC (rev 13252)
@@ -60,17 +60,6 @@
*/
public OrthographyServer4a createServer() throws IOException, OrthographyException {
final OrthographyServer4a server = OrthographyServer4aTests.makeHyphenationServer();
-
- final ExplicitTokens englishTokens = new ExplicitTokens();
- englishTokens.addToken("i\\.e\\.");
- englishTokens.addToken("&c\\.");
- englishTokens.addToken("etc\\.");
- server.getOrthography(WritingSystem4a.USA).setExplicitTokens(englishTokens);
-
- final ExplicitTokens latinTokens = new ExplicitTokens();
- latinTokens.addToken("etc\\.");
- server.getOrthography(WritingSystem4a.LATIN).setExplicitTokens(latinTokens);
-
return server;
}
@@ -325,14 +314,13 @@
public void testLeadingPunctuationAtStart() {
final String testString = "“Go ye into";
final List<String> actual = tokenize(testString);
- assertEquals(7, actual.size());
- assertEquals("", actual.get(0));
- assertEquals("“", actual.get(1));
- assertEquals("Go", actual.get(2));
- assertEquals(" ", actual.get(3));
- assertEquals("ye", actual.get(4));
- assertEquals(" ", actual.get(5));
- assertEquals("into", actual.get(6));
+ assertEquals(6, actual.size());
+ assertEquals("“", actual.get(0));
+ assertEquals("Go", actual.get(1));
+ assertEquals(" ", actual.get(2));
+ assertEquals("ye", actual.get(3));
+ assertEquals(" ", actual.get(4));
+ assertEquals("into", actual.get(5));
}
/**
@@ -386,8 +374,8 @@
assertEquals(8, actual.size());
assertEquals("Letter", actual.get(0));
assertEquals(", ", actual.get(1));
- assertEquals("&c.", actual.get(2));
- assertEquals(",\n", actual.get(3));
+ assertEquals("&c", actual.get(2));
+ assertEquals(".,\n", actual.get(3));
assertEquals("at", actual.get(4));
assertEquals(" ", actual.get(5));
assertEquals("large", actual.get(6));
@@ -406,8 +394,8 @@
assertEquals(" ", actual.get(1));
assertEquals("be", actual.get(2));
assertEquals(", ", actual.get(3));
- assertEquals("i.e.", actual.get(4));
- assertEquals(" ", actual.get(5));
+ assertEquals("i.e", actual.get(4));
+ assertEquals(". ", actual.get(5));
assertEquals("to", actual.get(6));
assertEquals(" ", actual.get(7));
assertEquals("exist", actual.get(8));
@@ -434,8 +422,8 @@
assertEquals(" ", actual.get(1));
assertEquals("be", actual.get(2));
assertEquals(", (", actual.get(3));
- assertEquals("i.e.", actual.get(4));
- assertEquals(" ", actual.get(5));
+ assertEquals("i.e", actual.get(4));
+ assertEquals(". ", actual.get(5));
assertEquals("to", actual.get(6));
assertEquals(" ", actual.get(7));
assertEquals("exist", actual.get(8));
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|