[FOray-commit] SF.net SVN: foray:[12698] trunk/foray
Modular XSL-FO Implementation for Java.
Status: Alpha
Brought to you by:
victormote
|
From: <vic...@us...> - 2022-07-02 22:14:51
|
Revision: 12698
http://sourceforge.net/p/foray/code/12698
Author: victormote
Date: 2022-07-02 22:14:48 +0000 (Sat, 02 Jul 2022)
Log Message:
-----------
Handle apostrophes better.
Modified Paths:
--------------
trunk/foray/foray-common/src/main/java/org/foray/common/primitive/StringUtils.java
trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Word4a.java
trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/DictionaryParser.java
Modified: trunk/foray/foray-common/src/main/java/org/foray/common/primitive/StringUtils.java
===================================================================
--- trunk/foray/foray-common/src/main/java/org/foray/common/primitive/StringUtils.java 2022-07-02 18:31:42 UTC (rev 12697)
+++ trunk/foray/foray-common/src/main/java/org/foray/common/primitive/StringUtils.java 2022-07-02 22:14:48 UTC (rev 12698)
@@ -352,6 +352,44 @@
}
/**
+ * Removes all occurrences of a given Unicode codepoint from a {@link StringBuilder}.
+ * @param builder The StringBuilder in which the replacement should occur.
+ * @param codePoint The codepoint to be deleted.
+ */
+ public static void deleteAll(final StringBuilder builder, final int codePoint) {
+ final int charCount = Character.charCount(codePoint);
+ for (int index = 0; index < builder.length(); index ++) {
+ final int c = builder.codePointAt(index);
+ if (c == codePoint) {
+ builder.delete(index, index + charCount);
+ }
+ }
+ }
+
+ /**
+ * Converts all uppercase characters in a {@link StringBuilder} to lowercase.
+ * @param builder The StringBuilder in which the replacement should occur.
+ */
+ public static void toLowercase(final StringBuilder builder) {
+ for (int index = 0; index < builder.length(); index ++) {
+ final int c = builder.codePointAt(index);
+ final int lower = Character.toLowerCase(c);
+ if (c != lower) {
+ final int deleteSize = Character.charCount(c);
+ builder.delete(index, index + deleteSize);
+ final int addSize = Character.charCount(lower);
+ if (addSize == 1) {
+ builder.insert(index, (char) lower);
+ } else {
+ builder.insert(index, Character.highSurrogate(lower));
+ index ++;
+ builder.insert(index, Character.lowSurrogate(lower));
+ }
+ }
+ }
+ }
+
+ /**
* Computes the length of all or part of a null-terminated char array. The null character is
* 0x00.
* @param charArray The array which contains the null-terminated string.
Modified: trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml 2022-07-02 18:31:42 UTC (rev 12697)
+++ trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml 2022-07-02 22:14:48 UTC (rev 12698)
@@ -10263,7 +10263,7 @@
<w><t>as-se-ga-ing</t></w>
<w><t>as-sem-bl</t></w>
<w><t>as-sem-blage</t></w>
-<w><t>as-sem-ble</t></w>
+<w><t>as-sem-ble</t><verb><regular-root/></verb></w>
<w><t>as-sem-blé</t></w>
<w><t>as-sem-bled</t></w>
<w><t>as-sem-bler</t></w>
@@ -14325,7 +14325,7 @@
<w><t>bear-paw</t></w>
<w><t>bear-skin</t></w>
<w><t>bear-wood</t></w>
-<w><t>beast</t></w>
+<w><t>beast</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>beast-ie</t></w>
<w><t>beast-ings</t></w>
<w><t>beast-li-er</t></w>
@@ -30911,7 +30911,7 @@
<w><t>com-pat-i-ble</t></w>
<w><t>com-pat-i-ble-ness</t></w>
<w><t>com-pat-i-bly</t></w>
-<w><t>com-pat-ri-ot</t></w>
+<w><t>com-pat-ri-ot</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>com-pa-tri-ot</t></w>
<w><t>com-pa-tri-ot-ic</t></w>
<w><t>com-pa-tri-ot-ism</t></w>
@@ -35028,7 +35028,7 @@
<w><t>cre-a-tive-ly</t></w>
<w><t>cre-a-tive-ness</t></w>
<w><t>cre-a-tiv-i-ty</t></w>
-<w><t>cre-a-tor</t></w>
+<w><t>cre-a-tor</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>Cre-a-tor</t></w>
<w><t>cre-a-tor-ship</t></w>
<w><t>crea-tur-al</t></w>
@@ -44499,10 +44499,10 @@
<phrase><t>Don Quix-ote</t></phrase>
<w><t>don-sie</t></w>
<w><t>don-sy</t></w>
+<w><t>don’t</t></w>
<w><t>Do-nus</t></w>
<w><t>do-nut</t></w>
<w><t>don-zel</t></w>
-<w><t>don’t</t></w>
<w><t>doo-dad</t></w>
<w><t>doo-dah</t></w>
<w><t>doo-dle</t></w>
@@ -54743,7 +54743,7 @@
<w><t>fen</t></w>
<w><t>fe-na-gle</t></w>
<w><t>fe-na-gler</t></w>
-<w><t>fence</t></w>
+<w><t>fence</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>fence-like</t></w>
<w><t>fence-post</t></w>
<w><t>fenc-er</t></w>
@@ -86328,7 +86328,7 @@
<w><t>leant</t></w>
<w><t>lean=to</t></w>
<w><t>Lea-o</t></w>
-<w><t>leap</t></w>
+<w><t>leap</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>leap-er</t></w>
<w><t>leap-frog</t></w>
<w><t>leap-frogged</t></w>
@@ -108103,19 +108103,6 @@
<w><t>Nyx</t></w>
<w><t>NZBC</t></w>
<w><t>o/c</t></w>
-<w><t>O'Ca-sey</t></w>
-<w><t>o'clock</t></w>
-<w><t>O'Con-nell</t></w>
-<w><t>O'Con-nor</t></w>
-<w><t>O'Don-o-van</t></w>
-<w><t>o'er</t></w>
-<w><t>O'Fal-lon</t></w>
-<w><t>O'Fao-láin</t></w>
-<w><t>O'Fla-her-ty</t></w>
-<w><t>O'Har-a</t></w>
-<w><t>O'Hig-gins</t></w>
-<w><t>O'Kel-ley</t></w>
-<w><t>O'Neill</t></w>
<w><t>oaf</t></w>
<w><t>oaf-ish</t></w>
<w><t>oaf-ish-ly</t></w>
@@ -108472,6 +108459,7 @@
<w><t>O-cal-a</t></w>
<w><t>OCAM</t></w>
<w><t>oc-a-ri-na</t></w>
+<w><t>O’Ca-sey</t></w>
<w><t>Oc-cam</t></w>
<phrase><t>Oc-cam's ra-zor</t></phrase>
<w><t>Oc-cam-ism</t></w>
@@ -108605,7 +108593,10 @@
<w><t>ock-er</t></w>
<w><t>Ock-ham</t></w>
<phrase><t>Ock-ham's ra-zor</t></phrase>
+<w><t>o’clock</t></w>
<w><t>Oc-nus</t></w>
+<w><t>O’Con-nell</t></w>
+<w><t>O’Con-nor</t></w>
<w><t>O-con-o-mo-woc</t></w>
<w><t>O-con-to</t></w>
<w><t>o-co-til-lo</t></w>
@@ -108743,6 +108734,7 @@
<w><t>o-dom-e-ter</t></w>
<w><t>o-do-met-ri-cal</t></w>
<w><t>o-dom-e-try</t></w>
+<w><t>O’Don-o-van</t></w>
<w><t>od-on-tal-gi-a</t></w>
<w><t>o-don-tal-gia</t></w>
<w><t>o-don-tal-gic</t></w>
@@ -108820,6 +108812,7 @@
<w><t>Oe-nop-i-des</t></w>
<w><t>Oe-no-pi-on</t></w>
<w><t>Oe-o-nus</t></w>
+<w><t>o’er</t></w>
<w><t>oer-sted</t></w>
<w><t>Oer-sted</t></w>
<w><t>Oe-sel</t></w>
@@ -108836,6 +108829,8 @@
<w><t>oeu-vre</t></w>
<w><t>oeu-vres</t></w>
<w><t>of</t></w>
+<w><t>O’Fal-lon</t></w>
+<w><t>O’Fao-láin</t></w>
<w><t>o-fay</t></w>
<w><t>off</t></w>
<w><t>Of-fa</t></w>
@@ -108942,6 +108937,7 @@
<w><t>off-take</t></w>
<phrase><t>off the rec-ord</t></phrase>
<phrase><t>off=year e-lec-tion</t></phrase>
+<w><t>O’Fla-her-ty</t></w>
<w><t>OFM</t></w>
<w><t>OFS</t></w>
<w><t>oft</t></w>
@@ -108979,8 +108975,10 @@
<w><t>O-gyg-i-an</t></w>
<w><t>Og-y-gus</t></w>
<w><t>oh</t><interjection/></w>
+<w><t>O’Har-a</t></w>
<phrase><t>O Hen-ry</t></phrase>
<w><t>OHG</t></w>
+<w><t>O’Hig-gins</t></w>
<w><t>Ohi-o</t></w>
<w><t>O-hi-o</t></w>
<w><t>Oh-len-schl-ger</t></w>
@@ -109052,6 +109050,7 @@
<phrase><t>O-ke-fe-no-kee Swamp</t></phrase>
<w><t>Ok-e-ghem</t></w>
<w><t>O-ke-ghem</t></w>
+<w><t>O’Kel-ley</t></w>
<w><t>O-ke-mah</t></w>
<w><t>O-ken</t></w>
<w><t>o-key=doke</t></w>
@@ -109386,6 +109385,7 @@
<w><t>one=hand-ed</t></w>
<w><t>O-nei-da</t></w>
<w><t>one=i-de-aed</t></w>
+<w><t>O’Neill</t></w>
<w><t>o-nei-ric</t></w>
<w><t>o-nei-ro-crit-ic</t></w>
<w><t>o-nei-ro-crit-i-cal</t></w>
@@ -109404,6 +109404,7 @@
<w><t>on-er-ous</t></w>
<w><t>on-er-ous-ly</t></w>
<w><t>on-er-ous-ness</t></w>
+<w><t>one’s</t><adjective><possessive/></adjective></w>
<w><t>one-self</t></w>
<w><t>one=sid-ed</t></w>
<w><t>one=sid-ed-ly</t></w>
@@ -109413,7 +109414,6 @@
<w><t>one=up</t></w>
<w><t>one=up-man-ship</t></w>
<phrase><t>one=way tick-et</t></phrase>
-<w><t>one’s</t><adjective><possessive/></adjective></w>
<w><t>on-go-ing</t></w>
<w><t>on-i-cism</t></w>
<w><t>o-ni-o-ma-ni-a</t></w>
@@ -145702,7 +145702,7 @@
<w><t>ser-ow</t></w>
<w><t>Ser-pa-sil</t></w>
<w><t>Ser-pens</t></w>
-<w><t>ser-pent</t></w>
+<w><t>ser-pent</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>ser-pen-ti-form</t></w>
<w><t>ser-pen-tine</t></w>
<w><t>ser-pen-tin-i-za-tion</t></w>
@@ -156618,7 +156618,7 @@
<w><t>Sum-ter</t></w>
<w><t>Su-my</t></w>
<w><t>Sun</t></w>
-<w><t>sun</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
+<w><t>sun</t><noun><pluralizable/><convertible-to-possessive/></noun><verb><regular-root/></verb></w>
<w><t>Su-nay</t></w>
<w><t>sun-back</t></w>
<w><t>sun-bake</t></w>
@@ -165674,7 +165674,7 @@
<w><t>troph-o-zo-ite</t></w>
<w><t>tro-phy</t></w>
<w><t>tro-phy-less</t></w>
-<w><t>trop-ic</t></w>
+<w><t>trop-ic</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
<w><t>trop-i-cal</t></w>
<w><t>trop-i-cal-ih</t></w>
<w><t>trop-i-cal-ise</t></w>
@@ -186624,7 +186624,8 @@
<w><t>your</t></w>
<w><t>your'n</t></w>
<w><t>yours</t></w>
-<w><t>your-self</t></w>
+<w><t>your-self</t><noun><singular/></noun></w>
+<w><t>your-selves</t><noun><plural/></noun></w>
<phrase><t>yours tru-ly</t></phrase>
<w><t>yous</t></w>
<w><t>youse</t></w>
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Word4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Word4a.java 2022-07-02 18:31:42 UTC (rev 12697)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Word4a.java 2022-07-02 22:14:48 UTC (rev 12698)
@@ -30,6 +30,7 @@
import org.foray.common.primitive.CharSequenceUtils;
import org.foray.common.primitive.NumberUtils;
+import org.foray.common.primitive.StringUtils;
import org.axsl.fotree.text.FoDiscretionaryHyphen;
import org.axsl.fotree.text.FoWord;
@@ -78,9 +79,23 @@
final FoWordSegment segment = wordSegmentAt(index);
builder.append(segment.getText());
}
- return builder.toString();
+ return builder;
}
+ /**
+ * Returns the content of this word in the form used for collating: the actual content with every
+ * "’" character removed and the remainder converted to lowercase.
+ * @return The collating content for this word.
+ */
+ public CharSequence getCollatingContent() {
+     final CharSequence content = getActualContent();
+     final StringBuilder collating;
+     if (content instanceof StringBuilder) {
+         /* Work directly on the builder handed back by getActualContent() instead of copying it. */
+         collating = (StringBuilder) content;
+     } else {
+         collating = new StringBuilder(content);
+     }
+     StringUtils.deleteAll(collating, '’');
+     StringUtils.toLowercase(collating);
+     return collating;
+ }
+
@Override
public CharSequence getNormalizedContent() {
final StringBuilder builder = new StringBuilder();
Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/DictionaryParser.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/DictionaryParser.java 2022-07-02 18:31:42 UTC (rev 12697)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/util/DictionaryParser.java 2022-07-02 22:14:48 UTC (rev 12698)
@@ -370,7 +370,7 @@
case "w": {
final StringWord word = new StringWord(this.currentPartsOfSpeech, this.currentSegments);
final String actualContent = word.getActualContent().toString();
- checkCollation(actualContent);
+ checkCollation(actualContent, word.getCollatingContent().toString());
/* Is it an existing ambiguous word? */
if (this.ambiguousWordMap.containsKey(actualContent)) {
@@ -455,7 +455,7 @@
case "phrase": {
final StringWord word = new StringWord(this.currentPartsOfSpeech, this.currentSegments);
final String actualContent = word.getActualContent().toString();
- checkCollation(actualContent);
+ checkCollation(actualContent, word.getCollatingContent().toString());
break;
}
case "vf": break;
@@ -499,25 +499,25 @@
/**
* Checks the collation of the sequence in the dictionary input to see if items are out of order.
- * @param actualContent The current content being checked for collation.
+ * @param actualContent The display content of the word being tested.
+ * @param collatingContent The current content being checked for collation.
*/
- private void checkCollation(final String actualContent) {
+ private void checkCollation(final String actualContent, final String collatingContent) {
if (! this.logDictionaryProblems) {
return;
}
- final String actualContentLowercase = actualContent.toLowerCase();
if (this.collator == null) {
- final String normalized = Normalizer.normalize(actualContentLowercase, Normalizer.Form.NFKD);
+ final String normalized = Normalizer.normalize(collatingContent, Normalizer.Form.NFKD);
if (normalized.compareTo(this.lastWord) < 0) {
warningMessage("Out of sequence (Unicode compatibility decomposition): " + actualContent);
}
this.lastWord = normalized;
} else {
- if (this.collator.compare(actualContentLowercase, this.lastWord) < 0) {
+ if (this.collator.compare(collatingContent, this.lastWord) < 0) {
warningMessage("Out of sequence (Collator " +
this.currentDictionary.orthography.toLocale().toString() + "): " + actualContent);
}
- this.lastWord = actualContentLowercase;
+ this.lastWord = collatingContent;
}
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|