foray-commit Mailing List for FOray (Page 73)

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 12076
          http://sourceforge.net/p/foray/code/12076
Author:   victormote
Date:     2021-11-17 14:02:41 +0000 (Wed, 17 Nov 2021)
Log Message:
-----------
Get all tokenizer tests to pass. Much reorganization of code to follow.

Modified Paths:
--------------
    trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
    trunk/foray/foray-orthography/src/main/java/org/foray/orthography/LexerJavaBreakIterator.java
    trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerLatin1Tests.java

Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================

--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java	2021-11-17 12:35:51 UTC (rev 12075)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java	2021-11-17 14:02:41 UTC (rev 12076)
@@ -47,7 +47,7 @@
         WORD_CHAR,
 
         /** Character is non-word break content. */
-        NON_WORD_BREAK_CHAR,
+        BREAK_CHAR,
 
         /** Character is a leading attached punctuation item such as (in English) an opening quotation mark or opening
          * parenthesis. */
@@ -61,7 +61,7 @@
         END,
 
         /** Character is none of the above. */
-        OTHER;
+        OTHER,
 
     }
 
@@ -118,7 +118,7 @@
      */
     public CharType computeCharType(final int c) {
         if (isBreakChar(c)) {
-            return CharType.NON_WORD_BREAK_CHAR;
+            return CharType.BREAK_CHAR;
         }
         if (isWordChar(c)) {
             return CharType.WORD_CHAR;

Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/LexerJavaBreakIterator.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/LexerJavaBreakIterator.java	2021-11-17 12:35:51 UTC (rev 12075)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/LexerJavaBreakIterator.java	2021-11-17 14:02:41 UTC (rev 12076)
@@ -50,7 +50,7 @@
      * @param sequence The sequence whose breaks are needed.
      * @return The sequence of breaks, indexes into {@code sequence}.
      */
-    IntSequence findRawBreaks(final CharSequence sequence) {
+    protected IntSequence findRawBreaks(final CharSequence sequence) {
         final IntArrayBuilder result = new IntArrayBuilder(sequence.length());
         /* TODO: BreakIterators are reusable, not thread-safe, and expensive to create. Reuse this after figuring out
          * how instances of this class are used/reused. */
@@ -72,7 +72,7 @@
      * @return An array with a one-to-one correspondence with {@code rawBreaks}, containing the type of character at
      * that break.
      */
-    CharType[] findBreakTypes(final CharSequence sequence, final IntSequence rawBreaks) {
+    protected CharType[] findBreakTypes(final CharSequence sequence, final IntSequence rawBreaks) {
         final CharType[] breakTypes = new CharType[rawBreaks.length()];
         for (int breakIndex = 0; breakIndex < rawBreaks.length(); breakIndex ++) {
             if (breakIndex >= rawBreaks.length() - 1) {
@@ -86,164 +86,119 @@
         return breakTypes;
     }
 
-    @Override
-    public List<CharSequence> tokenize(final CharSequence sequence) {
-        if (sequence == null
-                || sequence.length() < 1) {
-            return Collections.emptyList();
+    /**
+     * Combine and eliminate the elements in the charTypes array.
+     * The touchstone here is the known word breaks which are always interword content.
+     * Anything between them must be either attached to the word break to become a part of the interword content, or
+     * must get coalesced into a "word" whether it is recognized as word content or not. If done properly, every element
+     * in the array, when finished, should be either {@link CharType#WORD_CHAR} or {@link CharType#BREAK_CHAR}.
+     * Anything not in those two categories will be treated in the final tokenization as {@link CharType#WORD_CHAR}.
+     * @param breakTypes The array of charTypes.
+     */
+    protected void filterBreakTypes(final CharType[] breakTypes) {
+        /* Convert attached trailing punctuation immediately after word content to interword. */
+        for (int breakIndex = 1; breakIndex < breakTypes.length; breakIndex ++) {
+            final CharType currentBreakType = breakTypes[breakIndex];
+            if (currentBreakType == CharType.ATTACHED_TRAILING_PUNCTUATION) {
+                final CharType previousBreakType = breakTypes[breakIndex - 1];
+                if (previousBreakType != CharType.BREAK_CHAR) {
+                    breakTypes[breakIndex] = CharType.BREAK_CHAR;
+                }
+            }
         }
+        /* Convert attached leading punctuation immediately before word content to interword. */
+        for (int breakIndex = 1; breakIndex < breakTypes.length; breakIndex ++) {
+            final CharType currentBreakType = breakTypes[breakIndex];
+            if (currentBreakType == CharType.ATTACHED_LEADING_PUNCTUATION) {
+                final CharType nextBreakType = breakTypes[breakIndex + 1];
+                if (nextBreakType != CharType.BREAK_CHAR) {
+                    breakTypes[breakIndex] = CharType.BREAK_CHAR;
+                }
+            }
+        }
+    }
 
-        /* First pass is to find all of the breaks that the BreakIterator can find. */
-        final IntSequence rawBreaks = findRawBreaks(sequence);
 
-
-        /* The BreakIterator is helpful, but for our purposes does not dig deeply enough.
-         * Our purpose is to find where words start and end and to treat all other content as non-word or interword
-         * content.
-         * So our second pass is to find out the type of each character that is at a break. */
-        final CharType[] breakTypes = findBreakTypes(sequence, rawBreaks);
-
-        /* The third pass detects which of the breaks we care about based on their type and context. */
+    protected List<CharSequence> createTokens(final CharSequence sequence, final IntSequence rawBreaks,
+            final CharType[] breakTypes) {
         final List<CharSequence> tokens = new ArrayList<CharSequence>();
+        boolean inWord = false;
+        int startNextToken = 0;
 
-        /* Handle the case of only one break. It is either a single-token word, or non-word content that must be
-         * preceded by an empty word. We will treat anything that is not a break char as part of a word. */
-        if (breakTypes.length == 1) {
-            if (breakTypes[0] == CharType.NON_WORD_BREAK_CHAR) {
-                tokens.add(StringUtils.EMPTY_STRING);
-            }
-            tokens.add(sequence);
-            return tokens;
-        }
-
-        /* There are at least two tokens. */
-
-        /* Index 0 is special. */
-        switch (breakTypes[0]) {
-        case NON_WORD_BREAK_CHAR: {
+        /* First token. */
+        switch(breakTypes[0]) {
+        case BREAK_CHAR: {
+            /* Sequence starts with a break. Add the empty dummy word that signals that fact. */
             tokens.add(StringUtils.EMPTY_STRING);
+            inWord = false;
             break;
         }
-        case WORD_CHAR: {
-            break;
+        default: {
+            /* Sequence starts with a word. */
+            inWord = true;
         }
-        case ATTACHED_LEADING_PUNCTUATION:
-        case ATTACHED_TRAILING_PUNCTUATION:
-        case OTHER: {
-            if (breakTypes[1] == CharType.NON_WORD_BREAK_CHAR) {
-                /* Treat this as a word. */
-            } else {
-                /* The next token starts a word. Threat this is non-word. */
-                tokens.add(StringUtils.EMPTY_STRING);
-            }
-            break;
         }
-        case END:
-            /* This shouldn't happen. */
-            break;
-        }
 
 
-        /* Now iterate the remaining breaks. */
-        int startNextToken = 0;
+        /* Iterate all remaining tokens. */
         for (int breakIndex = 1; breakIndex < breakTypes.length; breakIndex ++) {
+            final CharType currentBreakType = breakTypes[breakIndex];
             final int sequenceIndex = rawBreaks.intAt(breakIndex);
-            final CharType currentBreakType = breakTypes[breakIndex];
-            final CharType previousBreakType = breakTypes[breakIndex - 1];
-            final CharType nextBreakType = currentBreakType == CharType.END ? null : breakTypes[breakIndex + 1];
 
-            currentBreakType: {
-                switch (currentBreakType) {
-                case END: {
-                    tokens.add(sequence.subSequence(startNextToken, sequence.length()));
-                    break currentBreakType;
+            switch (currentBreakType) {
+            case END: {
+                tokens.add(sequence.subSequence(startNextToken, sequence.length()));
+                break;
+            }
+            case BREAK_CHAR: {
+                if (inWord) {
+                    /* Write the word and roll forward. */
+                    tokens.add(sequence.subSequence(startNextToken, sequenceIndex));
+                    startNextToken = sequenceIndex;
+                    inWord = false;
+                } else {
+                    /* There is no state change. Nothing to do. */
                 }
-                case NON_WORD_BREAK_CHAR: {
-                    previousBreakType: {
-                        switch (previousBreakType) {
-                        case WORD_CHAR: {
-                            tokens.add(sequence.subSequence(startNextToken, sequenceIndex));
-                            startNextToken = sequenceIndex;
-                            break previousBreakType;
-                        }
-                        case OTHER: {
-//                            nextBreakType: {
-//
-//                            }
-//                            tokens.add(sequence.subSequence(startNextToken, sequenceIndex));
-//                            startNextToken = sequenceIndex;
-                            break previousBreakType;
-                        }
-                        default: {
-                            break previousBreakType;
-                        }
-                        }
-                    }
-                    break currentBreakType;
+                break;
+            }
+            default: {
+                /* This is considered the start of word content. */
+                if (inWord) {
+                    /* There is no state change. Nothing to do. */
+                } else {
+                    /* Write the interword content and roll forward. */
+                    tokens.add(sequence.subSequence(startNextToken, sequenceIndex));
+                    startNextToken = sequenceIndex;
+                    inWord = true;
                 }
-                case WORD_CHAR: {
-                    if (previousBreakType == CharType.NON_WORD_BREAK_CHAR) {
-                        tokens.add(sequence.subSequence(startNextToken, sequenceIndex));
-                        startNextToken = sequenceIndex;
-                    }
-                    break currentBreakType;
-                }
-                case ATTACHED_LEADING_PUNCTUATION:
-                case ATTACHED_TRAILING_PUNCTUATION:
-                case OTHER: {
-                    previousBreakType: {
-                        switch (previousBreakType) {
-                        case WORD_CHAR: {
-                            /* The previous item is a word and this is not. Treat it as the beginning of a non-word
-                             * sequence. */
-                            tokens.add(sequence.subSequence(startNextToken, sequenceIndex));
-                            startNextToken = sequenceIndex;
-                            break previousBreakType;
-                        }
-                        case ATTACHED_LEADING_PUNCTUATION:
-                        case ATTACHED_TRAILING_PUNCTUATION:
-                        case OTHER: {
-                            /* There is no change. Nothing to do here. */
-                            break previousBreakType;
-                        }
-                        case NON_WORD_BREAK_CHAR: {
-                            nextBreakType:
-                                switch(nextBreakType) {
-                                case NON_WORD_BREAK_CHAR: {
-                                    /* Sitting between two word breaks. Treat this as a word. */
-                                    tokens.add(sequence.subSequence(startNextToken, sequenceIndex));
-                                    startNextToken = sequenceIndex;
-                                    break nextBreakType;
-                                }
-                                case ATTACHED_LEADING_PUNCTUATION:
-                                case ATTACHED_TRAILING_PUNCTUATION:
-                                case OTHER: {
-                                    break nextBreakType;
-                                }
-                                case WORD_CHAR: {
-                                    /* After a break and before a word. This is interword content. */
-                                    break nextBreakType;
-                                }
-                                case END: {
-                                    /* This can't happen. */
-                                    break nextBreakType;
-                                }
-                                }
-                            break previousBreakType;
-                        }
-                        case END: {
-                            /* This shouldn't happen. */
-                            break previousBreakType;
-                        }
-                        }
-                        break currentBreakType;
-                    }
-                }
-                }
             }
+            }
         }
-
         return tokens;
     }
 
+    @Override
+    public List<CharSequence> tokenize(final CharSequence sequence) {
+        if (sequence == null
+                || sequence.length() < 1) {
+            return Collections.emptyList();
+        }
+
+        /* First pass is to find all of the breaks that the BreakIterator can find. */
+        final IntSequence rawBreaks = findRawBreaks(sequence);
+
+
+        /* The BreakIterator is helpful, but for our purposes does not dig deeply enough.
+         * Our purpose is to find where words start and end and to treat all other content as non-word or interword
+         * content.
+         * So our second pass is to find out the type of each character that is at a break. */
+        final CharType[] breakTypes = findBreakTypes(sequence, rawBreaks);
+
+        /* Third pass. Simplify the breakTypes array. */
+        filterBreakTypes(breakTypes);
+
+        /* The fourth step iterates over the resolved break types and turns them into tokens. */
+        return createTokens(sequence, rawBreaks, breakTypes);
+    }
+
 }

Modified: trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerLatin1Tests.java
===================================================================
--- trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerLatin1Tests.java	2021-11-17 12:35:51 UTC (rev 12075)
+++ trunk/foray/foray-orthography/src/test/java/org/foray/orthography/LexerLatin1Tests.java	2021-11-17 14:02:41 UTC (rev 12076)
@@ -30,7 +30,6 @@
 
 import org.junit.Assert;
 import org.junit.Before;
-import org.junit.Ignore;
 import org.junit.Test;
 
 import java.util.List;
@@ -79,7 +78,7 @@
     public void testMedium() {
         final String testString = "39. It was the best of times. It was the worst of times.   <----";
         final List<CharSequence> actual = this.out.tokenize(testString);
-        Assert.assertEquals(28,      actual.size());
+        Assert.assertEquals(27,      actual.size());
         Assert.assertEquals("39",    actual.get(0));
         Assert.assertEquals(". ",    actual.get(1));
         Assert.assertEquals("It",    actual.get(2));
@@ -106,8 +105,7 @@
         Assert.assertEquals(" ",     actual.get(23));
         Assert.assertEquals("times", actual.get(24));
         Assert.assertEquals(".   ",  actual.get(25));
-        Assert.assertEquals("<",     actual.get(26));
-        Assert.assertEquals("----",  actual.get(27));
+        Assert.assertEquals("<----",  actual.get(26));
     }
 
     /**
@@ -156,7 +154,6 @@
      * However, because each one is between hard word breaks, should be tokenized as words.
      */
     @Test
-    @Ignore
     public void testWithSymbolsAsWords() {
         final String testString = "! @ # $ %";
         final List<CharSequence> actual = this.out.tokenize(testString);
@@ -178,7 +175,6 @@
      * treated as a word.
      */
     @Test
-    @Ignore
     public void testWithAttachedPunctuation() {
         final String testString = "Parentheses (as I stated earlier) are a matching pair of ( and ) characters.";
         final List<CharSequence> actual = this.out.tokenize(testString);

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





2006	Jan	Feb	Mar (139)	Apr (98)	May (250)	Jun (394)	Jul (84)	Aug (13)	Sep (420)	Oct (186)	Nov (1)	Dec (3)
2007	Jan (108)	Feb (202)	Mar (291)	Apr (247)	May (374)	Jun (227)	Jul (231)	Aug (60)	Sep (31)	Oct (45)	Nov (18)	Dec
2008	Jan (38)	Feb (71)	Mar (142)	Apr	May (59)	Jun (6)	Jul (10)	Aug	Sep	Oct	Nov	Dec
2009	Jan (12)	Feb (4)	Mar (88)	Apr (121)	May (17)	Jun (30)	Jul	Aug (5)	Sep	Oct (1)	Nov	Dec
2010	Jan (11)	Feb (76)	Mar (11)	Apr	May (11)	Jun	Jul	Aug (44)	Sep (14)	Oct (7)	Nov	Dec
2011	Jan	Feb	Mar	Apr	May (9)	Jun	Jul	Aug	Sep	Oct (10)	Nov	Dec
2012	Jan	Feb	Mar	Apr	May	Jun (3)	Jul (4)	Aug	Sep	Oct	Nov	Dec
2016	Jan	Feb	Mar	Apr	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec (168)
2017	Jan (77)	Feb (11)	Mar	Apr	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec
2018	Jan	Feb	Mar (1)	Apr (6)	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec
2019	Jan	Feb (88)	Mar (118)	Apr (1)	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec
2020	Jan	Feb	Mar	Apr	May (6)	Jun	Jul	Aug	Sep	Oct	Nov	Dec (141)
2021	Jan (170)	Feb (20)	Mar	Apr	May	Jun	Jul (1)	Aug	Sep	Oct (62)	Nov (189)	Dec (162)
2022	Jan (201)	Feb (118)	Mar (8)	Apr	May (2)	Jun (47)	Jul (19)	Aug (14)	Sep (3)	Oct	Nov (28)	Dec (235)
2023	Jan (112)	Feb (23)	Mar (2)	Apr (2)	May	Jun (1)	Jul	Aug (70)	Sep (92)	Oct (20)	Nov (1)	Dec (1)
2024	Jan	Feb	Mar (1)	Apr (1)	May (14)	Jun (11)	Jul (1)	Aug	Sep	Oct	Nov	Dec
2025	Jan (10)	Feb (29)	Mar	Apr (162)	May (245)	Jun (83)	Jul	Aug (1)	Sep	Oct	Nov (4)	Dec

foray-commit Mailing List for FOray (Page 73)

Modular XSL-FO Implementation for Java.

foray-commit — FOray repository commit log messages