foray-commit Mailing List for FOray (Page 25)

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 13276
          http://sourceforge.net/p/foray/code/13276
Author:   victormote
Date:     2023-09-27 00:09:41 +0000 (Wed, 27 Sep 2023)
Log Message:
-----------
Store the intermediate lexing results (the break offsets and the break types) with each input item, so that all input items can be processed as contiguous items.

Modified Paths:
--------------
    trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java

Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================

--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java	2023-09-26 23:54:59 UTC (rev 13275)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java	2023-09-27 00:09:41 UTC (rev 13276)
@@ -197,6 +197,12 @@
         /** Indicates if this is a word token. If false, it is untokenized input. */
         private boolean isWordToken;
 
+        /** The indexes into {@link #text} where each computed break occurs. */
+        private IntSequence breakOffsets;
+
+        /** The type of each break in {@link #breakOffsets}. */
+        private TokenType2[] breakTypes;
+
     }
 
     /**
@@ -385,7 +391,7 @@
     private void tokenizeImplicit(final int index) {
         final Input inputItem = this.input.get(index);
         /* First pass is to find all of the breaks that the BreakIterator can find. */
-        final IntSequence rawBreaks = findRawBreaks(inputItem.text, inputItem.writingSystem);
+        inputItem.breakOffsets = findRawBreaks(inputItem.text, inputItem.writingSystem);
 
 
         /* The BreakIterator is helpful, but for our purposes does not dig deeply enough.
@@ -392,7 +398,7 @@
          * Our purpose is to find where words start and end and to treat all other content as non-word or interword
          * content.
          * So our second pass is to find out the type of each character that is at a break. */
-        final TokenType2[] breakTypes = findBreakTypes(inputItem.text, rawBreaks);
+        inputItem.breakTypes = findBreakTypes(inputItem.text, inputItem.breakOffsets);
 
         /* Third pass. Simplify the breakTypes array. */
         /* For normal case (no explicit tokens), the conceptual token immediately previous to the first one is a break
@@ -400,10 +406,10 @@
         final TokenType2 preSequenceBreakType = TokenType2.BREAK;
         /* The conceptual token immediately after the last actual token is the end char. */
         final TokenType2 postSequenceBreakType = TokenType2.END;
-        filterBreakTypes(breakTypes, preSequenceBreakType, postSequenceBreakType);
+        filterBreakTypes(inputItem.breakTypes, preSequenceBreakType, postSequenceBreakType);
 
         /* The fourth step iterates over the resolved break types and turns them into tokens. */
-        createImplicitTokens(inputItem.text, rawBreaks, breakTypes, inputItem.writingSystem);
+        createImplicitTokens(inputItem);
     }
 
     /**
@@ -690,26 +696,22 @@
 
     /**
      * Create the token list.
-     * @param sequence The sequence of characters being tokenized.
-     * @param rawOffsets The offsets of the breaks found by the break iterator.
-     * @param breakTypes The filtered break types.
-     * @param writingSystem The writing system for {@code sequence}.
+     * @param inputItem The input item whose raw break information should be converted to tokens.
      */
-    protected void createImplicitTokens(final CharSequence sequence, final IntSequence rawOffsets,
-            final TokenType2[] breakTypes, final WritingSystem writingSystem) {
+    protected void createImplicitTokens(final Input inputItem) {
         TokenType2 lastBreakType = TokenType2.START;
         int nextTokenOffset = 0;
 
-        for (int breakIndex = 0; breakIndex < breakTypes.length; breakIndex ++) {
-            final TokenType2 currentBreakType = breakTypes[breakIndex];
-            final int currentOffset = rawOffsets.intAt(breakIndex);
+        for (int breakIndex = 0; breakIndex < inputItem.breakTypes.length; breakIndex ++) {
+            final TokenType2 currentBreakType = inputItem.breakTypes[breakIndex];
+            final int currentOffset = inputItem.breakOffsets.intAt(breakIndex);
             if (lastBreakType != TokenType2.START
                     && (currentBreakType != lastBreakType
                     || currentBreakType != TokenType2.WORD)) {
                 final Token4a token = new Token4a();
-                token.text = sequence.subSequence(nextTokenOffset, currentOffset);
+                token.text = inputItem.text.subSequence(nextTokenOffset, currentOffset);
                 token.type = lastBreakType.wrappedTokenType;
-                token.writingSystem = writingSystem;
+                token.writingSystem = inputItem.writingSystem;
                 this.output.add(token);
                 nextTokenOffset = currentOffset;
             }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





2006	Jan	Feb	Mar (139)	Apr (98)	May (250)	Jun (394)	Jul (84)	Aug (13)	Sep (420)	Oct (186)	Nov (1)	Dec (3)
2007	Jan (108)	Feb (202)	Mar (291)	Apr (247)	May (374)	Jun (227)	Jul (231)	Aug (60)	Sep (31)	Oct (45)	Nov (18)	Dec
2008	Jan (38)	Feb (71)	Mar (142)	Apr	May (59)	Jun (6)	Jul (10)	Aug	Sep	Oct	Nov	Dec
2009	Jan (12)	Feb (4)	Mar (88)	Apr (121)	May (17)	Jun (30)	Jul	Aug (5)	Sep	Oct (1)	Nov	Dec
2010	Jan (11)	Feb (76)	Mar (11)	Apr	May (11)	Jun	Jul	Aug (44)	Sep (14)	Oct (7)	Nov	Dec
2011	Jan	Feb	Mar	Apr	May (9)	Jun	Jul	Aug	Sep	Oct (10)	Nov	Dec
2012	Jan	Feb	Mar	Apr	May	Jun (3)	Jul (4)	Aug	Sep	Oct	Nov	Dec
2016	Jan	Feb	Mar	Apr	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec (168)
2017	Jan (77)	Feb (11)	Mar	Apr	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec
2018	Jan	Feb	Mar (1)	Apr (6)	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec
2019	Jan	Feb (88)	Mar (118)	Apr (1)	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec
2020	Jan	Feb	Mar	Apr	May (6)	Jun	Jul	Aug	Sep	Oct	Nov	Dec (141)
2021	Jan (170)	Feb (20)	Mar	Apr	May	Jun	Jul (1)	Aug	Sep	Oct (62)	Nov (189)	Dec (162)
2022	Jan (201)	Feb (118)	Mar (8)	Apr	May (2)	Jun (47)	Jul (19)	Aug (14)	Sep (3)	Oct	Nov (28)	Dec (235)
2023	Jan (112)	Feb (23)	Mar (2)	Apr (2)	May	Jun (1)	Jul	Aug (70)	Sep (92)	Oct (20)	Nov (1)	Dec (1)
2024	Jan	Feb	Mar (1)	Apr (1)	May (14)	Jun (11)	Jul (1)	Aug	Sep	Oct	Nov	Dec
2025	Jan (10)	Feb (29)	Mar	Apr (162)	May (245)	Jun (83)	Jul	Aug (1)	Sep	Oct	Nov	Dec

foray-commit Mailing List for FOray (Page 25)

Modular XSL-FO Implementation for Java.

foray-commit — FOray repository commit log messages