[DL-Learner SVN] SF.net SVN: dl-learner:[4166] trunk/components-core/src/main/java/org/dllearner/

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 4166
          http://sourceforge.net/p/dl-learner/code/4166
Author:   dfleischhacker
Date:     2013-11-21 13:40:31 +0000 (Thu, 21 Nov 2013)
Log Message:
-----------
Add method for getting a number of tokens starting at a given token

Modified Paths:
--------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java
===================================================================

--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java	2013-11-21 13:39:34 UTC (rev 4165)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java	2013-11-21 13:40:31 UTC (rev 4166)
@@ -1,6 +1,8 @@
 package org.dllearner.algorithms.isle.index;
 
+import java.util.ArrayList;
 import java.util.LinkedList;
+import java.util.List;
 
 /**
  * A simple text document without further formatting or markup.
@@ -8,6 +10,16 @@
  * @author Daniel Fleischhacker
  */
 public class TextDocument extends LinkedList<Token> implements Document {
+    public static void main(String[] args) {
+        // Ad-hoc demo: one Token per space-separated word of a sample sentence, then print getRawContent().
+        String sample = "This is a very long, nice text for testing our new implementation of TextDocument.";
+        TextDocument doc = new TextDocument();
+        for (String word : sample.split(" ")) {
+            doc.add(new Token(word));
+        }
+        System.out.println(doc.getRawContent());
+    }
+
     @Override
     public String getContent() {
         return getContentStartingAtToken(this.getFirst(), SurfaceFormLevel.STEMMED);
@@ -28,7 +40,7 @@
      * surface forms according to {@code level} are used to build the string.
      *
      * @param start token to start building the string at, i.e., the first token in the returned string
-     * @param l level of surface forms to use
+     * @param l     level of surface forms to use
      * @return built string
      */
     public String getContentStartingAtToken(Token start, SurfaceFormLevel l) {
@@ -51,6 +63,42 @@
         return sb.toString();
     }
 
+    /**
+     * Returns the tokens of this document from {@code start} (matched by identity) onwards until
+     * {@code numberOfTokens} relevant tokens are collected or the document ends; the start token counts as well. If
+     * {@code ignorePunctuation} is set, punctuation tokens are added to the result but not counted.
+     *
+     * @param start             token to start collecting tokens from the document
+     * @param numberOfTokens    number of relevant tokens to collect from the document
+     * @param ignorePunctuation if true, punctuation tokens are not counted towards the number of tokens to return
+     * @return list containing up to the given number of relevant tokens; depending on the value of
+     *          ignorePunctuation, the list might contain additional non-relevant (punctuation) tokens. The list
+     *          is empty if {@code start} is not part of this document or {@code numberOfTokens} is 0
+     */
+    public List<Token> getTokensStartingAtToken(Token start, int numberOfTokens, boolean ignorePunctuation) {
+        List<Token> tokens = new ArrayList<Token>();
+        int relevantTokens = 0;
+        boolean found = false;
+
+        for (Token t : this) {
+            if (!found) {
+                if (t != start) {
+                    continue;
+                }
+                found = true;
+            }
+            // checked before adding so that the start token is counted like every other token
+            if (relevantTokens == numberOfTokens) {
+                break;
+            }
+            tokens.add(t);
+            if (!ignorePunctuation || !t.isPunctuation()) {
+                relevantTokens++;
+            }
+        }
+        return tokens;
+    }
+
     private String getStringForLevel(Token t, SurfaceFormLevel l) {
         switch (l) {
             case RAW:
@@ -63,14 +111,4 @@
 
         return null;
     }
-
-    public static void main(String[] args) {
-        TextDocument t = new TextDocument();
-        String s = "This is a very long, nice text for testing our new implementation of TextDocument.";
-        for (String e : s.split(" ")) {
-            t.add(new Token(e));
-        }
-
-        System.out.println(t.getRawContent());
-    }
 }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





[DL-Learner SVN] SF.net SVN: dl-learner:[4166] trunk/components-core/src/main/java/org/dllearner/

[DL-Learner SVN] SF.net SVN: dl-learner:[4166] trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java