From: <dfl...@us...> - 2013-08-19 09:53:17
|
Revision: 4025 http://sourceforge.net/p/dl-learner/code/4025 Author: dfleischhacker Date: 2013-08-19 09:53:14 +0000 (Mon, 19 Aug 2013) Log Message: ----------- TR API: Add equals and hashCode to TextDocument Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-08-19 09:52:57 UTC (rev 4024) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-08-19 09:53:14 UTC (rev 4025) @@ -8,6 +8,12 @@ public class TextDocument implements Document { private String content; + + /** + * Initializes a text document with the given content. + * + * @param content content of this text document + */ public TextDocument(String content) { this.content = content; } @@ -26,4 +32,27 @@ public String getRawContent() { return content; } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + TextDocument that = (TextDocument) o; + + if (!content.equals(that.content)) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + return content.hashCode(); + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-09-04 14:26:52
|
Revision: 4060 http://sourceforge.net/p/dl-learner/code/4060 Author: dfleischhacker Date: 2013-09-04 14:26:47 +0000 (Wed, 04 Sep 2013) Log Message: ----------- TextDocument cleans up text at initialization Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-09-04 14:15:30 UTC (rev 4059) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-09-04 14:26:47 UTC (rev 4060) @@ -7,15 +7,19 @@ */ public class TextDocument implements Document { private String content; + private String rawContent; /** - * Initializes a text document with the given content. + * Initializes a text document with the given raw content. Internally, the content is cleaned up so that it only + * contains letters adhering to the regular expression pattern [A-Za-z]. * - * @param content content of this text document + * @param content the raw content of this text document */ public TextDocument(String content) { - this.content = content; + this.rawContent = content; + this.content = content.replaceAll("[^A-Za-z ]", " "); + this.content = this.content.replaceAll("\\s{2,}", " "); } @Override @@ -30,7 +34,7 @@ */ @Override public String getRawContent() { - return content; + return rawContent; } @Override This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-09-06 12:48:13
|
Revision: 4094 http://sourceforge.net/p/dl-learner/code/4094 Author: dfleischhacker Date: 2013-09-06 12:48:08 +0000 (Fri, 06 Sep 2013) Log Message: ----------- Normalize documents to all lowercase Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-09-06 11:36:33 UTC (rev 4093) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-09-06 12:48:08 UTC (rev 4094) @@ -20,6 +20,7 @@ this.rawContent = content; this.content = content.replaceAll("[^A-Za-z ]", " "); this.content = this.content.replaceAll("\\s{2,}", " "); + this.content = content.toLowerCase(); } @Override This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-09-09 10:12:59
|
Revision: 4100 http://sourceforge.net/p/dl-learner/code/4100 Author: dfleischhacker Date: 2013-09-09 10:12:56 +0000 (Mon, 09 Sep 2013) Log Message: ----------- Improve document content cleanup Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-09-09 10:12:21 UTC (rev 4099) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-09-09 10:12:56 UTC (rev 4100) @@ -18,9 +18,10 @@ */ public TextDocument(String content) { this.rawContent = content; - this.content = content.replaceAll("[^A-Za-z ]", " "); + this.content = content.toLowerCase(); + this.content = this.content.replaceAll("[^a-z ]", " "); this.content = this.content.replaceAll("\\s{2,}", " "); - this.content = content.toLowerCase(); + this.content = content.trim(); } @Override This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-10-24 13:48:02
|
Revision: 4129 http://sourceforge.net/p/dl-learner/code/4129 Author: dfleischhacker Date: 2013-10-24 13:47:58 +0000 (Thu, 24 Oct 2013) Log Message: ----------- Fix wrong cleaning in TextDocument Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-10-24 13:47:14 UTC (rev 4128) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-10-24 13:47:58 UTC (rev 4129) @@ -21,7 +21,7 @@ this.content = content.toLowerCase(); this.content = this.content.replaceAll("[^a-z ]", " "); this.content = this.content.replaceAll("\\s{2,}", " "); - this.content = content.trim(); + this.content = this.content.trim(); } @Override This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-11-21 12:32:33
|
Revision: 4157 http://sourceforge.net/p/dl-learner/code/4157 Author: dfleischhacker Date: 2013-11-21 12:32:30 +0000 (Thu, 21 Nov 2013) Log Message: ----------- TextDocument refactoring Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 12:25:40 UTC (rev 4156) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 12:32:30 UTC (rev 4157) @@ -1,95 +1,12 @@ package org.dllearner.algorithms.isle.index; -import org.dllearner.algorithms.isle.StanfordPartOfSpeechTagger; +import java.util.LinkedList; /** * A simple text document without further formatting or markup. * * @author Daniel Fleischhacker */ -public class TextDocument implements Document { - private String content; - private String rawContent; - private String posTaggedContent; +public class TextDocument extends LinkedList<Token> { - /** - * Initializes a text document with the given raw content. Internally, the content is cleaned up so that it only - * contains letters adhering to the regular expression pattern [A-Za-z]. 
- * - * @param content the raw content of this text document - */ - public TextDocument(String content) { - this.rawContent = content; - - //build cleaned content - buildCleanedContent(); - - //build POS tagged content - buildPOSTaggedContent(); - } - - private void buildCleanedContent(){ - this.content = rawContent.toLowerCase(); - this.content = this.content.replaceAll("[^a-z ]", " "); - this.content = this.content.replaceAll("\\s{2,}", " "); - this.content = this.content.trim(); - } - - private void buildPOSTaggedContent(){ - this.posTaggedContent = StanfordPartOfSpeechTagger.getInstance().tag(rawContent); - } - - @Override - public String getContent() { - return content; - } - - /** - * The text content of this document. Returns the same data as {@link #getContent()}. - * - * @return text content of this document - */ - @Override - public String getRawContent() { - return rawContent; - } - - /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.index.Document#getPOSTaggedContent() - */ - @Override - public String getPOSTaggedContent() { - return posTaggedContent; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - - TextDocument that = (TextDocument) o; - - if (!content.equals(that.content)) { - return false; - } - - return true; - } - - @Override - public int hashCode() { - return content.hashCode(); - } - - /* (non-Javadoc) - * @see java.lang.Object#toString() - */ - @Override - public String toString() { - return content; - } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-11-21 12:44:12
|
Revision: 4159 http://sourceforge.net/p/dl-learner/code/4159 Author: dfleischhacker Date: 2013-11-21 12:44:09 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Re-implement TextDocument based on Tokens Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 12:36:47 UTC (rev 4158) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 12:44:09 UTC (rev 4159) @@ -7,6 +7,63 @@ * * @author Daniel Fleischhacker */ -public class TextDocument extends LinkedList<Token> { +public class TextDocument extends LinkedList<Token> implements Document { + @Override + public String getContent() { + return getContentStartingAtToken(this.getFirst(), Level.STEMMED); + } + @Override + public String getRawContent() { + return getContentStartingAtToken(this.getFirst(), Level.RAW); + } + + @Override + public String getPOSTaggedContent() { + return getContentStartingAtToken(this.getFirst(), Level.POS_TAGGED); + } + + public static enum Level { + RAW, + POS_TAGGED, + STEMMED + } + + /** + * Returns a string containing all tokens starting at the token {@code start} until the end of the list. The + * surface forms according to {@code level} are used to build the string. 
+ * + * @param start token to start building the string at, i.e., the first token in the returned string + * @param l level of surface forms to use + * @return built string + */ + public String getContentStartingAtToken(Token start, Level l) { + StringBuilder sb = new StringBuilder(); + boolean found = false; + for (Token t : this) { + if (found) { + sb.append(" "); + sb.append(getStringForLevel(t, l)); + } + else if (t == start) { + found = true; + sb.append(getStringForLevel(t, l)); + } + } + + return sb.toString(); + } + + private String getStringForLevel(Token t, Level l) { + switch (l) { + case RAW: + return t.getRawForm(); + case POS_TAGGED: + return t.getPOSTag(); + case STEMMED: + return t.getStemmedForm(); + } + + return null; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-11-21 13:00:36
|
Revision: 4162 http://sourceforge.net/p/dl-learner/code/4162 Author: dfleischhacker Date: 2013-11-21 13:00:33 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Ignore punctuation in stemmed text Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 12:57:10 UTC (rev 4161) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 13:00:33 UTC (rev 4162) @@ -37,7 +37,10 @@ for (Token t : this) { if (found) { sb.append(" "); - sb.append(getStringForLevel(t, l)); + String surfaceForm = getStringForLevel(t, l); + if (surfaceForm != null) { + sb.append(surfaceForm); + } } else if (t == start) { found = true; @@ -55,9 +58,19 @@ case POS_TAGGED: return t.getPOSTag(); case STEMMED: - return t.getStemmedForm(); + return t.isPunctuation() ? null : t.getStemmedForm(); } return null; } + + public static void main(String[] args) { + TextDocument t = new TextDocument(); + String s = "This is a very long, nice text for testing our new implementation of TextDocument."; + for (String e : s.split(" ")) { + t.add(new Token(e)); + } + + System.out.println(t.getRawContent()); + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-11-21 13:40:34
|
Revision: 4166 http://sourceforge.net/p/dl-learner/code/4166 Author: dfleischhacker Date: 2013-11-21 13:40:31 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Add method for getting a number of tokens starting at a given token Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 13:39:34 UTC (rev 4165) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 13:40:31 UTC (rev 4166) @@ -1,6 +1,8 @@ package org.dllearner.algorithms.isle.index; +import java.util.ArrayList; import java.util.LinkedList; +import java.util.List; /** * A simple text document without further formatting or markup. @@ -8,6 +10,16 @@ * @author Daniel Fleischhacker */ public class TextDocument extends LinkedList<Token> implements Document { + public static void main(String[] args) { + TextDocument t = new TextDocument(); + String s = "This is a very long, nice text for testing our new implementation of TextDocument."; + for (String e : s.split(" ")) { + t.add(new Token(e)); + } + + System.out.println(t.getRawContent()); + } + @Override public String getContent() { return getContentStartingAtToken(this.getFirst(), SurfaceFormLevel.STEMMED); @@ -28,7 +40,7 @@ * surface forms according to {@code level} are used to build the string. 
* * @param start token to start building the string at, i.e., the first token in the returned string - * @param l level of surface forms to use + * @param l level of surface forms to use * @return built string */ public String getContentStartingAtToken(Token start, SurfaceFormLevel l) { @@ -51,6 +63,42 @@ return sb.toString(); } + /** + * Returns a list containing {@code numberOfTokens} successive tokens from this document starting at the given start + * token. If {@code ignorePunctuation} is set, tokens which represent punctuation are added to the result but not + * counted for the number of tokens. + * + * @param start token to start collecting tokens from the document + * @param numberOfTokens number of tokens to collect from the document + * @param ignorePunctuation if true, punctuation are not counted towards the number of tokens to return + * @return list containing the given number of relevant tokens, depending in the value of ignorePunctuation, the + * list might contain additional non-relevant (punctuation) tokens + */ + public List<Token> getTokensStartingAtToken(Token start, int numberOfTokens, boolean ignorePunctuation) { + ArrayList<Token> tokens = new ArrayList<Token>(); + + int relevantTokens = 0; + boolean found = false; + + for (Token t : this) { + if (found) { + tokens.add(t); + if (!ignorePunctuation || !t.isPunctuation()) { + relevantTokens++; + } + } + else if (t == start) { + found = true; + tokens.add(t); + } + if (relevantTokens == numberOfTokens) { + break; + } + } + + return tokens; + } + private String getStringForLevel(Token t, SurfaceFormLevel l) { switch (l) { case RAW: @@ -63,14 +111,4 @@ return null; } - - public static void main(String[] args) { - TextDocument t = new TextDocument(); - String s = "This is a very long, nice text for testing our new implementation of TextDocument."; - for (String e : s.split(" ")) { - t.add(new Token(e)); - } - - System.out.println(t.getRawContent()); - } } This was sent by the SourceForge.net 
collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-12-02 15:19:09
|
Revision: 4185 http://sourceforge.net/p/dl-learner/code/4185 Author: dfleischhacker Date: 2013-12-02 15:19:06 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Add getTokensStartingAtToken without numberOfTokens parameter Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-12-02 14:59:36 UTC (rev 4184) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-12-02 15:19:06 UTC (rev 4185) @@ -99,6 +99,31 @@ return tokens; } + /** + * Returns a list containing all successive tokens from this document starting at the given start + * token. If {@code ignorePunctuation} is set, tokens which represent punctuation are added to the result but not + * counted for the number of tokens. + * + * @param start token to start collecting tokens from the document + * @param ignorePunctuation if true, punctuation tokens are not counted towards the number of tokens to return + * @return list containing all relevant tokens, depending on the value of ignorePunctuation, the + * list might contain additional non-relevant (punctuation) tokens + */ + public List<Token> getTokensStartingAtToken(Token start, boolean ignorePunctuation) { + ArrayList<Token> tokens = new ArrayList<Token>(); + + boolean found = false; + + for (int i = 0; i < this.size(); i++) { + Token t = this.get(i); + if (t == start) { + return this.subList(i, this.size()); + } + } + + return tokens; + } + private String getStringForLevel(Token t, SurfaceFormLevel l) { switch (l) { case RAW: This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-12-02 15:20:20
|
Revision: 4186 http://sourceforge.net/p/dl-learner/code/4186 Author: dfleischhacker Date: 2013-12-02 15:20:16 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Re-enabled ignorePunctuation Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-12-02 15:19:06 UTC (rev 4185) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-12-02 15:20:16 UTC (rev 4186) @@ -112,13 +112,20 @@ public List<Token> getTokensStartingAtToken(Token start, boolean ignorePunctuation) { ArrayList<Token> tokens = new ArrayList<Token>(); + int relevantTokens = 0; boolean found = false; - for (int i = 0; i < this.size(); i++) { - Token t = this.get(i); - if (t == start) { - return this.subList(i, this.size()); + for (Token t : this) { + if (found) { + tokens.add(t); + if (!ignorePunctuation || !t.isPunctuation()) { + relevantTokens++; + } } + else if (t == start) { + found = true; + tokens.add(t); + } } return tokens; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |