From: <dfl...@us...> - 2013-09-04 14:26:52
|
Revision: 4060
          http://sourceforge.net/p/dl-learner/code/4060
Author:   dfleischhacker
Date:     2013-09-04 14:26:47 +0000 (Wed, 04 Sep 2013)
Log Message:
-----------
TextDocument cleans up text at initialization

Modified Paths:
--------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java	2013-09-04 14:15:30 UTC (rev 4059)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java	2013-09-04 14:26:47 UTC (rev 4060)
@@ -7,15 +7,19 @@
  */
 public class TextDocument implements Document {
     private String content;
+    private String rawContent;
 
 
     /**
-     * Initializes a text document with the given content.
+     * Initializes a text document with the given raw content. Internally, the content is cleaned up so that it only
+     * contains letters adhering to the regular expression pattern [A-Za-z].
      *
-     * @param content content of this text document
+     * @param content the raw content of this text document
      */
     public TextDocument(String content) {
-        this.content = content;
+        this.rawContent = content;
+        this.content = content.replaceAll("[^A-Za-z ]", " ");
+        this.content = this.content.replaceAll("\\s{2,}", " ");
     }
 
     @Override
@@ -30,7 +34,7 @@
      */
     @Override
     public String getRawContent() {
-        return content;
+        return rawContent;
     }
 
     @Override

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|