[Corpusreader-svn] SF.net SVN: corpusreader:[227] trunk/corpusreader/src/main/java/tei/cr/filters
Status: Alpha
Brought to you by:
sylvainloiseau
|
From: <syl...@us...> - 2009-09-24 13:31:46
|
Revision: 227
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=227&view=rev
Author: sylvainloiseau
Date: 2009-09-24 13:31:17 +0000 (Thu, 24 Sep 2009)
Log Message:
-----------
New filter
Modified Paths:
--------------
trunk/corpusreader/src/main/java/tei/cr/filters/ExtractNGram.java
trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java
Added Paths:
-----------
trunk/corpusreader/src/main/java/tei/cr/filters/ExtractPhenomena.java
trunk/corpusreader/src/main/java/tei/cr/filters/ProvidingPhenomenaStream.java
Modified: trunk/corpusreader/src/main/java/tei/cr/filters/ExtractNGram.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/ExtractNGram.java 2009-09-24 13:31:08 UTC (rev 226)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/ExtractNGram.java 2009-09-24 13:31:17 UTC (rev 227)
@@ -69,7 +69,7 @@
public void startPipeline() throws FilterException {
if (patternsBuilder.isEmpty()) {
throw new FilterException(
- "No locator pattern provided."
+ "No PhenomenaStream pattern provided."
);
}
patterns =
@@ -79,7 +79,7 @@
super.addContentHandlerBefore((ContentHandler)patterns[i]);
}
log.info(
- "Number of TeiLocatorPattern registered: "
+ "Number of PhenomenaStream Pattern registered: "
+ patterns.length
);
super.startPipeline();
Added: trunk/corpusreader/src/main/java/tei/cr/filters/ExtractPhenomena.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/ExtractPhenomena.java (rev 0)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/ExtractPhenomena.java 2009-09-24 13:31:17 UTC (rev 227)
@@ -0,0 +1,99 @@
+package tei.cr.filters;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import tei.cr.pipeline.AbstractForkingBase;
+import tei.cr.pipeline.FilterByNames;
+import tei.cr.pipeline.WrongArgsException;
+import tei.cr.querydoc.FilterArguments;
+import tei.cr.querydoc.ParseOccurrencePattern;
+import tei.cr.teiDocument.TeiDocument;
+import tei.cr.utils.sax.occurrence.OccurrencePattern;
+import tei.cr.component.ngram.NGramStreamBuilder;
+import tei.cr.component.frequencyList.FrequencyList;
+import tei.cr.component.phenomenaStream.PhenomenaStream;
+import tei.cr.component.phenomenaStream.PhenomenaStreamBuilder;
+
+/**
+ * Filter that parses occurrence patterns from its arguments, installs
+ * them as content handlers when the pipeline starts, and gathers every
+ * phenomenon passed to {@link #occurrence(Object)} into a
+ * {@link PhenomenaStream}.
+ */
+final public class ExtractPhenomena
+    extends AbstractForkingBase
+    implements ReceiveOccurrence, ProvidingPhenomenaStream {
+
+  private final List<OccurrencePattern> patternsBuilder = new ArrayList<OccurrencePattern>();
+  private OccurrencePattern[] patterns;
+  private final Logger log = Logger.getLogger(getClass().getName());
+  private PhenomenaStreamBuilder tokensStreamBuilder = new PhenomenaStreamBuilder();
+
+  /** Queue a pattern for registration by {@link #startPipeline()}. */
+  public void addPattern(OccurrencePattern pattern) {
+    patternsBuilder.add(pattern);
+  }
+
+  /**
+   * Receive arguments from a query doc: each element node found under
+   * <code>occurrences</code> is parsed into a pattern and queued.
+   *
+   * @throws WrongArgsException if no node is found.
+   */
+  public void setArguments(FilterArguments fA, FilterByNames nH,
+      TeiDocument doc)
+      throws WrongArgsException {
+    final NodeList occurrenceNodes = fA.getNodeList("occurrences/*");
+    if (occurrenceNodes.getLength() == 0) {
+      throw new WrongArgsException("No occurrences provided.");
+    }
+
+    final ReceiveOccurrence receiver = this;
+    for (int idx = 0; idx < occurrenceNodes.getLength(); idx++) {
+      final Node candidate = occurrenceNodes.item(idx);
+      if (candidate.getNodeType() == Node.ELEMENT_NODE) {
+        addPattern(ParseOccurrencePattern.getOccurrencePattern(receiver, candidate, fA));
+      }
+    }
+    super.setArguments(fA, nH, doc);
+  }
+
+  /**
+   * Install every queued pattern as a content handler, then start the
+   * pipeline.
+   *
+   * @throws FilterException if no pattern has been queued.
+   */
+  public void startPipeline() throws FilterException {
+    if (patternsBuilder.isEmpty()) {
+      throw new FilterException("No PhenomenaStream pattern provided.");
+    }
+    patterns = patternsBuilder.toArray(new OccurrencePattern[0]);
+    for (OccurrencePattern registered : patterns) {
+      super.addContentHandlerBefore((ContentHandler) registered);
+    }
+    log.info("Number of PhenomenaStream Pattern registered: " + patterns.length);
+    super.startPipeline();
+  }
+
+  /** Append one reported phenomenon to the stream under construction. */
+  public void occurrence(Object phenomenon) {
+    tokensStreamBuilder.addPhenomenon(phenomenon);
+  }
+
+  /**
+   * Return the phenomena gathered since the previous call and start a
+   * fresh builder for the following ones.
+   */
+  public PhenomenaStream getPhenomenaStream() {
+    final PhenomenaStream collected = tokensStreamBuilder.getPhenomenaStrem();
+    tokensStreamBuilder = new PhenomenaStreamBuilder();
+    return collected;
+  }
+
+}
Added: trunk/corpusreader/src/main/java/tei/cr/filters/ProvidingPhenomenaStream.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/ProvidingPhenomenaStream.java (rev 0)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/ProvidingPhenomenaStream.java 2009-09-24 13:31:17 UTC (rev 227)
@@ -0,0 +1,29 @@
+package tei.cr.filters;
+
+import tei.cr.component.phenomenaStream.PhenomenaStream;
+
+/**
+ * A filter able to provide a {@link PhenomenaStream}.
+ *
+ * @version 0.1
+ *
+ * @author Sylvain Loiseau <syl...@u-...>
+ *
+ * @see tei.cr.filters.ExtractPhenomena
+ *
+ * @see tei.cr.filters.TextFormat
+ */
+public interface ProvidingPhenomenaStream {
+
+ /**
+ * <p>Return the stream of phenomena collected by the filter.</p>
+ *
+ * <p>The contract of this method is that a different
+ * {@link PhenomenaStream} instance is returned at each
+ * different call to this method.</p>
+ *
+ * @return a stream containing the phenomena
+ * received since the preceding call to this method.
+ */
+ public PhenomenaStream getPhenomenaStream();
+}
Modified: trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java 2009-09-24 13:31:08 UTC (rev 226)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java 2009-09-24 13:31:17 UTC (rev 227)
@@ -1,4 +1,4 @@
package tei.cr.filters;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
@@ -13,203 +13,131 @@
import tei.cr.pipeline.WrongArgsException;
import tei.cr.querydoc.FilterArguments;
import tei.cr.teiDocument.TeiDocument;
+import tei.cr.component.phenomenaStream.PhenomenaStream;
/**
- * TODO : to be rewrite with tei.cr.component.phenomenaStream
- * <p>Convert into plain text, using w/@lm.</p>
+ * Convert a stream of phenomena provided by a
+ * {@link ProvidingPhenomenaStream} filter into a text file. Tokens
+ * are blank-separated, and each document is terminated by
+ * a line break.
*
- * <p>Token local name and paragrahp local name must be different.</p>
- *
* @author Sylvain Loiseau <slo...@u-...>
* @version 0.1
*/
final public class TextFormat extends AbstractBase {
- ///////////////////////////////
- // Fields
- ///////////////////////////////
+  ///////////////////////////////
+  // Fields
+  ///////////////////////////////
- private BufferedWriter textWriter;
+  private BufferedWriter textWriter;
- private String textUri = null;
+  private String textUri = null;
- /**
- * The name of the element surrounding the tokens
- */
- private String tokenLocalName = DEFAULT_TOKEN_LOCAL_NAME;
-
- /**
- * Default name of the element surrounding the tokens
- */
- private final static String DEFAULT_TOKEN_LOCAL_NAME = "w";
+  /** Provider of the phenomena to write; resolved by name in setArguments. */
+  private ProvidingPhenomenaStream handler;
- /**
- * The name of the element surrounding the paragraph, separated with \n
- */
- private String paragraphLocalName = DEFAULT_PARAGRAPH_LOCAL_NAME;
-
- /**
- * Default name of the element surrounding the tokens
- */
- private final static String DEFAULT_PARAGRAPH_LOCAL_NAME = "p";
+  ///////////////////////////////
+  // Pipelinable
+  ///////////////////////////////
- private boolean isInParagraph = false;
+  /**
+   * Receive arguments from a query doc: the name of the filter
+   * providing the phenomena and the URI of the text file to write.
+   *
+   * @throws WrongArgsException if no suitable filter is registered under
+   *         the given name, or if the text URI is null or empty.
+   */
+  public void setArguments(FilterArguments fA, FilterByNames nH, TeiDocument doc)
+      throws WrongArgsException {
+    String phenomena_filter = fA.getText(FilterArguments.TEXT_FORMAT_PROVIDE_PHENOMENA);
- private boolean isInToken = false;
+    // Resolve as a plain Object first, so that a filter of the wrong type is
+    // reported as a WrongArgsException rather than a ClassCastException.
+    Object provider = nH.get(phenomena_filter);
+    if (provider == null) {
+      throw new WrongArgsException("No filter of name \""
+          + phenomena_filter + "\"found in the pipeline");
+    }
+    if (!(provider instanceof ProvidingPhenomenaStream)) {
+      throw new WrongArgsException("The filter of name \""
+          + phenomena_filter + "\" does not provide a phenomena stream");
+    }
+    handler = (ProvidingPhenomenaStream) provider;
- ///////////////////////////////
- // Accessor
- ///////////////////////////////
-
- /**
- * @param localName the local name to be used for
- * annotating the token
- * @throws WrongArgsException if the local name is null or empty.
- */
- public void setTokenLocalName(String localName)
- throws WrongArgsException {
- if ((localName == null) || localName.equals("")) {
- throw new WrongArgsException("The token element local name cannot be null or empty.");
- }
- tokenLocalName = localName;
+    textUri = fA.getText(FilterArguments.TEXT_FORMAT_TEXT_URI);
+    if ((textUri == null) || textUri.equals("")) {
+      throw new WrongArgsException("Text url cannot be null.");
+    }
}
- /**
- * @param localName the local name to be used for
- * annotating the token
- * @throws WrongArgsException if the local name is null or empty.
- */
- public void setParagraphLocalName(String localName)
- throws WrongArgsException {
- if ((localName == null) || localName.equals("")) {
- throw new WrongArgsException("The token element local name cannot be null or empty.");
- }
- tokenLocalName = localName;
+  /**
+   * Create the file.
+   */
+  public void startPipeline() throws FilterException {
+    try {
+      textWriter = new BufferedWriter(
+          new OutputStreamWriter(
+              new FileOutputStream(textUri),
+              "ISO-8859-1"
+          )
+      );
+    } catch (IOException e) {
+      throw new FilterException("Error while creating the file: " +
+          e.getMessage(),
+          e
+      );
}
+    super.startPipeline();
+  }
- ///////////////////////////////
- // Pipelinable
- ///////////////////////////////
+  /** Flush the text writer, then end the pipeline. */
+  public void endPipeline() throws FilterException {
+    flush();
+    super.endPipeline();
+  }
- public void setArguments(FilterArguments fA, FilterByNames nH, TeiDocument doc)
- throws WrongArgsException {
- String w_localName = fA.getText(FilterArguments.TEXT_FORMAT_TOKEN_LOCAL_NAME);
- if ((w_localName != null) && w_localName.equals("")) {
- setTokenLocalName(w_localName);
- }
- String p_localName = fA.getText(FilterArguments.TEXT_FORMAT_PARAGRAPH_LOCAL_NAME);
- if ((p_localName != null) && p_localName.equals("")) {
- setParagraphLocalName(p_localName);
- }
- textUri = fA.getText(FilterArguments.TEXT_FORMAT_TEXT_URI);
- if ((textUri == null) || textUri.equals("")) {
- throw new WrongArgsException("Text url cannot be null.");
- }
+  /** Flush the text writer, reporting any I/O failure as a FilterException. */
+  private void flush() throws FilterException {
+    try {
+      textWriter.flush();
+    } catch (IOException ioE) {
+      throw new FilterException("Error while flushing \"" +
+          textUri +
+          "\": " +
+          ioE.getMessage(),
+          ioE
+      );
}
+  }
- /**
- * Create the file.
- */
- public void startPipeline() throws FilterException {
- try {
- textWriter = new BufferedWriter(
- new OutputStreamWriter(
- new FileOutputStream(textUri),
- "ISO-8859-1"
- )
- );
- } catch (IOException e) {
- throw new FilterException("Error while creating the file: " +
- e.getMessage(),
- e
- );
- }
- super.startPipeline();
- }
+  ///////////////////////////////
+  // Event handlers
+  ///////////////////////////////
- public void endPipeline() throws FilterException {
- flush();
- super.endPipeline();
+  /**
+   * Write the phenomena collected for the document that just ended on
+   * one line, separated by single blanks.
+   *
+   * @throws SAXException if the phenomena cannot be written.
+   */
+  public void endDocument() throws SAXException {
+    PhenomenaStream p = handler.getPhenomenaStream();
+    int length = p.getPhenomenaLength();
+    try {
+      for (int i = 0; i < length; i++) {
+        if (i > 0) {
+          textWriter.write(" ", 0, 1);
+        }
+        String phen = p.getPhenomenon(i);
+        textWriter.write(phen, 0, phen.length());
}
- private void flush() throws FilterException {
- try {
- textWriter.flush();
- } catch (IOException ioE) {
- throw new FilterException("Error while flushing \"" +
- textUri +
- "\": " +
- ioE.getMessage(),
- ioE
- );
- }
- }
-
- ///////////////////////////////
- // Event handlers
- ///////////////////////////////
-
- public void startElement(String uri, String localname, String qname, Attributes attributes)
- throws SAXException {
- if (!isInParagraph) {
- if (localname.equals(paragraphLocalName)) {
- isInParagraph = true;
- try {
- textWriter.newLine();
- } catch (IOException e) {
- throw new FilterException("Error while writing: " + e.getMessage(), e);
- }
- }
- }
-
- if (!isInToken) {
- if (localname.equals(tokenLocalName)) {
- isInToken = true;
- try {
- textWriter.write(" ", 0, 1);
- String lemma = attributes.getValue("", "lm");
- if (lemma != null) {
- textWriter.write(lemma, 0, lemma.length());
- }
- } catch (IOException e) {
- throw new FilterException("Error while writing: " + e.getMessage(), e);
- }
- }
- }
- super.startElement(uri, localname, qname, attributes);
- }
-
- public void endElement(String namespaceURI, String lName, String qName)
- throws SAXException {
- if (isInParagraph) {
- if (lName.equals(paragraphLocalName)) {
- isInParagraph = false;
- }
- }
- if (isInToken) {
- if (lName.equals(tokenLocalName)) {
- isInToken = false;
- }
- }
- super.endElement(namespaceURI, lName, qName);
- }
-
-// public void characters(char[] buf, int offset, int len)
-// throws SAXException {
-// if (isInToken) {
-// try {
-// textWriter.write(buf, offset, len);
-// } catch (IOException e) {
-// throw new FilterException("Error while writing: " + e.getMessage(), e);
-// }
-// }
-// super.characters(buf, offset, len);
-// }
-//
-// public void ignorableWhitespace(char[] ch, int start, int length)
-// throws SAXException {
-// characters(ch, start, length);
-// }
-
+      textWriter.newLine();
+    } catch (IOException e) {
+      throw new FilterException("Error while writing: " + e.getMessage(), e);
+    }
+    super.endDocument();
+  }
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|