[Corpusreader-svn] SF.net SVN: corpusreader:[215] trunk/corpusreader/src
Status: Alpha
Brought to you by:
sylvainloiseau
|
From: <syl...@us...> - 2009-08-28 20:47:50
|
Revision: 215
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=215&view=rev
Author: sylvainloiseau
Date: 2009-08-28 20:47:44 +0000 (Fri, 28 Aug 2009)
Log Message:
-----------
new filters
Modified Paths:
--------------
trunk/corpusreader/src/TODO.tei
trunk/corpusreader/src/main/java/tei/cr/Version.java
trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java
Added Paths:
-----------
trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java
trunk/corpusreader/src/main/schema/filters/TextFormat.rnc
Modified: trunk/corpusreader/src/TODO.tei
===================================================================
--- trunk/corpusreader/src/TODO.tei 2009-02-16 14:28:33 UTC (rev 214)
+++ trunk/corpusreader/src/TODO.tei 2009-08-28 20:47:44 UTC (rev 215)
@@ -28,6 +28,35 @@
</head>
<list>
+ --
+
+ Clarifier entre reference pattern et occurrence pattern
+ Mettre tous les types (localName, TEIName, etc.) et pas seulement "elxpath" avec les patterns qui prennent un argument dans reference pattern. Par exemple :
+ <use elxpath="date/@annee">
+ <match localname="date" />
+ </use>
+
+ Et pas seulement
+
+ <use elxpath="date/@annee">
+ <match elxpath="date" />
+ </use>
+
+ De même :
+
+ <valueOf localName="Taille" />
+
+ Plutôt que elxpath
+
+ Pareil dans SelectSubTrees : mettre localName
+
+ Dans selectSubTrees : l'argument test/@elxpath devrait être nommé test/@xpath
+
+ --
+
+
+
+
Mettre par défaut dans le répertoire de lancement le fichier de log
AbstractForkingBase :
Modified: trunk/corpusreader/src/main/java/tei/cr/Version.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/Version.java 2009-02-16 14:28:33 UTC (rev 214)
+++ trunk/corpusreader/src/main/java/tei/cr/Version.java 2009-08-28 20:47:44 UTC (rev 215)
@@ -11,7 +11,7 @@
// don't change this lines; regexp are used during build for updating the fields.
private static final String VERSION = "0.1"; // "8.2";
- private static final String RELEASE_DATE = "20081221-1138";
+ private static final String RELEASE_DATE = "20090826-1111";
private static final String PRODUCT_NAME = "CorpusReader";
private Version() {
Added: trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java (rev 0)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java 2009-08-28 20:47:44 UTC (rev 215)
@@ -0,0 +1,215 @@
+package tei.cr.filters;
+
+import java.io.BufferedWriter;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import tei.cr.pipeline.AbstractBase;
+import tei.cr.pipeline.FilterByNames;
+import tei.cr.pipeline.WrongArgsException;
+import tei.cr.querydoc.FilterArguments;
+import tei.cr.teiDocument.TeiDocument;
+
+/**
+ * TODO: to be rewritten with tei.cr.component.phenomenaStream
+ * <p>Convert into plain text, using w/@lm.</p>
+ *
+ * <p>Token local name and paragraph local name must be different.</p>
+ *
+ * @author Sylvain Loiseau <slo...@u-...>
+ * @version 0.1
+ */
+
+final public class TextFormat extends AbstractBase {
+
+    ///////////////////////////////
+    // Constants
+    ///////////////////////////////
+
+    /**
+     * Default local name of the elements treated as tokens.
+     */
+    private final static String DEFAULT_TOKEN_LOCAL_NAME = "w";
+
+    /**
+     * Default local name of the elements treated as paragraphs.
+     */
+    private final static String DEFAULT_PARAGRAPH_LOCAL_NAME = "p";
+
+    /**
+     * Name of the (namespace-less) token attribute whose value is written
+     * to the text file.
+     */
+    private final static String LEMMA_ATTRIBUTE_LOCAL_NAME = "lm";
+
+    /**
+     * Character encoding of the produced text file. Characters that cannot
+     * be represented in this encoding are replaced by the encoder.
+     */
+    private final static String OUTPUT_ENCODING = "ISO-8859-1";
+
+    ///////////////////////////////
+    // Fields
+    ///////////////////////////////
+
+    /**
+     * Writer on the output file; created in {@link #startPipeline()} and
+     * closed in {@link #endPipeline()}.
+     */
+    private BufferedWriter textWriter;
+
+    /**
+     * Location of the output file, as given in the filter arguments.
+     */
+    private String textUri = null;
+
+    /**
+     * The local name of the elements treated as tokens.
+     */
+    private String tokenLocalName = DEFAULT_TOKEN_LOCAL_NAME;
+
+    /**
+     * The local name of the elements treated as paragraphs; a line break
+     * is written each time one of them is opened.
+     */
+    private String paragraphLocalName = DEFAULT_PARAGRAPH_LOCAL_NAME;
+
+    /**
+     * <code>true</code> between the start and the end of a paragraph element.
+     */
+    private boolean isInParagraph = false;
+
+    /**
+     * <code>true</code> between the start and the end of a token element.
+     */
+    private boolean isInToken = false;
+
+    ///////////////////////////////
+    // Accessor
+    ///////////////////////////////
+
+    /**
+     * Set the local name of the elements treated as tokens.
+     *
+     * @param localName the local name of the token elements
+     * @throws WrongArgsException if the local name is null or empty.
+     */
+    public void setTokenLocalName(String localName)
+        throws WrongArgsException {
+        if ((localName == null) || localName.equals("")) {
+            throw new WrongArgsException("The token element local name cannot be null or empty.");
+        }
+        tokenLocalName = localName;
+    }
+
+    /**
+     * Set the local name of the elements treated as paragraphs.
+     *
+     * @param localName the local name of the paragraph elements
+     * @throws WrongArgsException if the local name is null or empty.
+     */
+    public void setParagraphLocalName(String localName)
+        throws WrongArgsException {
+        if ((localName == null) || localName.equals("")) {
+            throw new WrongArgsException("The paragraph element local name cannot be null or empty.");
+        }
+        // Fix: this setter used to overwrite tokenLocalName.
+        paragraphLocalName = localName;
+    }
+
+    ///////////////////////////////
+    // Pipelinable
+    ///////////////////////////////
+
+    /**
+     * Read the filter arguments: the optional token and paragraph local
+     * names (the defaults are kept when they are absent or empty) and the
+     * mandatory location of the output file.
+     *
+     * @param fA the arguments of this filter
+     * @param nH not used by this filter
+     * @param doc not used by this filter
+     * @throws WrongArgsException if the output file location is missing or empty.
+     */
+    public void setArguments(FilterArguments fA, FilterByNames nH, TeiDocument doc)
+        throws WrongArgsException {
+        // Fix: the emptiness tests were inverted, so a supplied name was
+        // never applied and an empty one was passed on to a setter that
+        // rejects it.
+        String w_localName = fA.getText(FilterArguments.TEXT_FORMAT_TOKEN_LOCAL_NAME);
+        if ((w_localName != null) && !w_localName.equals("")) {
+            setTokenLocalName(w_localName);
+        }
+        String p_localName = fA.getText(FilterArguments.TEXT_FORMAT_PARAGRAPH_LOCAL_NAME);
+        if ((p_localName != null) && !p_localName.equals("")) {
+            setParagraphLocalName(p_localName);
+        }
+        textUri = fA.getText(FilterArguments.TEXT_FORMAT_TEXT_URI);
+        if ((textUri == null) || textUri.equals("")) {
+            throw new WrongArgsException("Text url cannot be null.");
+        }
+    }
+
+    /**
+     * Create the output file and open the writer on it.
+     *
+     * @throws FilterException if the file cannot be created.
+     */
+    public void startPipeline() throws FilterException {
+        try {
+            textWriter = new BufferedWriter(
+                new OutputStreamWriter(
+                    new FileOutputStream(textUri),
+                    OUTPUT_ENCODING
+                )
+            );
+        } catch (IOException e) {
+            throw new FilterException("Error while creating the file: " +
+                                      e.getMessage(),
+                                      e
+            );
+        }
+        super.startPipeline();
+    }
+
+    /**
+     * Flush and close the output file, then propagate the event.
+     *
+     * @throws FilterException if the pending output cannot be written.
+     */
+    public void endPipeline() throws FilterException {
+        closeWriter();
+        super.endPipeline();
+    }
+
+    /**
+     * Flush the pending output and release the file handle. Does nothing
+     * when the writer was never opened.
+     *
+     * @throws FilterException if flushing or closing fails.
+     */
+    private void closeWriter() throws FilterException {
+        if (textWriter == null) {
+            return;
+        }
+        try {
+            // close() flushes the buffer before releasing the stream; the
+            // writer was previously only flushed and never closed.
+            textWriter.close();
+        } catch (IOException ioE) {
+            throw new FilterException("Error while flushing \"" +
+                                      textUri +
+                                      "\": " +
+                                      ioE.getMessage(),
+                                      ioE
+            );
+        }
+    }
+
+    ///////////////////////////////
+    // Event handlers
+    ///////////////////////////////
+
+    /**
+     * Write a line break when a paragraph element is opened, and a space
+     * followed by the value of the <code>lm</code> attribute (when present)
+     * when a token element is opened. The event is then propagated unchanged.
+     * Paragraphs (resp. tokens) nested inside an already open paragraph
+     * (resp. token) produce no output.
+     *
+     * @throws SAXException if writing to the output file fails.
+     */
+    public void startElement(String uri, String localname, String qname, Attributes attributes)
+        throws SAXException {
+        if (!isInParagraph && localname.equals(paragraphLocalName)) {
+            isInParagraph = true;
+            try {
+                textWriter.newLine();
+            } catch (IOException e) {
+                throw new FilterException("Error while writing: " + e.getMessage(), e);
+            }
+        }
+
+        if (!isInToken && localname.equals(tokenLocalName)) {
+            isInToken = true;
+            try {
+                // The separator is written even when the token has no lemma.
+                textWriter.write(" ", 0, 1);
+                String lemma = attributes.getValue("", LEMMA_ATTRIBUTE_LOCAL_NAME);
+                if (lemma != null) {
+                    textWriter.write(lemma, 0, lemma.length());
+                }
+            } catch (IOException e) {
+                throw new FilterException("Error while writing: " + e.getMessage(), e);
+            }
+        }
+        super.startElement(uri, localname, qname, attributes);
+    }
+
+    /**
+     * Leave the paragraph or token state when the corresponding element
+     * is closed, then propagate the event unchanged.
+     */
+    public void endElement(String namespaceURI, String lName, String qName)
+        throws SAXException {
+        if (isInParagraph && lName.equals(paragraphLocalName)) {
+            isInParagraph = false;
+        }
+        if (isInToken && lName.equals(tokenLocalName)) {
+            isInToken = false;
+        }
+        super.endElement(namespaceURI, lName, qName);
+    }
+
+    // NOTE(review): the handlers below would copy the text content of the
+    // tokens to the file; they were already disabled in the original commit,
+    // presumably because only the lemma is wanted -- confirm before deleting.
+//     public void characters(char[] buf, int offset, int len)
+//     throws SAXException {
+//     if (isInToken) {
+//     try {
+//     textWriter.write(buf, offset, len);
+//     } catch (IOException e) {
+//     throw new FilterException("Error while writing: " + e.getMessage(), e);
+//     }
+//     }
+//     super.characters(buf, offset, len);
+//     }
+//
+//     public void ignorableWhitespace(char[] ch, int start, int length)
+//     throws SAXException {
+//     characters(ch, start, length);
+//     }
+
+}
Modified: trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java 2009-02-16 14:28:33 UTC (rev 214)
+++ trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java 2009-08-28 20:47:44 UTC (rev 215)
@@ -653,4 +653,9 @@
// ArchiveProducer
public final static String ARCHIVE_PRODUCER_TYPE = "archive/@type";
+ // TextFormat
+ public final static String TEXT_FORMAT_TOKEN_LOCAL_NAME = "token/@localName";
+ public final static String TEXT_FORMAT_PARAGRAPH_LOCAL_NAME = "paragraph/@localName";
+
+ public final static String TEXT_FORMAT_TEXT_URI = "textFile/@url";
}
Added: trunk/corpusreader/src/main/schema/filters/TextFormat.rnc
===================================================================
--- trunk/corpusreader/src/main/schema/filters/TextFormat.rnc (rev 0)
+++ trunk/corpusreader/src/main/schema/filters/TextFormat.rnc 2009-08-28 20:47:44 UTC (rev 215)
@@ -0,0 +1,15 @@
+datatypes xs = "http://www.w3.org/2001/XMLSchema-datatypes" # NOTE(review): the xs prefix is declared but not referenced below
+
+start=TextFormat
+
+TextFormat = element args { # arguments of the TextFormat filter
+ element token { # optional: local name of the token elements
+ attribute localName {string}
+ }?,
+ element paragraph { # optional: local name of the paragraph elements
+ attribute localName {string}
+ }?,
+ element textFile { # mandatory: location of the text file to produce
+ attribute url {string}
+ }
+}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|