corpusreader-svn Mailing List for CorpusReader
Status: Alpha
Brought to you by:
sylvainloiseau
You can subscribe to this list here.
| 2008 | Jan | Feb | Mar | Apr | May (35) | Jun (18) | Jul (8) | Aug (4) | Sep (26) | Oct (38) | Nov (25) | Dec (7) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2009 | Jan (1) | Feb (2) | Mar | Apr | May | Jun | Jul | Aug (1) | Sep (11) | Oct (1) | Nov (1) | Dec |
| 2010 | Jan (2) | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec |
|
From: <syl...@us...> - 2010-01-23 15:35:40
|
Revision: 232
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=232&view=rev
Author: sylvainloiseau
Date: 2010-01-23 15:35:33 +0000 (Sat, 23 Jan 2010)
Log Message:
-----------
lib/ contains the CR.jar library.
Added Paths:
-----------
trunk/corpusreader/src/main/R/corpusreader/lib/
trunk/corpusreader/src/main/R/corpusreader/man/
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2010-01-23 10:33:35
|
Revision: 230
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=230&view=rev
Author: sylvainloiseau
Date: 2010-01-23 10:33:28 +0000 (Sat, 23 Jan 2010)
Log Message:
-----------
local modif
Modified Paths:
--------------
trunk/corpusreader/src/main/R/visu-chi2.r
Modified: trunk/corpusreader/src/main/R/visu-chi2.r
===================================================================
--- trunk/corpusreader/src/main/R/visu-chi2.r 2009-11-05 15:56:58 UTC (rev 229)
+++ trunk/corpusreader/src/main/R/visu-chi2.r 2010-01-23 10:33:28 UTC (rev 230)
@@ -39,7 +39,7 @@
## ----------
# mdat <- matrix(c(1,2,3,11,12,13), nrow = 2, ncol=3, byrow=TRUE, dimnames=list(c("R1", "R2"), c("C1", "C2", "C3")));
# mdat <- matrix(c(2,58,18,22), nrow = 2, ncol=2, byrow=TRUE, dimnames=list(c("bleu", "vert"), c("blond", "brun")));
-# mdat <- matrix(c(5,55,15,25,23,13), nrow = 2, ncol=3, byrow=TRUE, dimnames=list(c("partie 1", "partie 2"), c("b\xE9b\xE9", "gentil", "agit\xE9")));
+# mdat <- matrix(c(5,55,15,25,23,13), nrow = 2, ncol=3, byrow=TRUE, dimnames=list(c("partie 1", "partie 2"), c("bébé", "gentil", "agité")));
# or :
# mdat <- matrix(c(7,5,20,16,40,33), nrow = 3, ncol=2, byrow=TRUE, dimnames=list(c("DC", "NREA", "REA"), c("BAS", "CES")));
# plotTwoCategorialVariables(mdat);
@@ -64,14 +64,14 @@
## TODO
## - autres graphiques
-## * les sous-fr\xE9quences attendues dans tout le tableau
-## * les sous-fr\xE9quences attendues plus les sous-fr\xE9quences observ\xE9es dans tout le tableau
-## * le total d'une variable dans un carr\xE9
-#* Le total d'une variable et les fr\xE9quences attendues/observ\xE9es de cette variable dans une modalit\xE9 de l'autre variable.
+## * les sous-fréquences attendues dans tout le tableau
+## * les sous-fréquences attendues plus les sous-fréquences observées dans tout le tableau
+## * le total d'une variable dans un carré
+#* Le total d'une variable et les fréquences attendues/observées de cette variable dans une modalité de l'autre variable.
## * des marques du calcul
-## - rendre optionnel l'affichage des valeurs num\xE9riques
+## - rendre optionnel l'affichage des valeurs numériques
## - vectoriser pour rect
-## - remonter le calcul des coordonn\xE9s
+## - remonter le calcul des coordonnés
## - utiliser "grid"
plotTwoCategorialVariables <- function(m, usedim1 = TRUE, margin = 0.05, obswidth=2, showObserved=TRUE, showLabel=TRUE, squarecolor="green", squarecolorcel="yellow", threshold=FALSE) {
@@ -123,19 +123,19 @@
# op <- par(las = 1, lend="square");
plot(c(0, 1), c(0, 1), type="n", ann=FALSE, axes=FALSE);
#box(which="plot", lty = '1373', col = 'red');
- mtext("D\xE9pendance entre deux variables", side=3, line=3);
+ mtext("Dépendance entre deux variables", side=3, line=3);
## Make the axis
## -------------------------------
axis(2, at=cellVerticalCoordinates[2:rowNbr]+cellHeight/2, labels=attr(m, "dimnames")[[1]]);
- mtext("Modalit\xE9s de la premi\xE8re variable", side=2, adj=1, line=2);
- mtext("Total 2nd variable", side=2, adj=0, line=2);
+ mtext("Modalités de la première variable", side=2, adj=1, line=2);
+ mtext("Total 2nd variable", side=2, adj=0, line=1);
axis(3, at=cellHorizontalCoordinates[1:(colNbr-1)]+cellWidth/2, labels=attr(m, "dimnames")[[2]]);
- mtext("Modalit\xE9s de la seconde variable", side=3, adj=0, line=2);
- mtext("Total 1\xE8re variable", side=3, adj=1, line=2);
+ mtext("Modalités de la seconde variable", side=3, adj=0, line=2);
+ mtext("Total 1ère variable", side=3, adj=1, line=1);
- ##\xA0draw the frame
+ ## draw the frame
## -------------------------------
for(i in 1:(rowNbr-1)) {
lines(c(0,1), rep(i*cellHeight, 2));
@@ -145,10 +145,10 @@
lines(rep(i*cellWidth, 2), c(0,1));
}
- ##\xA0draw the boxes
+ ## draw the boxes
## -------------------------------
- #\xA0the coordinate of each corner of the boxes.
+ # the coordinate of each corner of the boxes.
# for each modalities of each variable (i.e. each box), there is *four*
# corners, and each corner is defined by *two* coordinates.
boxCoordinates <- array(dim=c(rowNbr,colNbr, 4, 2));
@@ -165,7 +165,7 @@
}
}
- ##\xA0draw lines from corner 1 to corner 2 and three and from corner 2 and 3 to
+ ## draw lines from corner 1 to corner 2 and three and from corner 2 and 3 to
## corner 4 for each box.
for (i in 1:rowNbr) {
for (j in 1:colNbr) {
@@ -266,7 +266,7 @@
# add the text
if (showLabel) {
- text(j*cellWidth + margin, (i+1)*cellHeight - margin, labels=paste(" th\xE9orique :\n", expected[i, j+1], "%\n", "observ\xE9 :\n", actual[i, j + 1], "%"), adj=c(0,1), cex=.8);
+ text(j*cellWidth + margin, (i+1)*cellHeight - margin, labels=paste(" thÈorique :\n", expected[i, j+1], "%\n", "observÈ :\n", actual[i, j + 1], "%"), adj=c(0,1), cex=.8);
}
# add the expected value rect
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2009-11-05 15:57:10
|
Revision: 229
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=229&view=rev
Author: sylvainloiseau
Date: 2009-11-05 15:56:58 +0000 (Thu, 05 Nov 2009)
Log Message:
-----------
creating plain text file
Modified Paths:
--------------
trunk/corpusreader/src/main/java/tei/cr/filters/ExtractFrequencyList.java
trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java
Modified: trunk/corpusreader/src/main/java/tei/cr/filters/ExtractFrequencyList.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/ExtractFrequencyList.java 2009-10-02 15:29:00 UTC (rev 228)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/ExtractFrequencyList.java 2009-11-05 15:56:58 UTC (rev 229)
@@ -6,6 +6,7 @@
import org.xml.sax.SAXException;
import tei.cr.component.frequencyList.FrequencyList;
import tei.cr.component.frequencyList.FrequencyListImpl;
+import tei.cr.component.frequencyList.FrequencyListWithPosition;
import tei.cr.filters.ProvidingFrequencyList;
import tei.cr.filters.ExtractLocation;
import tei.cr.pipeline.AbstractBase;
@@ -492,8 +493,8 @@
}
FrequencyList returnedFL = fList;
- //fList = new FrequencyListWithPosition();
- fList = new FrequencyListImpl();
+ fList = new FrequencyListWithPosition();
+ //fList = new FrequencyListImpl();
return returnedFL;
}
@@ -544,8 +545,8 @@
// else
// TODO two places where lexicon is created.
//fList = new FrequencyListImpl();
- //fList = new FrequencyListWithPosition();
- fList = new FrequencyListImpl();
+ fList = new FrequencyListWithPosition();
+ //fList = new FrequencyListImpl();
super.startPipeline();
}
Modified: trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java 2009-10-02 15:29:00 UTC (rev 228)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java 2009-11-05 15:56:58 UTC (rev 229)
@@ -107,6 +107,7 @@
for (int i = 0; i < length; i++) {
String phen = p.getPhenomenon(i).toString();
textWriter.write(phen, 0, phen.length());
+ textWriter.write(" ", 0, 1);
}
textWriter.newLine();
} catch (IOException e) {
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2009-10-02 15:39:03
|
Revision: 228
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=228&view=rev
Author: sylvainloiseau
Date: 2009-10-02 15:29:00 +0000 (Fri, 02 Oct 2009)
Log Message:
-----------
debugging
Modified Paths:
--------------
trunk/corpusreader/src/main/java/tei/cr/filters/ExtractPhenomena.java
trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java
trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java
trunk/corpusreader/src/main/resources/schema/filters/ExtractPhenomena.rnc
trunk/corpusreader/src/main/resources/schema/filters/TextFormat.rnc
Modified: trunk/corpusreader/src/main/java/tei/cr/filters/ExtractPhenomena.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/ExtractPhenomena.java 2009-09-24 13:31:17 UTC (rev 227)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/ExtractPhenomena.java 2009-10-02 15:29:00 UTC (rev 228)
@@ -82,7 +82,7 @@
}
public PhenomenaStream getPhenomenaStream() {
- PhenomenaStream p = tokensStreamBuilder.getPhenomenaStrem();
+ PhenomenaStream p = tokensStreamBuilder.getPhenomenaStream();
tokensStreamBuilder = new PhenomenaStreamBuilder();
return p;
}
Modified: trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java 2009-09-24 13:31:17 UTC (rev 227)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java 2009-10-02 15:29:00 UTC (rev 228)
@@ -1,4 +1,4 @@
-ptextWriter.wackage tei.cr.filters;
+package tei.cr.filters;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
@@ -43,20 +43,20 @@
public void setArguments(FilterArguments fA, FilterByNames nH, TeiDocument doc)
throws WrongArgsException {
- String phenomena_filter = fA.getText(FilterArguments.TEXT_FORMAT_PROVIDE_PHENOMENA);
+ String phenomena_filter = fA.getText(FilterArguments.TEXT_FORMAT_PROVIDE_PHENOMENA);
- handler =
- (ProvidingPhenomenaStream) nH.get(phenomena_filter);
- if (handler == null) {
- throw new WrongArgsException("No filter of name \""
- + lexiconHandlerName + "\"found in the pipeline");
- }
+ handler =
+ (ProvidingPhenomenaStream) nH.get(phenomena_filter);
+ if (handler == null) {
+ throw new WrongArgsException("No filter of name \""
+ + phenomena_filter + "\"found in the pipeline");
+ }
- textUri = fA.getText(FilterArguments.TEXT_FORMAT_TEXT_URI);
- if ((textUri == null) || textUri.equals("")) {
- throw new WrongArgsException("Text url cannot be null.");
- }
+ textUri = fA.getText(FilterArguments.TEXT_FORMAT_TEXT_URI);
+ if ((textUri == null) || textUri.equals("")) {
+ throw new WrongArgsException("Text url cannot be null.");
}
+ }
/**
* Create the file.
@@ -100,15 +100,18 @@
// Event handlers
///////////////////////////////
- public void endDocument throws SAXException {
+ public void endDocument() throws SAXException {
PhenomenaStream p = handler.getPhenomenaStream();
int length = p.getPhenomenaLength();
- for (int i = 0; i < length; i++) {
- String phen = p.getPhenomenon(i);
- textWriter.write(phen, 0, phen.length());
+ try {
+ for (int i = 0; i < length; i++) {
+ String phen = p.getPhenomenon(i).toString();
+ textWriter.write(phen, 0, phen.length());
+ }
+ textWriter.newLine();
+ } catch (IOException e) {
+ throw new FilterException("Unable to write in file: " + e.getMessage(), e);
}
-
- textWriter.newLine();
super.endDocument();
}
}
Modified: trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java 2009-09-24 13:31:17 UTC (rev 227)
+++ trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java 2009-10-02 15:29:00 UTC (rev 228)
@@ -668,8 +668,6 @@
public final static String ARCHIVE_PRODUCER_TYPE = "archive/@type";
// TextFormat
- public final static String TEXT_FORMAT_TOKEN_LOCAL_NAME = "token/@localName";
- public final static String TEXT_FORMAT_PARAGRAPH_LOCAL_NAME = "paragraph/@localName";
-
+ public final static String TEXT_FORMAT_PROVIDE_PHENOMENA = "phenomena/@filter";
public final static String TEXT_FORMAT_TEXT_URI = "textFile/@url";
}
Modified: trunk/corpusreader/src/main/resources/schema/filters/ExtractPhenomena.rnc
===================================================================
--- trunk/corpusreader/src/main/resources/schema/filters/ExtractPhenomena.rnc 2009-09-24 13:31:17 UTC (rev 227)
+++ trunk/corpusreader/src/main/resources/schema/filters/ExtractPhenomena.rnc 2009-10-02 15:29:00 UTC (rev 228)
@@ -3,38 +3,30 @@
start=ExtractPhenomena
ExtractPhenomena = element args {
-
- Occurrences
- ,
- element gram {
- attribute n { xs:int },
- empty
- }
-}
-
-Occurrences = element occurrences {
- (
- element elementName { empty }
- |
- element AttributeName { empty }
- |
- element elementStringValue {
+ element occurrences {
+ (
+ element elementName { empty }
+ |
+ element AttributeName { empty }
+ |
+ element elementStringValue {
attribute localName { string },
empty
- }
- |
- element use {
+ }
+ |
+ element use {
attribute elxpath { string },
element match {
- attribute elxpath { string }
+ attribute elxpath { string }
}?
- }
- |
- element evaluate {
+ }
+ |
+ element evaluate {
attribute xpath { string },
element root {
- attribute elxpath { string }
+ attribute elxpath { string }
}?
- }
- )+
+ }
+ )+
+ }
}
Modified: trunk/corpusreader/src/main/resources/schema/filters/TextFormat.rnc
===================================================================
--- trunk/corpusreader/src/main/resources/schema/filters/TextFormat.rnc 2009-09-24 13:31:17 UTC (rev 227)
+++ trunk/corpusreader/src/main/resources/schema/filters/TextFormat.rnc 2009-10-02 15:29:00 UTC (rev 228)
@@ -4,9 +4,11 @@
TextFormat = element args {
element phenomena {
- attribute filter {string}
+ attribute filter {string},
+ empty
}?,
element textFile {
- attribute url {string}
+ attribute url {string},
+ empty
}
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2009-09-24 13:31:46
|
Revision: 227
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=227&view=rev
Author: sylvainloiseau
Date: 2009-09-24 13:31:17 +0000 (Thu, 24 Sep 2009)
Log Message:
-----------
New filter
Modified Paths:
--------------
trunk/corpusreader/src/main/java/tei/cr/filters/ExtractNGram.java
trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java
Added Paths:
-----------
trunk/corpusreader/src/main/java/tei/cr/filters/ExtractPhenomena.java
trunk/corpusreader/src/main/java/tei/cr/filters/ProvidingPhenomenaStream.java
Modified: trunk/corpusreader/src/main/java/tei/cr/filters/ExtractNGram.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/ExtractNGram.java 2009-09-24 13:31:08 UTC (rev 226)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/ExtractNGram.java 2009-09-24 13:31:17 UTC (rev 227)
@@ -69,7 +69,7 @@
public void startPipeline() throws FilterException {
if (patternsBuilder.isEmpty()) {
throw new FilterException(
- "No locator pattern provided."
+ "No PhenomenaStream pattern provided."
);
}
patterns =
@@ -79,7 +79,7 @@
super.addContentHandlerBefore((ContentHandler)patterns[i]);
}
log.info(
- "Number of TeiLocatorPattern registered: "
+ "Number of PhenomenaStream Pattern registered: "
+ patterns.length
);
super.startPipeline();
Added: trunk/corpusreader/src/main/java/tei/cr/filters/ExtractPhenomena.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/ExtractPhenomena.java (rev 0)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/ExtractPhenomena.java 2009-09-24 13:31:17 UTC (rev 227)
@@ -0,0 +1,90 @@
+package tei.cr.filters;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import tei.cr.pipeline.AbstractForkingBase;
+import tei.cr.pipeline.FilterByNames;
+import tei.cr.pipeline.WrongArgsException;
+import tei.cr.querydoc.FilterArguments;
+import tei.cr.querydoc.ParseOccurrencePattern;
+import tei.cr.teiDocument.TeiDocument;
+import tei.cr.utils.sax.occurrence.OccurrencePattern;
+import tei.cr.component.ngram.NGramStreamBuilder;
+import tei.cr.component.frequencyList.FrequencyList;
+import tei.cr.component.phenomenaStream.PhenomenaStream;
+import tei.cr.component.phenomenaStream.PhenomenaStreamBuilder;
+
+final public class ExtractPhenomena
+ extends AbstractForkingBase
+ implements ReceiveOccurrence, ProvidingPhenomenaStream {
+
+ private final List<OccurrencePattern> patternsBuilder = new ArrayList<OccurrencePattern>();
+ private OccurrencePattern[] patterns;
+ private final Logger log = Logger.getLogger(getClass().getName());
+ private PhenomenaStreamBuilder tokensStreamBuilder = new PhenomenaStreamBuilder();
+
+ public void addPattern(OccurrencePattern pattern) {
+ patternsBuilder.add(pattern);
+ }
+
+ /**
+ * Receive arguments from a query doc.
+ */
+ public void setArguments(FilterArguments fA, FilterByNames nH,
+ TeiDocument doc)
+ throws WrongArgsException {
+ NodeList list = fA.getNodeList("occurrences/*");
+ if (list.getLength() == 0) {
+ throw new WrongArgsException(
+ "No occurrences provided."
+ );
+ }
+
+ for (int i = 0; i < list.getLength(); i++) {
+ Node element = list.item(i);
+ if (element.getNodeType() != Node.ELEMENT_NODE) {
+ continue;
+ }
+ OccurrencePattern pattern =
+ ParseOccurrencePattern.getOccurrencePattern(((ReceiveOccurrence)this), element, fA);
+ addPattern(pattern);
+ }
+ super.setArguments(fA, nH, doc);
+
+ }
+
+ public void startPipeline() throws FilterException {
+ if (patternsBuilder.isEmpty()) {
+ throw new FilterException(
+ "No PhenomenaStream pattern provided."
+ );
+ }
+ patterns =
+ (OccurrencePattern[])
+ patternsBuilder.toArray(new OccurrencePattern[]{});
+ for (int i = 0; i < patterns.length; i++) {
+ super.addContentHandlerBefore((ContentHandler)patterns[i]);
+ }
+ log.info(
+ "Number of PhenomenaStream Pattern registered: "
+ + patterns.length
+ );
+ super.startPipeline();
+ }
+
+ public void occurrence(Object phenomenon) {
+ tokensStreamBuilder.addPhenomenon(phenomenon);
+ }
+
+ public PhenomenaStream getPhenomenaStream() {
+ PhenomenaStream p = tokensStreamBuilder.getPhenomenaStrem();
+ tokensStreamBuilder = new PhenomenaStreamBuilder();
+ return p;
+ }
+
+}
Added: trunk/corpusreader/src/main/java/tei/cr/filters/ProvidingPhenomenaStream.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/ProvidingPhenomenaStream.java (rev 0)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/ProvidingPhenomenaStream.java 2009-09-24 13:31:17 UTC (rev 227)
@@ -0,0 +1,29 @@
+package tei.cr.filters;
+
+import tei.cr.component.phenomenaStream.PhenomenaStream;
+
+/**
+ * A filter able to provide a {@link FrequencyList}.
+ *
+ * @version 0.1
+ *
+ * @author Sylvain Loiseau <syl...@u-...>
+ *
+ * @see tei.cr.filters.ExtractFrequencyList
+ *
+ * @see tei.cr.filters.WriteFrequencyList
+ */
+public interface ProvidingPhenomenaStream {
+
+ /**
+ * <p>Return the lexicon extracted by the filter.</p>
+ *
+ * <p>The contract of this method is that a different
+ * {@link FrequencyList} instance is return at each
+ * different call to this method.</p>
+ *
+ * @return FrequencyList a lexicon containing the data
+ * received since the preceding call to this method.
+ */
+ public PhenomenaStream getPhenomenaStream();
+}
Modified: trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java 2009-09-24 13:31:08 UTC (rev 226)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java 2009-09-24 13:31:17 UTC (rev 227)
@@ -1,4 +1,4 @@
-package tei.cr.filters;
+ptextWriter.wackage tei.cr.filters;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
@@ -13,203 +13,102 @@
import tei.cr.pipeline.WrongArgsException;
import tei.cr.querydoc.FilterArguments;
import tei.cr.teiDocument.TeiDocument;
+import tei.cr.component.phenomenaStream.PhenomenaStream;
/**
- * TODO : to be rewrite with tei.cr.component.phenomenaStream
- * <p>Convert into plain text, using w/@lm.</p>
+ * Convert a stream of phenomena provided by a
+ * ProvidingPhenomeaStream filter into a text file. Each
+ * token is blank-separated, and each new document
+ * produce a carriage return.
*
- * <p>Token local name and paragrahp local name must be different.</p>
- *
* @author Sylvain Loiseau <slo...@u-...>
* @version 0.1
*/
final public class TextFormat extends AbstractBase {
- ///////////////////////////////
- // Fields
- ///////////////////////////////
+ ///////////////////////////////
+ // Fields
+ ///////////////////////////////
- private BufferedWriter textWriter;
+ private BufferedWriter textWriter;
- private String textUri = null;
+ private String textUri = null;
- /**
- * The name of the element surrounding the tokens
- */
- private String tokenLocalName = DEFAULT_TOKEN_LOCAL_NAME;
-
- /**
- * Default name of the element surrounding the tokens
- */
- private final static String DEFAULT_TOKEN_LOCAL_NAME = "w";
+ private ProvidingPhenomenaStream handler;
- /**
- * The name of the element surrounding the paragraph, separated with \n
- */
- private String paragraphLocalName = DEFAULT_PARAGRAPH_LOCAL_NAME;
-
- /**
- * Default name of the element surrounding the tokens
- */
- private final static String DEFAULT_PARAGRAPH_LOCAL_NAME = "p";
+ ///////////////////////////////
+ // Pipelinable
+ ///////////////////////////////
- private boolean isInParagraph = false;
+ public void setArguments(FilterArguments fA, FilterByNames nH, TeiDocument doc)
+ throws WrongArgsException {
+ String phenomena_filter = fA.getText(FilterArguments.TEXT_FORMAT_PROVIDE_PHENOMENA);
- private boolean isInToken = false;
+ handler =
+ (ProvidingPhenomenaStream) nH.get(phenomena_filter);
+ if (handler == null) {
+ throw new WrongArgsException("No filter of name \""
+ + lexiconHandlerName + "\"found in the pipeline");
+ }
- ///////////////////////////////
- // Accessor
- ///////////////////////////////
-
- /**
- * @param localName the local name to be used for
- * annotating the token
- * @throws WrongArgsException if the local name is null or empty.
- */
- public void setTokenLocalName(String localName)
- throws WrongArgsException {
- if ((localName == null) || localName.equals("")) {
- throw new WrongArgsException("The token element local name cannot be null or empty.");
- }
- tokenLocalName = localName;
+ textUri = fA.getText(FilterArguments.TEXT_FORMAT_TEXT_URI);
+ if ((textUri == null) || textUri.equals("")) {
+ throw new WrongArgsException("Text url cannot be null.");
+ }
}
- /**
- * @param localName the local name to be used for
- * annotating the token
- * @throws WrongArgsException if the local name is null or empty.
- */
- public void setParagraphLocalName(String localName)
- throws WrongArgsException {
- if ((localName == null) || localName.equals("")) {
- throw new WrongArgsException("The token element local name cannot be null or empty.");
- }
- tokenLocalName = localName;
+ /**
+ * Create the file.
+ */
+ public void startPipeline() throws FilterException {
+ try {
+ textWriter = new BufferedWriter(
+ new OutputStreamWriter(
+ new FileOutputStream(textUri),
+ "ISO-8859-1"
+ )
+ );
+ } catch (IOException e) {
+ throw new FilterException("Error while creating the file: " +
+ e.getMessage(),
+ e
+ );
}
+ super.startPipeline();
+ }
- ///////////////////////////////
- // Pipelinable
- ///////////////////////////////
+ public void endPipeline() throws FilterException {
+ flush();
+ super.endPipeline();
+ }
- public void setArguments(FilterArguments fA, FilterByNames nH, TeiDocument doc)
- throws WrongArgsException {
- String w_localName = fA.getText(FilterArguments.TEXT_FORMAT_TOKEN_LOCAL_NAME);
- if ((w_localName != null) && w_localName.equals("")) {
- setTokenLocalName(w_localName);
- }
- String p_localName = fA.getText(FilterArguments.TEXT_FORMAT_PARAGRAPH_LOCAL_NAME);
- if ((p_localName != null) && p_localName.equals("")) {
- setParagraphLocalName(p_localName);
- }
- textUri = fA.getText(FilterArguments.TEXT_FORMAT_TEXT_URI);
- if ((textUri == null) || textUri.equals("")) {
- throw new WrongArgsException("Text url cannot be null.");
- }
+ private void flush() throws FilterException {
+ try {
+ textWriter.flush();
+ } catch (IOException ioE) {
+ throw new FilterException("Error while flushing \"" +
+ textUri +
+ "\": " +
+ ioE.getMessage(),
+ ioE
+ );
}
+ }
- /**
- * Create the file.
- */
- public void startPipeline() throws FilterException {
- try {
- textWriter = new BufferedWriter(
- new OutputStreamWriter(
- new FileOutputStream(textUri),
- "ISO-8859-1"
- )
- );
- } catch (IOException e) {
- throw new FilterException("Error while creating the file: " +
- e.getMessage(),
- e
- );
- }
- super.startPipeline();
- }
+ ///////////////////////////////
+ // Event handlers
+ ///////////////////////////////
- public void endPipeline() throws FilterException {
- flush();
- super.endPipeline();
+ public void endDocument throws SAXException {
+ PhenomenaStream p = handler.getPhenomenaStream();
+ int length = p.getPhenomenaLength();
+ for (int i = 0; i < length; i++) {
+ String phen = p.getPhenomenon(i);
+ textWriter.write(phen, 0, phen.length());
}
- private void flush() throws FilterException {
- try {
- textWriter.flush();
- } catch (IOException ioE) {
- throw new FilterException("Error while flushing \"" +
- textUri +
- "\": " +
- ioE.getMessage(),
- ioE
- );
- }
- }
-
- ///////////////////////////////
- // Event handlers
- ///////////////////////////////
-
- public void startElement(String uri, String localname, String qname, Attributes attributes)
- throws SAXException {
- if (!isInParagraph) {
- if (localname.equals(paragraphLocalName)) {
- isInParagraph = true;
- try {
- textWriter.newLine();
- } catch (IOException e) {
- throw new FilterException("Error while writing: " + e.getMessage(), e);
- }
- }
- }
-
- if (!isInToken) {
- if (localname.equals(tokenLocalName)) {
- isInToken = true;
- try {
- textWriter.write(" ", 0, 1);
- String lemma = attributes.getValue("", "lm");
- if (lemma != null) {
- textWriter.write(lemma, 0, lemma.length());
- }
- } catch (IOException e) {
- throw new FilterException("Error while writing: " + e.getMessage(), e);
- }
- }
- }
- super.startElement(uri, localname, qname, attributes);
- }
-
- public void endElement(String namespaceURI, String lName, String qName)
- throws SAXException {
- if (isInParagraph) {
- if (lName.equals(paragraphLocalName)) {
- isInParagraph = false;
- }
- }
- if (isInToken) {
- if (lName.equals(tokenLocalName)) {
- isInToken = false;
- }
- }
- super.endElement(namespaceURI, lName, qName);
- }
-
-// public void characters(char[] buf, int offset, int len)
-// throws SAXException {
-// if (isInToken) {
-// try {
-// textWriter.write(buf, offset, len);
-// } catch (IOException e) {
-// throw new FilterException("Error while writing: " + e.getMessage(), e);
-// }
-// }
-// super.characters(buf, offset, len);
-// }
-//
-// public void ignorableWhitespace(char[] ch, int start, int length)
-// throws SAXException {
-// characters(ch, start, length);
-// }
-
+ textWriter.newLine();
+ super.endDocument();
+ }
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2009-09-24 13:31:36
|
Revision: 226
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=226&view=rev
Author: sylvainloiseau
Date: 2009-09-24 13:31:08 +0000 (Thu, 24 Sep 2009)
Log Message:
-----------
New filters
Modified Paths:
--------------
trunk/corpusreader/src/main/resources/schema/filters/TextFormat.rnc
Added Paths:
-----------
trunk/corpusreader/src/main/resources/schema/filters/ExtractPhenomena.rnc
Added: trunk/corpusreader/src/main/resources/schema/filters/ExtractPhenomena.rnc
===================================================================
--- trunk/corpusreader/src/main/resources/schema/filters/ExtractPhenomena.rnc (rev 0)
+++ trunk/corpusreader/src/main/resources/schema/filters/ExtractPhenomena.rnc 2009-09-24 13:31:08 UTC (rev 226)
@@ -0,0 +1,40 @@
+datatypes xs = "http://www.w3.org/2001/XMLSchema-datatypes"
+
+start=ExtractPhenomena
+
+ExtractPhenomena = element args {
+
+ Occurrences
+ ,
+ element gram {
+ attribute n { xs:int },
+ empty
+ }
+}
+
+Occurrences = element occurrences {
+ (
+ element elementName { empty }
+ |
+ element AttributeName { empty }
+ |
+ element elementStringValue {
+ attribute localName { string },
+ empty
+ }
+ |
+ element use {
+ attribute elxpath { string },
+ element match {
+ attribute elxpath { string }
+ }?
+ }
+ |
+ element evaluate {
+ attribute xpath { string },
+ element root {
+ attribute elxpath { string }
+ }?
+ }
+ )+
+}
Modified: trunk/corpusreader/src/main/resources/schema/filters/TextFormat.rnc
===================================================================
--- trunk/corpusreader/src/main/resources/schema/filters/TextFormat.rnc 2009-09-12 18:49:58 UTC (rev 225)
+++ trunk/corpusreader/src/main/resources/schema/filters/TextFormat.rnc 2009-09-24 13:31:08 UTC (rev 226)
@@ -3,12 +3,9 @@
start=TextFormat
TextFormat = element args {
- element token {
- attribute localName {string}
+ element phenomena {
+ attribute filter {string}
}?,
- element paragraph {
- attribute localName {string}
- }?,
element textFile {
attribute url {string}
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2009-09-12 18:50:06
|
Revision: 225
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=225&view=rev
Author: sylvainloiseau
Date: 2009-09-12 18:49:58 +0000 (Sat, 12 Sep 2009)
Log Message:
-----------
Adding option to UpdateDatabase filter
Modified Paths:
--------------
trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java
trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java
trunk/corpusreader/src/main/resources/schema/filters/UpdateDatabase.rnc
Modified: trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java 2009-09-12 17:58:04 UTC (rev 224)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java 2009-09-12 18:49:58 UTC (rev 225)
@@ -45,8 +45,13 @@
private ExtractLocation[] locators;
private String[] fieldTypes;
+ private List<Set<String>> fieldSet;
+ /** Should the statement be skipped if it entails duplicate value for this field in the db */
private boolean[] fieldUnique;
- private List<Set<String>> fieldSet;
+ /** Should the statement be skipped when the value is empty */
+ private boolean[] fieldNonEmpty;
+ /** Should the value be trimed when fetched from TEILocator */
+ private boolean[] fieldTrim;
private Connection connexion;
private int numberOfRecordsInserted;
@@ -75,17 +80,23 @@
String fieldType = fA.getText(element, FilterArguments.UPDATE_DATABASE_FIELD_TYPE);
String locatorName = fA.getText(element, FilterArguments.UPDATE_DATABASE_FIELD_FILTERNAME);
String uniquestr = fA.getText(element, FilterArguments.UPDATE_DATABASE_FIELD_UNIQUE);
- boolean unique = false;
- if (uniquestr != null && !uniquestr.equals("")) {
- if (uniquestr.equals("true")) unique = true;
- }
+ if (uniquestr != null && uniquestr.equals("true"))
+ setUnique(i, true);
+ String non_empty_str = fA.getText(element, FilterArguments.UPDATE_DATABASE_FIELD_NON_EMPTY);
+ if (non_empty_str != null && non_empty_str.equals("true"))
+ setNonEmpty(i, true);
+
+ String trim_str = fA.getText(element, FilterArguments.UPDATE_DATABASE_FIELD_TRIM);
+ if (trim_str != null && trim_str.equals("true"))
+ setTrim(i, true);
+
ExtractLocation locator = (ExtractLocation) nH.get(locatorName);
if (locator == null) {
throw new WrongArgsException("No filter named \"" + locatorName + "\"found in the pipeline");
}
try {
- setField(i, fieldType, locator, unique);
+ setField(i, fieldType, locator);
} catch (FilterException e) {
throw new WrongArgsException("Unable to set a field: " + e.getMessage(), e);
}
@@ -105,6 +116,8 @@
for (int i = 0; i < nbrOfFields; i++) {
fieldSet.add(new HashSet<String>());
}
+ fieldNonEmpty = new boolean[nbrOfFields];
+ fieldTrim = new boolean[nbrOfFields];
}
/**
@@ -119,10 +132,6 @@
* for retreive the value for this field.
*/
public void setField(int indexOfField, String fieldType, ExtractLocation locator) throws FilterException {
- setField(indexOfField, fieldType, locator, false);
- }
-
- public void setField(int indexOfField, String fieldType, ExtractLocation locator, boolean unique) throws FilterException {
if (indexOfField >= locators.length) {
throw new FilterException("Field index out of bound.");
}
@@ -137,18 +146,31 @@
}
locators[indexOfField] = locator;
fieldTypes[indexOfField] = fieldType;
- fieldUnique[indexOfField] = unique;
}
-
+
+ public void setNonEmpty(int index, boolean non_empty) {
+ fieldNonEmpty[index] = non_empty;
+ }
+
+ public void setTrim(int index, boolean trim) {
+ fieldTrim[index] = trim;
+ }
+
+ public void setUnique(int index, boolean unique) {
+ fieldUnique[index] = unique;
+ }
+
////////////////////////////////////////////////////////
/**
* Each end document is a query to the database.
*/
public void endDocument() throws SAXException {
- boolean test = shouldUpdate();
+ String[] values = fetchValues();
+ boolean test = shouldUpdate(values);
+
if (test) {
- doStatement();
+ doStatement(values);
numberOfRecordsInserted++;
if ((numberOfRecordsInserted % 1000) == 0) {
newConnection();
@@ -191,7 +213,18 @@
////////////////////////
- private void doStatement() throws FilterException {
+ private String[] fetchValues () {
+ String[] values = new String[fieldTypes.length];
+ for (int i = 0; i < values.length; i++) {
+ values[i] = locators[i].getReference();
+ if (fieldTrim[i]) {
+ values[i] = values[i].trim();
+ }
+ }
+ return values;
+ }
+
+ private void doStatement(String[] values) throws FilterException {
PreparedStatement statement;
try {
statement = connexion.prepareStatement(parameterizedQuery);
@@ -200,7 +233,7 @@
}
try {
- setParameters(statement);
+ setParameters(statement, values);
} catch (SQLException e) {
throw new FilterException("Unable to set parameters. " + e.getMessage(), e);
}
@@ -243,9 +276,9 @@
* Construct the query, calling the registered TEILocator
* for retreiving values.
*/
- private void setParameters(PreparedStatement statement) throws SQLException {
+ private void setParameters(PreparedStatement statement, String[] values) throws SQLException {
for(int i = 0; i < fieldTypes.length; i++) {
- String value = locators[i].getReference();
+ String value = values[i];
if (fieldTypes[i].equals("int")) {
statement.setInt(i+1, Integer.parseInt(value));
} else if (fieldTypes[i].equals("String")) {
@@ -259,13 +292,22 @@
* into a new query.
* @return
*/
- private boolean shouldUpdate() {
- for (int i = 0; i < fieldUnique.length; i++) {
+ private boolean shouldUpdate(String[] values) {
+ //Doing in that order, values are recorded in fieldSet only if
+ // they have realy been inserted (and not only seen) before.
+ for (int i = 0; i < values.length; i++) {
+ if (fieldNonEmpty[i]) {
+ if (values[i].length() == 0) {
+ return false;
+ }
+ }
+ }
+ for (int i = 0; i < values.length; i++) {
if (fieldUnique[i]) {
- if (fieldSet.get(i).contains(locators[i].getReference())) {
+ if (fieldSet.get(i).contains(values[i])) {
return false;
} else {
- fieldSet.get(i).add(locators[i].getReference());
+ fieldSet.get(i).add(values[i]);
}
}
}
Modified: trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java 2009-09-12 17:58:04 UTC (rev 224)
+++ trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java 2009-09-12 18:49:58 UTC (rev 225)
@@ -530,7 +530,10 @@
public final static String UPDATE_DATABASE_FIELD_TYPE = "@fieldType";
public final static String UPDATE_DATABASE_FIELD_FILTERNAME = "@filterName";
public final static String UPDATE_DATABASE_FIELD_UNIQUE = "@unique";
+ public final static String UPDATE_DATABASE_FIELD_NON_EMPTY = "@non_empty";
+ public final static String UPDATE_DATABASE_FIELD_TRIM = "@trim";
public final static String UPDATE_DATABASE_QUERY = "preparedQuery/@query";
+
// BUILD_TABLE_FROM_TEI_LOCATOR
Modified: trunk/corpusreader/src/main/resources/schema/filters/UpdateDatabase.rnc
===================================================================
--- trunk/corpusreader/src/main/resources/schema/filters/UpdateDatabase.rnc 2009-09-12 17:58:04 UTC (rev 224)
+++ trunk/corpusreader/src/main/resources/schema/filters/UpdateDatabase.rnc 2009-09-12 18:49:58 UTC (rev 225)
@@ -14,6 +14,8 @@
attribute fieldType { string },
attribute filterName { xs:IDREF },
attribute unique { ("true" | "false") }?,
+ attribute non_empty { ("true" | "false") }?,
+ attribute trim { ("true" | "false") }?,
empty
}+
},
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2009-09-12 17:58:11
|
Revision: 224
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=224&view=rev
Author: sylvainloiseau
Date: 2009-09-12 17:58:04 +0000 (Sat, 12 Sep 2009)
Log Message:
-----------
debugging
Modified Paths:
--------------
trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java
Modified: trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java 2009-09-12 17:02:19 UTC (rev 223)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java 2009-09-12 17:58:04 UTC (rev 224)
@@ -77,7 +77,7 @@
String uniquestr = fA.getText(element, FilterArguments.UPDATE_DATABASE_FIELD_UNIQUE);
boolean unique = false;
if (uniquestr != null && !uniquestr.equals("")) {
- unique = Boolean.getBoolean(uniquestr);
+ if (uniquestr.equals("true")) unique = true;
}
ExtractLocation locator = (ExtractLocation) nH.get(locatorName);
@@ -85,7 +85,7 @@
throw new WrongArgsException("No filter named \"" + locatorName + "\"found in the pipeline");
}
try {
- setField(i,fieldType,locator, unique);
+ setField(i, fieldType, locator, unique);
} catch (FilterException e) {
throw new WrongArgsException("Unable to set a field: " + e.getMessage(), e);
}
@@ -147,7 +147,7 @@
*/
public void endDocument() throws SAXException {
boolean test = shouldUpdate();
- if (test) {
+ if (test) {
doStatement();
numberOfRecordsInserted++;
if ((numberOfRecordsInserted % 1000) == 0) {
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2009-09-12 17:02:26
|
Revision: 223
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=223&view=rev
Author: sylvainloiseau
Date: 2009-09-12 17:02:19 +0000 (Sat, 12 Sep 2009)
Log Message:
-----------
debugging unitialized array
Modified Paths:
--------------
trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java
Modified: trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java 2009-09-12 16:46:55 UTC (rev 222)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java 2009-09-12 17:02:19 UTC (rev 223)
@@ -8,6 +8,8 @@
import java.util.regex.Pattern;
import java.util.Set;
import java.util.HashSet;
+import java.util.List;
+import java.util.ArrayList;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
@@ -44,7 +46,7 @@
private ExtractLocation[] locators;
private String[] fieldTypes;
private boolean[] fieldUnique;
- private Set<String>[] fieldSet;
+ private List<Set<String>> fieldSet;
private Connection connexion;
private int numberOfRecordsInserted;
@@ -99,8 +101,9 @@
locators = new ExtractLocation[nbrOfFields];
fieldTypes = new String[nbrOfFields];
fieldUnique = new boolean[nbrOfFields];
+ fieldSet = new ArrayList<Set<String>>();
for (int i = 0; i < nbrOfFields; i++) {
- fieldSet[i] = new HashSet<String>();
+ fieldSet.add(new HashSet<String>());
}
}
@@ -259,10 +262,10 @@
private boolean shouldUpdate() {
for (int i = 0; i < fieldUnique.length; i++) {
if (fieldUnique[i]) {
- if (fieldSet[i].contains(locators[i].getReference())) {
+ if (fieldSet.get(i).contains(locators[i].getReference())) {
return false;
} else {
- fieldSet[i].add(locators[i].getReference());
+ fieldSet.get(i).add(locators[i].getReference());
}
}
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2009-09-12 16:47:02
|
Revision: 222
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=222&view=rev
Author: sylvainloiseau
Date: 2009-09-12 16:46:55 +0000 (Sat, 12 Sep 2009)
Log Message:
-----------
adding a "unique" feature
Modified Paths:
--------------
trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java
trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java
trunk/corpusreader/src/main/resources/schema/filters/UpdateDatabase.rnc
Modified: trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java 2009-09-12 14:49:55 UTC (rev 221)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java 2009-09-12 16:46:55 UTC (rev 222)
@@ -6,6 +6,8 @@
import java.sql.SQLException;
import java.util.logging.Logger;
import java.util.regex.Pattern;
+import java.util.Set;
+import java.util.HashSet;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
@@ -37,11 +39,12 @@
private String DbUserName = null;
private String DbUserPwd = null;
- private PreparedStatement preparedStatement = null;
private String parameterizedQuery = null;
private ExtractLocation[] locators;
private String[] fieldTypes;
+ private boolean[] fieldUnique;
+ private Set<String>[] fieldSet;
private Connection connexion;
private int numberOfRecordsInserted;
@@ -69,13 +72,18 @@
}
String fieldType = fA.getText(element, FilterArguments.UPDATE_DATABASE_FIELD_TYPE);
String locatorName = fA.getText(element, FilterArguments.UPDATE_DATABASE_FIELD_FILTERNAME);
+ String uniquestr = fA.getText(element, FilterArguments.UPDATE_DATABASE_FIELD_UNIQUE);
+ boolean unique = false;
+ if (uniquestr != null && !uniquestr.equals("")) {
+ unique = Boolean.getBoolean(uniquestr);
+ }
ExtractLocation locator = (ExtractLocation) nH.get(locatorName);
if (locator == null) {
throw new WrongArgsException("No filter named \"" + locatorName + "\"found in the pipeline");
}
try {
- setField(i,fieldType,locator);
+ setField(i,fieldType,locator, unique);
} catch (FilterException e) {
throw new WrongArgsException("Unable to set a field: " + e.getMessage(), e);
}
@@ -90,6 +98,10 @@
public void initFields(int nbrOfFields) {
locators = new ExtractLocation[nbrOfFields];
fieldTypes = new String[nbrOfFields];
+ fieldUnique = new boolean[nbrOfFields];
+ for (int i = 0; i < nbrOfFields; i++) {
+ fieldSet[i] = new HashSet<String>();
+ }
}
/**
@@ -104,6 +116,10 @@
* for retreive the value for this field.
*/
public void setField(int indexOfField, String fieldType, ExtractLocation locator) throws FilterException {
+ setField(indexOfField, fieldType, locator, false);
+ }
+
+ public void setField(int indexOfField, String fieldType, ExtractLocation locator, boolean unique) throws FilterException {
if (indexOfField >= locators.length) {
throw new FilterException("Field index out of bound.");
}
@@ -118,42 +134,23 @@
}
locators[indexOfField] = locator;
fieldTypes[indexOfField] = fieldType;
+ fieldUnique[indexOfField] = unique;
}
-
+
////////////////////////////////////////////////////////
/**
* Each end document is a query to the database.
*/
public void endDocument() throws SAXException {
-
- PreparedStatement statement;
- try {
- statement = connexion.prepareStatement(parameterizedQuery);
- } catch (SQLException e) {
- throw new FilterException("Unable to create statement. " + e.getMessage(), e);
+ boolean test = shouldUpdate();
+ if (test) {
+ doStatement();
+ numberOfRecordsInserted++;
+ if ((numberOfRecordsInserted % 1000) == 0) {
+ newConnection();
+ }
}
-
- try {
- setParameters(statement);
- } catch (SQLException e) {
- throw new FilterException("Unable to set parameters. " + e.getMessage(), e);
- }
-
- try {
- int updated = statement.executeUpdate();
- log.info(updated + " row(s) updated.");
- statement.close();
- } catch (SQLException e) {
- throw new FilterException("Database or SQL statement error: " + e.getMessage() + ".", e);
- }
-
- numberOfRecordsInserted++;
- if ((numberOfRecordsInserted % 1000) == 0) {
- commitAndCloseConnexion();
- logNumberOfQueries();
- openConnexion();
- }
super.endDocument();
}
@@ -165,19 +162,15 @@
public void startPipeline() throws FilterException {
// check all the arguments.
- if (fieldTypes == null) {
+ if (fieldTypes == null)
throw new FilterException("Field names may not be null.");
- }
- if (locators == null) {
+ if (locators == null)
throw new FilterException("TEILocator filters may not be null.");
- }
for(int i = 0; i < fieldTypes.length; i++) {
- if (fieldTypes[i] == null || fieldTypes[i].equals("")) {
+ if (fieldTypes[i] == null || fieldTypes[i].equals(""))
throw new FilterException("Field "+ i +" has no name.");
- }
- if (locators[i] == null) {
+ if (locators[i] == null)
throw new FilterException("Field " + i + "has no locator.");
- }
}
// don't forget to load the driver.
@@ -193,6 +186,38 @@
super.startPipeline();
}
+ ////////////////////////
+
+ private void doStatement() throws FilterException {
+ PreparedStatement statement;
+ try {
+ statement = connexion.prepareStatement(parameterizedQuery);
+ } catch (SQLException e) {
+ throw new FilterException("Unable to create statement. " + e.getMessage(), e);
+ }
+
+ try {
+ setParameters(statement);
+ } catch (SQLException e) {
+ throw new FilterException("Unable to set parameters. " + e.getMessage(), e);
+ }
+
+ try {
+ int updated = statement.executeUpdate();
+ log.info(updated + " row(s) updated.");
+ statement.close();
+ } catch (SQLException e) {
+ throw new FilterException("Database or SQL statement error: " + e.getMessage() + ".", e);
+ }
+
+ }
+
+ private void newConnection() throws FilterException {
+ commitAndCloseConnexion();
+ logNumberOfQueries();
+ openConnexion();
+ }
+
private void commitAndCloseConnexion() throws FilterException {
try {
connexion.commit();
@@ -227,6 +252,24 @@
}
/**
+ * Test if this set of parameters should be inserted
+ * into a new query.
+ * @return
+ */
+ private boolean shouldUpdate() {
+ for (int i = 0; i < fieldUnique.length; i++) {
+ if (fieldUnique[i]) {
+ if (fieldSet[i].contains(locators[i].getReference())) {
+ return false;
+ } else {
+ fieldSet[i].add(locators[i].getReference());
+ }
+ }
+ }
+ return true;
+ }
+
+ /**
* Escape slash and quote
*/
private String escape(String toBeEscaped) {
@@ -239,6 +282,4 @@
private void logNumberOfQueries() {
log.info(numberOfRecordsInserted + "queries executed.");
}
-
-
}
Modified: trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java 2009-09-12 14:49:55 UTC (rev 221)
+++ trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java 2009-09-12 16:46:55 UTC (rev 222)
@@ -529,6 +529,7 @@
public final static String UPDATE_DATABASE_FIELDS = "fields/field";
public final static String UPDATE_DATABASE_FIELD_TYPE = "@fieldType";
public final static String UPDATE_DATABASE_FIELD_FILTERNAME = "@filterName";
+ public final static String UPDATE_DATABASE_FIELD_UNIQUE = "@unique";
public final static String UPDATE_DATABASE_QUERY = "preparedQuery/@query";
// BUILD_TABLE_FROM_TEI_LOCATOR
Modified: trunk/corpusreader/src/main/resources/schema/filters/UpdateDatabase.rnc
===================================================================
--- trunk/corpusreader/src/main/resources/schema/filters/UpdateDatabase.rnc 2009-09-12 14:49:55 UTC (rev 221)
+++ trunk/corpusreader/src/main/resources/schema/filters/UpdateDatabase.rnc 2009-09-12 16:46:55 UTC (rev 222)
@@ -13,6 +13,7 @@
element field {
attribute fieldType { string },
attribute filterName { xs:IDREF },
+ attribute unique { ("true" | "false") }?,
empty
}+
},
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2009-09-12 14:50:01
|
Revision: 221
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=221&view=rev
Author: sylvainloiseau
Date: 2009-09-12 14:49:55 +0000 (Sat, 12 Sep 2009)
Log Message:
-----------
bug : parameter index of PreparedStatement are 1-based
Modified Paths:
--------------
trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java
Modified: trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java 2009-09-12 14:36:52 UTC (rev 220)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/UpdateDatabase.java 2009-09-12 14:49:55 UTC (rev 221)
@@ -219,9 +219,9 @@
for(int i = 0; i < fieldTypes.length; i++) {
String value = locators[i].getReference();
if (fieldTypes[i].equals("int")) {
- statement.setInt(i, Integer.parseInt(value));
+ statement.setInt(i+1, Integer.parseInt(value));
} else if (fieldTypes[i].equals("String")) {
- statement.setString(i, escape(value));
+ statement.setString(i+1, escape(value));
}
}
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2009-09-12 14:36:59
|
Revision: 220
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=220&view=rev
Author: sylvainloiseau
Date: 2009-09-12 14:36:52 +0000 (Sat, 12 Sep 2009)
Log Message:
-----------
typo in schema dir name
Modified Paths:
--------------
trunk/corpusreader/src/main/java/tei/cr/querydoc/Query.java
Modified: trunk/corpusreader/src/main/java/tei/cr/querydoc/Query.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/querydoc/Query.java 2009-09-12 14:28:54 UTC (rev 219)
+++ trunk/corpusreader/src/main/java/tei/cr/querydoc/Query.java 2009-09-12 14:36:52 UTC (rev 220)
@@ -155,7 +155,7 @@
// validate the query document
log.info("Validating the query document...");
- boolean isValide = validateInternal(querydoc, "/schemas/querydoc.rnc");
+ boolean isValide = validateInternal(querydoc, "/schema/querydoc.rnc");
if (isValide) {
log.info("...the query document is valid.");
} else {
@@ -331,7 +331,7 @@
if (filterClassQualifiedName.startsWith("tei.cr.filters") && nodeArgument != null && !isDocumentSplitterFilter) {
int indexOfLocalName = filterClassQualifiedName.lastIndexOf(".") + 1;
String filterClassLocalName = filterClassQualifiedName.substring(indexOfLocalName);
- String schemaURI = "/schemas/filters/" + filterClassLocalName + ".rnc";
+ String schemaURI = "/schema/filters/" + filterClassLocalName + ".rnc";
log.info(new StringBuffer().append("Validating \"args\" subtree for filter \"").append(filterName != null ? filterName : filterClassLocalName).append("\" using schema ").append(schemaURI).toString());
boolean isValid = validateInternal(nodeArgument, schemaURI);
if (isValid) {
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2009-09-12 14:29:05
|
Revision: 219
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=219&view=rev
Author: sylvainloiseau
Date: 2009-09-12 14:28:54 +0000 (Sat, 12 Sep 2009)
Log Message:
-----------
moving schema into src/main/resources so that maven include them into jar layout
Modified Paths:
--------------
trunk/corpusreader/src/main/java/tei/cr/querydoc/Query.java
Removed Paths:
-------------
trunk/corpusreader/src/main/schema/
Modified: trunk/corpusreader/src/main/java/tei/cr/querydoc/Query.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/querydoc/Query.java 2009-09-12 14:27:20 UTC (rev 218)
+++ trunk/corpusreader/src/main/java/tei/cr/querydoc/Query.java 2009-09-12 14:28:54 UTC (rev 219)
@@ -155,7 +155,7 @@
// validate the query document
log.info("Validating the query document...");
- boolean isValide = validateInternal(querydoc, "/resources/schemas/querydoc.rnc");
+ boolean isValide = validateInternal(querydoc, "/schemas/querydoc.rnc");
if (isValide) {
log.info("...the query document is valid.");
} else {
@@ -331,7 +331,7 @@
if (filterClassQualifiedName.startsWith("tei.cr.filters") && nodeArgument != null && !isDocumentSplitterFilter) {
int indexOfLocalName = filterClassQualifiedName.lastIndexOf(".") + 1;
String filterClassLocalName = filterClassQualifiedName.substring(indexOfLocalName);
- String schemaURI = "/resources/schemas/filters/" + filterClassLocalName + ".rnc";
+ String schemaURI = "/schemas/filters/" + filterClassLocalName + ".rnc";
log.info(new StringBuffer().append("Validating \"args\" subtree for filter \"").append(filterName != null ? filterName : filterClassLocalName).append("\" using schema ").append(schemaURI).toString());
boolean isValid = validateInternal(nodeArgument, schemaURI);
if (isValid) {
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2009-09-12 14:27:28
|
Revision: 218
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=218&view=rev
Author: sylvainloiseau
Date: 2009-09-12 14:27:20 +0000 (Sat, 12 Sep 2009)
Log Message:
-----------
moving schema into resources directory
Added Paths:
-----------
trunk/corpusreader/src/main/resources/schema/
trunk/corpusreader/src/main/resources/schema/filters/UpdateDatabase.rnc
Property changes on: trunk/corpusreader/src/main/resources/schema
___________________________________________________________________
Added: svn:mergeinfo
+
Copied: trunk/corpusreader/src/main/resources/schema/filters/UpdateDatabase.rnc (from rev 217, trunk/corpusreader/src/main/schema/filters/UpdateDatabase.rnc)
===================================================================
--- trunk/corpusreader/src/main/resources/schema/filters/UpdateDatabase.rnc (rev 0)
+++ trunk/corpusreader/src/main/resources/schema/filters/UpdateDatabase.rnc 2009-09-12 14:27:20 UTC (rev 218)
@@ -0,0 +1,23 @@
+datatypes xs = "http://www.w3.org/2001/XMLSchema-datatypes"
+
+start=UpdateDatabase
+
+UpdateDatabase = element args {
+ element database {
+ attribute URL { string },
+ attribute user { string },
+ attribute password { string },
+ empty
+ },
+ element fields {
+ element field {
+ attribute fieldType { string },
+ attribute filterName { xs:IDREF },
+ empty
+ }+
+ },
+ element preparedQuery {
+ attribute query { string },
+ empty
+ }
+}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2009-09-10 16:53:28
|
Revision: 216
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=216&view=rev
Author: sylvainloiseau
Date: 2009-09-10 16:53:18 +0000 (Thu, 10 Sep 2009)
Log Message:
-----------
configuring the eclipse workspace
Added Paths:
-----------
trunk/corpusreader/src/ECLIPSE.txt
Added: trunk/corpusreader/src/ECLIPSE.txt
===================================================================
--- trunk/corpusreader/src/ECLIPSE.txt (rev 0)
+++ trunk/corpusreader/src/ECLIPSE.txt 2009-09-10 16:53:18 UTC (rev 216)
@@ -0,0 +1,7 @@
+# Using the eclipse IDE :
+
+mvn -DartifactId=maven-eclipse-plugin -DgroupId=maven-plugins plugin:download
+mvn eclipse:configure-workspace -Declipse.workspace=\path\to\your\eclipse\workspace
+# in the corpusreader directory :
+mvn eclipse:eclipse
+
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2009-08-28 20:47:50
|
Revision: 215
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=215&view=rev
Author: sylvainloiseau
Date: 2009-08-28 20:47:44 +0000 (Fri, 28 Aug 2009)
Log Message:
-----------
new filters
Modified Paths:
--------------
trunk/corpusreader/src/TODO.tei
trunk/corpusreader/src/main/java/tei/cr/Version.java
trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java
Added Paths:
-----------
trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java
trunk/corpusreader/src/main/schema/filters/TextFormat.rnc
Modified: trunk/corpusreader/src/TODO.tei
===================================================================
--- trunk/corpusreader/src/TODO.tei 2009-02-16 14:28:33 UTC (rev 214)
+++ trunk/corpusreader/src/TODO.tei 2009-08-28 20:47:44 UTC (rev 215)
@@ -28,6 +28,35 @@
</head>
<list>
+ --
+
+ Clarifier entre reference pattern et occurrence pattern
+ Mettre tous les types (localName, TEIName, etc.) et pas seulement "elxpath" avec les patterns qui prennent un argument dans reference pattern. Par exemple :
+ <use elxpath="date/@annee">
+ <match localname="date" />
+ </use>
+
+ Et pas seulement
+
+ <use elxpath="date/@annee">
+ <match elxpath="date" />
+ </use>
+
+ De même :
+
+ <valueOf localName="Taille" />
+
+ Plutôt que elxpath
+
+ Pareil dans SelectSubTrees : mettre localName
+
+ Dans selectSubTrees : l'argument test/@elxpath devrait être nommé test/@xpath
+
+ --
+
+
+
+
Mettre par défaut dans le répertoire de lancement le fichier de log
AbstractForkingBase :
Modified: trunk/corpusreader/src/main/java/tei/cr/Version.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/Version.java 2009-02-16 14:28:33 UTC (rev 214)
+++ trunk/corpusreader/src/main/java/tei/cr/Version.java 2009-08-28 20:47:44 UTC (rev 215)
@@ -11,7 +11,7 @@
// don't change this lines; regexp are used during build for updating the fields.
private static final String VERSION = "0.1"; // "8.2";
- private static final String RELEASE_DATE = "20081221-1138";
+ private static final String RELEASE_DATE = "20090826-1111";
private static final String PRODUCT_NAME = "CorpusReader";
private Version() {
Added: trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java (rev 0)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/TextFormat.java 2009-08-28 20:47:44 UTC (rev 215)
@@ -0,0 +1,215 @@
+package tei.cr.filters;
+
+import java.io.BufferedWriter;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import tei.cr.pipeline.AbstractBase;
+import tei.cr.pipeline.FilterByNames;
+import tei.cr.pipeline.WrongArgsException;
+import tei.cr.querydoc.FilterArguments;
+import tei.cr.teiDocument.TeiDocument;
+
+/**
+ * TODO : to be rewrite with tei.cr.component.phenomenaStream
+ * <p>Convert into plain text, using w/@lm.</p>
+ *
+ * <p>Token local name and paragrahp local name must be different.</p>
+ *
+ * @author Sylvain Loiseau <slo...@u-...>
+ * @version 0.1
+ */
+
+final public class TextFormat extends AbstractBase {
+
+ ///////////////////////////////
+ // Fields
+ ///////////////////////////////
+
+ private BufferedWriter textWriter;
+
+ private String textUri = null;
+
+ /**
+ * The name of the element surrounding the tokens
+ */
+ private String tokenLocalName = DEFAULT_TOKEN_LOCAL_NAME;
+
+ /**
+ * Default name of the element surrounding the tokens
+ */
+ private final static String DEFAULT_TOKEN_LOCAL_NAME = "w";
+
+ /**
+ * The name of the element surrounding the paragraph, separated with \n
+ */
+ private String paragraphLocalName = DEFAULT_PARAGRAPH_LOCAL_NAME;
+
+ /**
+ * Default name of the element surrounding the tokens
+ */
+ private final static String DEFAULT_PARAGRAPH_LOCAL_NAME = "p";
+
+ private boolean isInParagraph = false;
+
+ private boolean isInToken = false;
+
+ ///////////////////////////////
+ // Accessor
+ ///////////////////////////////
+
+ /**
+ * @param localName the local name to be used for
+ * annotating the token
+ * @throws WrongArgsException if the local name is null or empty.
+ */
+ public void setTokenLocalName(String localName)
+ throws WrongArgsException {
+ if ((localName == null) || localName.equals("")) {
+ throw new WrongArgsException("The token element local name cannot be null or empty.");
+ }
+ tokenLocalName = localName;
+ }
+
+ /**
+ * @param localName the local name to be used for
+ * annotating the token
+ * @throws WrongArgsException if the local name is null or empty.
+ */
+ public void setParagraphLocalName(String localName)
+ throws WrongArgsException {
+ if ((localName == null) || localName.equals("")) {
+ throw new WrongArgsException("The token element local name cannot be null or empty.");
+ }
+ tokenLocalName = localName;
+ }
+
+ ///////////////////////////////
+ // Pipelinable
+ ///////////////////////////////
+
+ public void setArguments(FilterArguments fA, FilterByNames nH, TeiDocument doc)
+ throws WrongArgsException {
+ String w_localName = fA.getText(FilterArguments.TEXT_FORMAT_TOKEN_LOCAL_NAME);
+ if ((w_localName != null) && w_localName.equals("")) {
+ setTokenLocalName(w_localName);
+ }
+ String p_localName = fA.getText(FilterArguments.TEXT_FORMAT_PARAGRAPH_LOCAL_NAME);
+ if ((p_localName != null) && p_localName.equals("")) {
+ setParagraphLocalName(p_localName);
+ }
+ textUri = fA.getText(FilterArguments.TEXT_FORMAT_TEXT_URI);
+ if ((textUri == null) || textUri.equals("")) {
+ throw new WrongArgsException("Text url cannot be null.");
+ }
+ }
+
+ /**
+ * Create the file.
+ */
+ public void startPipeline() throws FilterException {
+ try {
+ textWriter = new BufferedWriter(
+ new OutputStreamWriter(
+ new FileOutputStream(textUri),
+ "ISO-8859-1"
+ )
+ );
+ } catch (IOException e) {
+ throw new FilterException("Error while creating the file: " +
+ e.getMessage(),
+ e
+ );
+ }
+ super.startPipeline();
+ }
+
+ public void endPipeline() throws FilterException {
+ flush();
+ super.endPipeline();
+ }
+
+ private void flush() throws FilterException {
+ try {
+ textWriter.flush();
+ } catch (IOException ioE) {
+ throw new FilterException("Error while flushing \"" +
+ textUri +
+ "\": " +
+ ioE.getMessage(),
+ ioE
+ );
+ }
+ }
+
+ ///////////////////////////////
+ // Event handlers
+ ///////////////////////////////
+
+ public void startElement(String uri, String localname, String qname, Attributes attributes)
+ throws SAXException {
+ if (!isInParagraph) {
+ if (localname.equals(paragraphLocalName)) {
+ isInParagraph = true;
+ try {
+ textWriter.newLine();
+ } catch (IOException e) {
+ throw new FilterException("Error while writing: " + e.getMessage(), e);
+ }
+ }
+ }
+
+ if (!isInToken) {
+ if (localname.equals(tokenLocalName)) {
+ isInToken = true;
+ try {
+ textWriter.write(" ", 0, 1);
+ String lemma = attributes.getValue("", "lm");
+ if (lemma != null) {
+ textWriter.write(lemma, 0, lemma.length());
+ }
+ } catch (IOException e) {
+ throw new FilterException("Error while writing: " + e.getMessage(), e);
+ }
+ }
+ }
+ super.startElement(uri, localname, qname, attributes);
+ }
+
+ public void endElement(String namespaceURI, String lName, String qName)
+ throws SAXException {
+ if (isInParagraph) {
+ if (lName.equals(paragraphLocalName)) {
+ isInParagraph = false;
+ }
+ }
+ if (isInToken) {
+ if (lName.equals(tokenLocalName)) {
+ isInToken = false;
+ }
+ }
+ super.endElement(namespaceURI, lName, qName);
+ }
+
+// public void characters(char[] buf, int offset, int len)
+// throws SAXException {
+// if (isInToken) {
+// try {
+// textWriter.write(buf, offset, len);
+// } catch (IOException e) {
+// throw new FilterException("Error while writing: " + e.getMessage(), e);
+// }
+// }
+// super.characters(buf, offset, len);
+// }
+//
+// public void ignorableWhitespace(char[] ch, int start, int length)
+// throws SAXException {
+// characters(ch, start, length);
+// }
+
+}
Modified: trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java 2009-02-16 14:28:33 UTC (rev 214)
+++ trunk/corpusreader/src/main/java/tei/cr/querydoc/FilterArguments.java 2009-08-28 20:47:44 UTC (rev 215)
@@ -653,4 +653,9 @@
// ArchiveProducer
public final static String ARCHIVE_PRODUCER_TYPE = "archive/@type";
+ // TextFormat
+ public final static String TEXT_FORMAT_TOKEN_LOCAL_NAME = "token/@localName";
+ public final static String TEXT_FORMAT_PARAGRAPH_LOCAL_NAME = "paragraph/@localName";
+
+ public final static String TEXT_FORMAT_TEXT_URI = "textFile/@url";
}
Added: trunk/corpusreader/src/main/schema/filters/TextFormat.rnc
===================================================================
--- trunk/corpusreader/src/main/schema/filters/TextFormat.rnc (rev 0)
+++ trunk/corpusreader/src/main/schema/filters/TextFormat.rnc 2009-08-28 20:47:44 UTC (rev 215)
@@ -0,0 +1,15 @@
+datatypes xs = "http://www.w3.org/2001/XMLSchema-datatypes"
+
+start=TextFormat
+
+TextFormat = element args {
+ element token {
+ attribute localName {string}
+ }?,
+ element paragraph {
+ attribute localName {string}
+ }?,
+ element textFile {
+ attribute url {string}
+ }
+}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2009-02-16 14:28:37
|
Revision: 214
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=214&view=rev
Author: sylvainloiseau
Date: 2009-02-16 14:28:33 +0000 (Mon, 16 Feb 2009)
Log Message:
-----------
File moved into the textometrie project
Removed Paths:
-------------
trunk/corpusreader/src/main/R/specificites.R
Deleted: trunk/corpusreader/src/main/R/specificites.R
===================================================================
--- trunk/corpusreader/src/main/R/specificites.R 2009-02-16 14:27:57 UTC (rev 213)
+++ trunk/corpusreader/src/main/R/specificites.R 2009-02-16 14:28:33 UTC (rev 214)
@@ -1,114 +0,0 @@
-specificites <- function(corpus, souscorpus) {
- ## Sylvain Loiseau
- ## 2006-2008
- ## Dernière édition samedi 13 décembre 2008, 19:38:22 (UTC+0100)
- ##
- ## Indice de spécificité des formes d'un sous corpus par rapport à un corpus.
- ## Mesurée par la formule hypergéométrique,
- ## retourne un tableau :
- ## une ligne par forme du sous corpus
- ## en colonne : indice de spécificité, sous fréquence, fréquence.
- ##
-
- ## Les fréquences dans le corpus
- if (class(corpus) == "data.frame") {
- frequencesCorpus <- colSums(corpus);
- print("Le corpus est un data frame");
- } else {
- frequencesCorpus = corpus;
- }
-
- print(paste("Nombre de variables dans le corpus : ", length(frequencesCorpus)));
-
- ## Un peu de nettoyage dans le sous corpus : on supprime
- ## d'éventuelles forme ou partie sans aucune valeur
- souscorpus <- souscorpus[,colSums(souscorpus) > 0];
- souscorpus <- souscorpus[rowSums(souscorpus) > 0,];
-
- ## Les fréquences dans le sous-corpus
- sousFrequences <- colSums(souscorpus);
-
- specificitesFrequencyLists(frequencesCorpus, sousFrequences);
-}
-
-specificitesFrequencyLists <- function(frequencesCorpus, sousFrequences) {
- if (any(is.na(frequencesCorpus))) {
- stop("Valeurs non numériques dans la liste des frequences du corpus.");
- }
- if (any(is.na(sousFrequences))) {
- stop("Valeurs non numériques dans la liste des fréquences du sous corpus.");
- }
-
- ## Nombre de formes à traiter
- nbrFormes <- length(sousFrequences);
- print(paste("Nombre de formes dans le sous corpus :", nbrFormes));
-
- ## On ne garde des frequences du corpus principal
- ## que les variables qui sont dans le sous corpus
- frequencesTotales <- frequencesCorpus[names(sousFrequences)]
- if (length(frequencesTotales) < nbrFormes) {
- warning("Attention : toutes les variables du sous corpus n'ont pas été trouvées dans le corpus.")
- }
- if (any(is.na(frequencesTotales))) {
- na_index <- is.na(frequencesTotales);
- print("NA :");
- print(names(sousFrequences)[na_index]);
- stop("NA in FrequencesTotales");
- }
-
- #frequencesTotales <- frequencesTotales[ !is.na(frequencesTotales) ];
- #if (any(is.na(frequencesTotales))) {
- # stop("NA in frequencesTotales");
- #}
-
- ## Nombre d'occurrences dans le corpus et son sous-corpus
- longueurCorpus <- sum(frequencesCorpus)
- longueurSousCorpus <- sum(sousFrequences)
- print(paste("Longueur du corpus :", longueurCorpus));
- print(paste("Longueur du sous corpus :", longueurSousCorpus));
-
- ## Pour chaque fréquence totale, on calcule la différence entre la taille du corpus et cette fréquence
- ## (nécessaire pour les fonctions phyper et dhyper)
- longueurCorpusMoinsFrequencesTotales <- (longueurCorpus - frequencesTotales);
- if (any(is.na(longueurCorpusMoinsFrequencesTotales))) {
- stop("Valeurs non numériques dans la liste des fréquences complémentaires de chaque forme.");
- }
- # print(paste("Taille de corpus moins fréquence", length(longueurCorpusMoinsFrequencesTotales)));
- # if (any(is.na(longueurCorpusMoinsFrequencesTotales))) {
- # stop("NA in longueurCorpusMoinsFrequencesTotales");
- # }
-
- ## on met dans un tableau "cumulative", pour chaque forme, la propabilité cumulée
- ## (phyper) correspondant à sa sous-fréquence dans le sous corpus compte tenu de sa
- ## fréquence absolue dans le corpus principal et des tailles des deux corpus.
- ##
- cumulative <- double(nbrFormes);
- cumulative <- phyper(sousFrequences, frequencesTotales, longueurCorpusMoinsFrequencesTotales, longueurSousCorpus);
- if (any(is.na(cumulative))) {
- stop("NA in cumulative");
- }
- # cumulative[is.na(cumulative)] <- 0;
-
- ## Si la fréquence cumulée est inférieure à 0.5, c'est qu'il y a *moins* d'occurrences
- ## que ne le laisserait prévoir le hasard, si la fréquence cumulée est supérieure à 0.5
- ## c'est qu'il y a plus d'occurrences qu'il n'y aurait dû en avoir. On inverse donc les signes
- ## pour prendre en compte cette différence.
- specificiteIndex <- double(nbrFormes);
-
- # if (any(is.na(specificiteIndex))) {
- # stop("NA in specificiteIndex");
- # }
-
- # specificiteIndex[cumulative < 0.5] <- specificiteIndex[cumulative < 0.5];
- specificiteIndex[cumulative >= 0.5] <- (1 - cumulative[cumulative >= 0.5]);
-
- m <- matrix(
- c(cumulative, sousFrequences, frequencesTotales),
- nrow = nbrFormes,
- ncol=3,
- dimnames = list(names(sousFrequences), c("Indice de spécificité", "Sous fréquence", "Fréquence totale"))
- );
-
- sorted <- m[order(m[,1], decreasing=TRUE),];
- return(sorted);
-}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2009-02-16 14:28:02
|
Revision: 213
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=213&view=rev
Author: sylvainloiseau
Date: 2009-02-16 14:27:57 +0000 (Mon, 16 Feb 2009)
Log Message:
-----------
Last change before deletion
Modified Paths:
--------------
trunk/corpusreader/src/main/R/specificites.R
Modified: trunk/corpusreader/src/main/R/specificites.R
===================================================================
--- trunk/corpusreader/src/main/R/specificites.R 2009-01-01 19:27:52 UTC (rev 212)
+++ trunk/corpusreader/src/main/R/specificites.R 2009-02-16 14:27:57 UTC (rev 213)
@@ -1,16 +1,16 @@
specificites <- function(corpus, souscorpus) {
## Sylvain Loiseau
## 2006-2008
- ## Derni\xE8re \xE9dition samedi 13 d\xE9cembre 2008, 19:38:22 (UTC+0100)
+ ## Dernière édition samedi 13 décembre 2008, 19:38:22 (UTC+0100)
##
- ## Indice de sp\xE9cificit\xE9 des formes d'un sous corpus par rapport \xE0 un corpus.
- ## Mesur\xE9e par la formule hyperg\xE9om\xE9trique,
+ ## Indice de spécificité des formes d'un sous corpus par rapport à un corpus.
+ ## Mesurée par la formule hypergéométrique,
## retourne un tableau :
## une ligne par forme du sous corpus
- ## en colonne : indice de sp\xE9cificit\xE9, sous fr\xE9quence, fr\xE9quence.
+ ## en colonne : indice de spécificité, sous fréquence, fréquence.
##
- ## Les fr\xE9quences dans le corpus
+ ## Les fréquences dans le corpus
if (class(corpus) == "data.frame") {
frequencesCorpus <- colSums(corpus);
print("Le corpus est un data frame");
@@ -21,11 +21,11 @@
print(paste("Nombre de variables dans le corpus : ", length(frequencesCorpus)));
## Un peu de nettoyage dans le sous corpus : on supprime
- ## d'\xE9ventuelles forme ou partie sans aucune valeur
+ ## d'éventuelles forme ou partie sans aucune valeur
souscorpus <- souscorpus[,colSums(souscorpus) > 0];
souscorpus <- souscorpus[rowSums(souscorpus) > 0,];
- ## Les fr\xE9quences dans le sous-corpus
+ ## Les fréquences dans le sous-corpus
sousFrequences <- colSums(souscorpus);
specificitesFrequencyLists(frequencesCorpus, sousFrequences);
@@ -33,13 +33,13 @@
specificitesFrequencyLists <- function(frequencesCorpus, sousFrequences) {
if (any(is.na(frequencesCorpus))) {
- stop("Valeurs non num\xE9riques dans la liste des frequences du corpus.");
+ stop("Valeurs non numériques dans la liste des frequences du corpus.");
}
if (any(is.na(sousFrequences))) {
- stop("Valeurs non num\xE9riques dans la liste des fr\xE9quences du sous corpus.");
+ stop("Valeurs non numériques dans la liste des fréquences du sous corpus.");
}
- ## Nombre de formes \xE0 traiter
+ ## Nombre de formes à traiter
nbrFormes <- length(sousFrequences);
print(paste("Nombre de formes dans le sous corpus :", nbrFormes));
@@ -47,8 +47,14 @@
## que les variables qui sont dans le sous corpus
frequencesTotales <- frequencesCorpus[names(sousFrequences)]
if (length(frequencesTotales) < nbrFormes) {
- warning("Attention : toutes les variables du sous corpus n'ont pas \xE9t\xE9 trouv\xE9es dans le corpus.")
+ warning("Attention : toutes les variables du sous corpus n'ont pas été trouvées dans le corpus.")
}
+ if (any(is.na(frequencesTotales))) {
+ na_index <- is.na(frequencesTotales);
+ print("NA :");
+ print(names(sousFrequences)[na_index]);
+ stop("NA in FrequencesTotales");
+ }
#frequencesTotales <- frequencesTotales[ !is.na(frequencesTotales) ];
#if (any(is.na(frequencesTotales))) {
@@ -61,45 +67,46 @@
print(paste("Longueur du corpus :", longueurCorpus));
print(paste("Longueur du sous corpus :", longueurSousCorpus));
- ## Pour chaque fr\xE9quence totale, on calcule la diff\xE9rence entre la taille du corpus et cette fr\xE9quence
- ## (n\xE9cessaire pour les fonctions phyper et dhyper)
+ ## Pour chaque fréquence totale, on calcule la différence entre la taille du corpus et cette fréquence
+ ## (nécessaire pour les fonctions phyper et dhyper)
longueurCorpusMoinsFrequencesTotales <- (longueurCorpus - frequencesTotales);
- # print(paste("Taille de corpus moins fr\xE9quence", length(longueurCorpusMoinsFrequencesTotales)));
+ if (any(is.na(longueurCorpusMoinsFrequencesTotales))) {
+ stop("Valeurs non numériques dans la liste des fréquences complémentaires de chaque forme.");
+ }
+ # print(paste("Taille de corpus moins fréquence", length(longueurCorpusMoinsFrequencesTotales)));
# if (any(is.na(longueurCorpusMoinsFrequencesTotales))) {
# stop("NA in longueurCorpusMoinsFrequencesTotales");
# }
- ## on met dans un tableau "cumulative", pour chaque forme, la propabilit\xE9 cumul\xE9e
- ## (phyper) correspondant \xE0 sa sous fr\xE9quence dans le sous corpus compte tenu de sa
- ## fr\xE9quence absolue dans le corpus principal et des tailles des deux corpus.
+ ## on met dans un tableau "cumulative", pour chaque forme, la propabilité cumulée
+ ## (phyper) correspondant à sa sous-fréquence dans le sous corpus compte tenu de sa
+ ## fréquence absolue dans le corpus principal et des tailles des deux corpus.
##
cumulative <- double(nbrFormes);
cumulative <- phyper(sousFrequences, frequencesTotales, longueurCorpusMoinsFrequencesTotales, longueurSousCorpus);
+ if (any(is.na(cumulative))) {
+ stop("NA in cumulative");
+ }
# cumulative[is.na(cumulative)] <- 0;
- # if (any(is.na(cumulative))) {
- # stop("NA in cumulative");
- # }
- ## Si la fr\xE9quence cumul\xE9e est inf\xE9rieure \xE0 0.5, c'est qu'il y a *moins* d'occurrences
- ## que ne le laisserait pr\xE9voir le hasard, si la fr\xE9quence cumul\xE9e est sup\xE9rieure \xE0 0.5
- ## c'est qu'il y a plus d'occurrences qu'il n'y aurait d\xFB en avoir. On inverse donc les signes
- ## pour prendre en compte cette diff\xE9rence.
+ ## Si la fréquence cumulée est inférieure à 0.5, c'est qu'il y a *moins* d'occurrences
+ ## que ne le laisserait prévoir le hasard, si la fréquence cumulée est supérieure à 0.5
+ ## c'est qu'il y a plus d'occurrences qu'il n'y aurait dû en avoir. On inverse donc les signes
+ ## pour prendre en compte cette différence.
specificiteIndex <- double(nbrFormes);
- if (any(is.na(specificiteIndex))) {
- stop("NA in specificiteIndex");
- }
- if (any(is.na(sousFrequences))) {
- stop("NA in sousFrequences");
- }
+ # if (any(is.na(specificiteIndex))) {
+ # stop("NA in specificiteIndex");
+ # }
+
# specificiteIndex[cumulative < 0.5] <- specificiteIndex[cumulative < 0.5];
- specificiteIndex[cumulative >= 0.5] <- (1 - specificiteIndex[cumulative >= 0.5]);
+ specificiteIndex[cumulative >= 0.5] <- (1 - cumulative[cumulative >= 0.5]);
m <- matrix(
- c(specificiteIndex, sousFrequences, frequencesTotales),
+ c(cumulative, sousFrequences, frequencesTotales),
nrow = nbrFormes,
ncol=3,
- dimnames = list(names(sousFrequences), c("Indice de sp\xE9cificit\xE9", "Sous fr\xE9quence", "Fr\xE9quence totale"))
+ dimnames = list(names(sousFrequences), c("Indice de spécificité", "Sous fréquence", "Fréquence totale"))
);
sorted <- m[order(m[,1], decreasing=TRUE),];
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2009-01-01 19:27:57
|
Revision: 212
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=212&view=rev
Author: sylvainloiseau
Date: 2009-01-01 19:27:52 +0000 (Thu, 01 Jan 2009)
Log Message:
-----------
Version used for the 2008 TIM course
Modified Paths:
--------------
trunk/corpusreader/src/main/R/specificites.R
Modified: trunk/corpusreader/src/main/R/specificites.R
===================================================================
--- trunk/corpusreader/src/main/R/specificites.R 2008-12-21 11:32:24 UTC (rev 211)
+++ trunk/corpusreader/src/main/R/specificites.R 2009-01-01 19:27:52 UTC (rev 212)
@@ -22,11 +22,11 @@
## Un peu de nettoyage dans le sous corpus : on supprime
## d'\xE9ventuelles forme ou partie sans aucune valeur
- sousCorpus <- sousCorpus[,colSums(sousCorpus) > 0];
- sousCorpus <- sousCorpus[rowSums(sousCorpus) > 0,];
+ souscorpus <- souscorpus[,colSums(souscorpus) > 0];
+ souscorpus <- souscorpus[rowSums(souscorpus) > 0,];
## Les fr\xE9quences dans le sous-corpus
- sousFrequences <- colSums(sousCorpus);
+ sousFrequences <- colSums(souscorpus);
specificitesFrequencyLists(frequencesCorpus, sousFrequences);
}
@@ -99,7 +99,7 @@
c(specificiteIndex, sousFrequences, frequencesTotales),
nrow = nbrFormes,
ncol=3,
- dimnames = list(names(sousFrequences), c("Indice de sp\xE9cificit\xE9", "Sous fr\xE9quence", "Fr\xE9quence absolue"))
+ dimnames = list(names(sousFrequences), c("Indice de sp\xE9cificit\xE9", "Sous fr\xE9quence", "Fr\xE9quence totale"))
);
sorted <- m[order(m[,1], decreasing=TRUE),];
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2008-12-21 12:22:10
|
Revision: 211
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=211&view=rev
Author: sylvainloiseau
Date: 2008-12-21 11:32:24 +0000 (Sun, 21 Dec 2008)
Log Message:
-----------
Accessing the StAX stream with the correct encoding
Modified Paths:
--------------
trunk/corpusreader/src/main/java/tei/cr/utils/syntex/SyntexBuffer.java
Modified: trunk/corpusreader/src/main/java/tei/cr/utils/syntex/SyntexBuffer.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/utils/syntex/SyntexBuffer.java 2008-12-21 11:13:51 UTC (rev 210)
+++ trunk/corpusreader/src/main/java/tei/cr/utils/syntex/SyntexBuffer.java 2008-12-21 11:32:24 UTC (rev 211)
@@ -1,8 +1,11 @@
package tei.cr.utils.syntex;
import java.io.BufferedReader;
+import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
+import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
@@ -33,13 +36,16 @@
f.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE);
f.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
try {
- reader = f.createXMLEventReader(new BufferedReader(new FileReader(uri)));
+ reader = f.createXMLEventReader(new BufferedReader(new InputStreamReader(new FileInputStream(uri), "ISO-8859-1")));
} catch (XMLStreamException xsE) {
IllegalStateException iSE = new IllegalStateException("Unable to create reader: " +
xsE.getMessage());
iSE.initCause(xsE);
throw iSE;
- }
+ } catch (UnsupportedEncodingException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
}
public SyntexSentenceIterator getSentenceIterator() {
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2008-12-21 11:13:53
|
Revision: 210
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=210&view=rev
Author: sylvainloiseau
Date: 2008-12-21 11:13:51 +0000 (Sun, 21 Dec 2008)
Log Message:
-----------
bug
Modified Paths:
--------------
trunk/corpusreader/src/main/java/tei/cr/Version.java
trunk/corpusreader/src/main/java/tei/cr/filters/MarkBalanced.java
trunk/corpusreader/src/main/java/tei/cr/saxConverterImpl/Codegram.java
trunk/corpusreader/src/main/java/tei/cr/utils/syntex/SyntexBuffer.java
trunk/corpusreader/src/test/java/tei/cr/utils/syntex/TestSyntexBuffer.java
Modified: trunk/corpusreader/src/main/java/tei/cr/Version.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/Version.java 2008-12-20 21:23:44 UTC (rev 209)
+++ trunk/corpusreader/src/main/java/tei/cr/Version.java 2008-12-21 11:13:51 UTC (rev 210)
@@ -11,7 +11,7 @@
// don't change this lines; regexp are used during build for updating the fields.
private static final String VERSION = "0.1"; // "8.2";
- private static final String RELEASE_DATE = "20081219-1918";
+ private static final String RELEASE_DATE = "20081221-1138";
private static final String PRODUCT_NAME = "CorpusReader";
private Version() {
Modified: trunk/corpusreader/src/main/java/tei/cr/filters/MarkBalanced.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/MarkBalanced.java 2008-12-20 21:23:44 UTC (rev 209)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/MarkBalanced.java 2008-12-21 11:13:51 UTC (rev 210)
@@ -25,7 +25,7 @@
// ************************************** //
private Stack openedElement = new Stack();
- private char[] codePoint = { '(', ')', '', '' };
+ private char[] codePoint = { '(', ')', '«', '»' };
private boolean[] isOpening = { true, false, true, false };
private String[] markup = { "parenthesis", "PRE laquo POST raquo" };
Modified: trunk/corpusreader/src/main/java/tei/cr/saxConverterImpl/Codegram.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/saxConverterImpl/Codegram.java 2008-12-20 21:23:44 UTC (rev 209)
+++ trunk/corpusreader/src/main/java/tei/cr/saxConverterImpl/Codegram.java 2008-12-21 11:13:51 UTC (rev 210)
@@ -22,15 +22,15 @@
/*-------------------------------------------------------------------------------------------*/
/*
Accepter redondance :
- interrogatif dans "assertivit\xE9" et dans "type de d\xE9terminant"
+ interrogatif dans "assertivit" et dans "type de dterminant"
fini dans "finitude" et dans "type"
nbr dans "type" et dans "nbr"
Accepter perte d'information :
- on ne sait plus lequel est le nombre du possesseur et lequel du poss\xE9d\xE9.
+ on ne sait plus lequel est le nombre du possesseur et lequel du possd.
*/
/* --------------------------------------------------------------------------------- */
- /* cat\xE9gorie partie du discours */
+ /* catgorie partie du discours */
public static final String POS_ADJECTIF = "pos.adj";
public static final String POS_CONJONCTION = "pos.c";
public static final String POS_DETERMINANT = "pos.d";
@@ -44,7 +44,7 @@
public static final String POS_PONCTUATION = "pos.ponct";
/* --------------------------------------------------------------------------------- */
- /* type : sous-cat\xE9gorisation de POS */
+ /* type : sous-catgorisation de POS */
// Adjectif :
public static final String TYPE_QUALIFICATIF = "ty.qual";
@@ -54,14 +54,14 @@
public static final String TYPE_COORDINATION = "ty.coo"; // Cc
public static final String TYPE_SUBORDINATION = "ty.sub"; // Cs
- // d\xE9terminant
+ // dterminant
public static final String TYPE_ARTICLE = "ty.art";
- // d\xE9terminant et pronom
+ // dterminant et pronom
public static final String TYPE_DEMONSTRATIF = "ty.dem";
public static final String TYPE_INTERROGATIF = "ty.int";
- // adjectif d\xE9terminant-adjectif et pronom
+ // adjectif dterminant-adjectif et pronom
public static final String TYPE_POSSESSIF = "ty.poss";
public static final String TYPE_INDEFINI = "ty.ind";
@@ -89,7 +89,7 @@
public static final String COMPARATIF = "comp.comp";
/* --------------------------------------------------------------------------------- */
- /* num\xE9raux */
+ /* numraux */
// Adjectif ordinal:
public static final String NUMERIQUE_ORDINAL = "num.ord";
@@ -110,12 +110,12 @@
public static final String PERSONNEL_PROPRE = "pers.nom";
/* --------------------------------------------------------------------------------- */
- /* r\xE9fl\xE9chis */
+ /* rflchis */
public static final String REFLECHI = "refl.oui";
public static final String NON_REFLECHI = "refl.non";
/* --------------------------------------------------------------------------------- */
- /* cat\xE9gorie syntaxique */
+ /* catgorie syntaxique */
public static final String CAT_SUJET = "cat.s";
public static final String CAT_COD = "cat.cod";
public static final String CAT_COI = "cat.coi";
@@ -126,11 +126,11 @@
- personne :
1, 2, 3
pour pronom : personne du possesseur
- d\xE9terminant possessif : id.
+ dterminant possessif : id.
adj ??
- - nmb_possesseur (ou \xE0 mettre dans nombre seul ?, avec deux traits nombre ?)
+ - nmb_possesseur (ou mettre dans nombre seul ?, avec deux traits nombre ?)
adjectif : possessif
- d\xE9terminant : possessif
+ dterminant : possessif
*/
public static final String PERS_1 = "p.1";
public static final String PERS_2 = "p.2";
@@ -139,21 +139,21 @@
/* --------------------------------------------------------------------------------- */
/* Fini */
/*
- d\xE9fini : d\xE9terminant : article d\xE9fini
- ind\xE9fini :
- d\xE9terminant : article ind\xE9fini,
- d\xE9terminant : adjectif ind\xE9fini (?)
- adjectif : ind\xE9fini (diff\xE9rence avec ci-dessus ???)
- pronom : ind\xE9fini
+ dfini : dterminant : article dfini
+ indfini :
+ dterminant : article indfini,
+ dterminant : adjectif indfini (?)
+ adjectif : indfini (diffrence avec ci-dessus ???)
+ pronom : indfini
*/
- // adj, art, det-adj, et pronom (redouble tout les types "ind\xE9finis"
- // plus les d\xE9terminant-articles "ind\xE9fini ou d\xE9fini"
+ // adj, art, det-adj, et pronom (redouble tout les types "indfinis"
+ // plus les dterminant-articles "indfini ou dfini"
public static final String FIN_DEFINI = "fin.d";
public static final String FIN_INDEFINI = "fin.i";
// adj indef = //tout, quelque, chaque, tel, aucune
- // adj, det et pronom, ajouter det art indefini ? cat n\xE9cessaire ?
+ // adj, det et pronom, ajouter det art indefini ? cat ncessaire ?
/* --------------------------------------------------------------------------------- */
/* Nombre */
@@ -166,21 +166,21 @@
public static final String GENRE_FEMININ = "g.f";
/* --------------------------------------------------------------------------------- */
- /* Assertivit\xE9 */
+ /* Assertivit */
/* assertif :
- d\xE9terminant : d\xE9monstratif,
- pronom : d\xE9monstratif
- interrogativit\xE9
- d\xE9terminant : interrogatif
+ dterminant : dmonstratif,
+ pronom : dmonstratif
+ interrogativit
+ dterminant : interrogatif
pronom : interrogatif
- n\xE9gation
- adv : n\xE9gation*/
+ ngation
+ adv : ngation*/
public static final String ASS_INTERROGATIF = "ass.i";
public static final String ASS_DEMONSTRATIF = "ass.d";
- // penser \xE0 encoder aussi qu'est adj.
+ // penser encoder aussi qu'est adj.
// en commun avec ADJ_POSSESSIF et pronom possessif ?
- // ??? quelle cat\xE9gorie : type ? != ass
+ // ??? quelle catgorie : type ? != ass
public static final String ASS_POSSESSIF = "ass.poss";
// adverbe
@@ -200,7 +200,7 @@
public static final String MODE_IMPERATIF = "m.imp";
public static final String MODE_PARTICIPE = "m.partpres";
- // = participe pass\xE9 ???
+ // = participe pass ???
/* --------------------------------------------------------------------------------- */
/* Temps */
@@ -321,7 +321,7 @@
case 'S':
- /* S p pour une pr\xE9position */
+ /* S p pour une prposition */
addIDREF(POS_PREPOSITION);
break;
@@ -354,29 +354,29 @@
}
/* pour adjectif
- A f c/p <genre> <nombre> pour un adjectif qualificatif (\x93 c \x94 si comparatif, \x93 p \x94 sinon)
- A i - <genre> <nombre> pour un adjectif ind\xE9fini
+ A f c/p <genre> <nombre> pour un adjectif qualificatif ( c 铔 si comparatif, p sinon)
+ A i - <genre> <nombre> pour un adjectif indfini
A o - <genre> <nombre> pour un adjectif ordinal
A s - <genre> <nombre> pour un adjectif possessif
*/
/*
- //QUALIFICATIF : Afpms "sens commun" ; Afpm. "c'est faux du commencement \xE0 la fin.
+ //QUALIFICATIF : Afpms "sens commun" ; Afpm. "c'est faux du commencement la fin.
- Ind\xE9fini :
- ajouter ind\xE9fini : cf. d\xE9terminant, adj ind\xE9fini ???
+ Indfini :
+ ajouter indfini : cf. dterminant, adj indfini ???
"quel" Ai-ms-
quelle (blessure n'est pas de guerre) Ai-fs- => quasi toujours interrogatif ? == beaucoup d'erreur.
- (mais) quelle (\xE9trange tension presque insupportable, cette \xE9treinte [...]) Ai-fs-
- (on peut encore se demander) quel (genre de folie repr\xE9sentent ...) Ai-ms-
+ (mais) quelle (trange tension presque insupportable, cette treinte [...]) Ai-fs-
+ (on peut encore se demander) quel (genre de folie reprsentent ...) Ai-ms-
(ou) quel (est le sujet du discours philosophique ?) Ai-ms-
- (on ne sait plus du tout de) quel (c\xF4t\xE9 est le maximum d'ironie) Ai-ms-
+ (on ne sait plus du tout de) quel (ct est le maximum d'ironie) Ai-ms-
// Ordinal
- // Ao-.. : jamais de genre ou nombre indiqu\xE9 ???
+ // Ao-.. : jamais de genre ou nombre indiqu ???
// adjectif cardinal ??
// possessif :
- // As-mp "je les fasse miens", As-fs "tu es mienne" As-ms "reconna\xEEtra comme sien"
+ // As-mp "je les fasse miens", As-fs "tu es mienne" As-ms "reconnatra comme sien"
// ? et pronom possessif ? et "ma fille" = ?
// + personne, cf. verbe
// + genre, cf. nom
@@ -559,7 +559,7 @@
}
/*
- (o\xF9 <cat> est 'n' pour le sujet, 'a' pour le COD, 'd' pour le COI)
+ (o <cat> est 'n' pour le sujet, 'a' pour le COD, 'd' pour le COI)
!!! ou aussi '-' !!!
*/
public void doCat(int index) {
@@ -604,8 +604,8 @@
/*
R g c pour un adverbe comparatif
- R g p pour un adverbe non comparatif et non de n\xE9gation
- R p n pour l\x92adverbe de n\xE9gation \x93 ne \x94 ou \x93 n\x92 \x94
+ R g p pour un adverbe non comparatif et non de ngation
+ R p n pour ladverbe de ngation ne 铔 ou n
*/
private void doAdverbe() {
if (codechar.length < 3) {
@@ -642,18 +642,18 @@
}
/*
- D a - <genre> <nombre> d pour un article d\xE9fini
+ D a - <genre> <nombre> d pour un article dfini
le Da-ms-d
la Da-fs-d
les Da-.p-d
du Da-ms-d
!!! plus '-' !!!
- D a - <genre> <nombre> i pour un article ind\xE9fini
- (celle) des (choses et celle du monde) Da-.p-i == erreur, ici d\xE9fini !!!
+ D a - <genre> <nombre> i pour un article indfini
+ (celle) des (choses et celle du monde) Da-.p-i == erreur, ici dfini !!!
du (monde) Da-ms-d
- le pays des (f\xE9es) Da-.p-i
+ le pays des (fes) Da-.p-i
- D d - <genre> <nombre> pour un adjectif d\xE9monstratif
+ D d - <genre> <nombre> pour un adjectif dmonstratif
cette Dd-fs-
Cet Dd-ms-
ces termes Dd-.p-
@@ -664,27 +664,27 @@
AUCUN ds LS ou MP !!!
D s 1/2/3 <nombre> <nb poss.> pour un adjectif possessif (1 ou 2 ou 3 selon la personne),
- <nb poss.> est \xE9gal \xE0 'p' si le possesseur est pluriel, \xE0 's' s'il est singulier
+ <nb poss.> est gal 'p' si le possesseur est pluriel, 's' s'il est singulier
!!! ++ genre !!!
- (le langage atteint \xE0) sa (plus haute puissance) Ds3fss
- diff\xE9rent du n\xF4tre Ps1.sa.
+ (le langage atteint ) sa (plus haute puissance) Ds3fss
+ diffrent du ntre Ps1.sa.
qui ne sont plus la sienne Ps3fs-s
tellement semblable aux siennes Ps2fpas
- sup\xE9rieur \xE0 la n\xF4tre Ps1.sd.
+ suprieur la ntre Ps1.sd.
il y a toujours un autre souffle dans le mien Ps1msds
- une autre pens\xE9e dans la mienne Ps1fsds
+ une autre pense dans la mienne Ps1fsds
aucun nom propre ne se substitue au souffle hyperbolique du mien Ps1msds
- qui ne se laisse pas substituer \xE0 la mienne Ps1fsns
- analogue au n\xF4tre Ps1.sa.
- le v\xF4tre Ps2.s-.
+ qui ne se laisse pas substituer la mienne Ps1fsns
+ analogue au ntre Ps1.sa.
+ le vtre Ps2.s-.
- D t - <genre> <nombre> pour un adjectif ind\xE9fini
- (qui se d\xE9robe \xE0) toute (identification) Dt-fs-
- (o\xF9) chaque (syst\xE8me individuel se d\xE9fait) Dt-.s-
- (qui repporte la qualit\xE9 \xE0 ) quelque (chose de fixe) Dt-.s-
- (de) telle (mani\xE8re que) aucune Dt-fs- Dt-fs-
- (pourquoi \xE0) tel (moment plut\xF4t qu'\xE0 tel autre ?) Dt-ms-
+ D t - <genre> <nombre> pour un adjectif indfini
+ (qui se drobe ) toute (identification) Dt-fs-
+ (o) chaque (systme individuel se dfait) Dt-.s-
+ (qui repporte la qualit ) quelque (chose de fixe) Dt-.s-
+ (de) telle (manire que) aucune Dt-fs- Dt-fs-
+ (pourquoi ) tel (moment plutt qu' tel autre ?) Dt-ms-
(qui n'autorise) aucun (amalgame) Dt-ms-
*/
private void doDeterminant() {
@@ -785,9 +785,9 @@
}
/*
- M c <genre> <nombre> pour un num\xE9ral
- (o\xF9 <genre> est cod\xE9 par \x93 m \x94 pour masculin, \x93 f \x94 pour f\xE9minin,
- et o\xF9 <nombre> est cod\xE9 \x93 s \x94 pour singulier, \x93 p \x94 pour pluriel)
+ M c <genre> <nombre> pour un numral
+ (o <genre> est cod par m 铔 pour masculin, f pour fminin,
+ et o <nombre> est cod s 铔 pour singulier, p pour pluriel)
N c <genre> <nombre> pour un nom commun
N p <genre> <nombre> pour un nom propre
*/
@@ -824,27 +824,27 @@
/*
- // d\xE9monstratif, ind\xE9fini, personnel, non r\xE9fl\xE9chi, relatif, possessif, interrogatif,
- // personnel non-r\xE9fl\xE9chis :
+ // dmonstratif, indfini, personnel, non rflchi, relatif, possessif, interrogatif,
+ // personnel non-rflchis :
- // utiliser les autres cat\xE9gories. Distinguer "ind\xE9fini" comme cat\xE9gorie (commune \xE0 det ?)
- // et comme type d'article ? possessif : \xE0 noter avec adj et det possessifs.
- // personnel : en faire un type commun (avec d\xE9monstratif, etc.)
- // avec une sous-cat\xE9gorie r\xE9fl\xE9chi ou non)
+ // utiliser les autres catgories. Distinguer "indfini" comme catgorie (commune det ?)
+ // et comme type d'article ? possessif : noter avec adj et det possessifs.
+ // personnel : en faire un type commun (avec dmonstratif, etc.)
+ // avec une sous-catgorie rflchi ou non)
- P d - <genre> <nombre> <cat> pour un pronom d\xE9monstratif
- (o\xF9 <cat> est 'n' pour le sujet, 'a' pour le COD, 'd' pour le COI)
+ P d - <genre> <nombre> <cat> pour un pronom dmonstratif
+ (o <cat> est 'n' pour le sujet, 'a' pour le COD, 'd' pour le COI)
!!! ou aussi '-' !!!
- P i - <genre> <nombre> <cat> pour un pronom ind\xE9fini
+ P i - <genre> <nombre> <cat> pour un pronom indfini
// = ???
- P p 1/2/3 <nombre> <cat> pour un pronom personnel non r\xE9fl\xE9chi
+ P p 1/2/3 <nombre> <cat> pour un pronom personnel non rflchi
P r -<genre> <nombre> <cat> pour un pronom relatif
Pr-ms-
P s - 1/2/3 <nombre> <cat> pour un pronom possessif
!!! + genre !!!
!!! pas de tiret !!!
P t - <genre> <nombre> <cat> pour un pronom interrogatif
- P x 1/2/3 <nombre> pour un pronom personnel r\xE9fl\xE9chi
+ P x 1/2/3 <nombre> pour un pronom personnel rflchi
!!! + genre !!!
!!! + cat !!!
@@ -926,19 +926,19 @@
}
/*
- V a/m n/i/s/c/f/p p/i/s/f/r/m/c/\xE9/a 1/2/3 s/p (ou <genre> <nombre> si participe)
+ V a/m n/i/s/c/f/p p/i/s/f/r/m/c/é/a 1/2/3 s/p (ou <genre> <nombre> si participe)
!!! c'est l'inverse : <nbr><genre> si participe !
- o\xF9 a/m correspond \xE0 \x93 auxiliaire \x94 ou \x93 principal \x94
- o\xF9 n/i/s/c/f/p correspond au mode
- (\x93 n \x94=infinitif, \x93 i \x94=indicatif, \x93 s \x94=subjonctif, \x93 c \x94=conditionnel,
- \x93 f \x94=imp\xE9ratif, \x93 p \x94=participe)
- o\xF9 p/i/s/f/r/m/c/\xE9/a correspond au temps
- (\x93 p \x94=pr\xE9sent, \x93 i \x94=imparfait, \x93 s \x94=pass\xE9, \x93 f \x94=futur,
- \x93 r \x94=subjonctif pr\xE9sent, \x93 m \x94=subjonctif imparfait,
- \x93 c \x94=conditionnel, \x93 \xE9 \x94=imp\xE9ratif, \x93 a \x94= participe pass\xE9
- o\xF9 1/2/3 est \xE9gal \xE0 1 ou 2 ou 3 selon la personne
- o\xF9 s/p correspond \xE0 \x93 singulier \x94 ou \x93 pluriel \x94 pour la personne
+ où a/m correspond à “ auxiliaire ” ou “ principal ”
+ où n/i/s/c/f/p correspond au mode
+ (“ n ”=infinitif, “ i ”=indicatif, “ s ”=subjonctif, “ c ”=conditionnel,
+ “ f ”=impératif, “ p ”=participe)
+ où p/i/s/f/r/m/c/é/a correspond au temps
+ (“ p ”=présent, “ i ”=imparfait, “ s ”=passé, “ f ”=futur,
+ “ r ”=subjonctif présent, “ m ”=subjonctif imparfait,
+ “ c ”=conditionnel, “ é ”=impératif, “ a ”= participe passé
+ où 1/2/3 est égal à 1 ou 2 ou 3 selon la personne
+ où s/p correspond à “ singulier ” ou “ pluriel ” pour la personne
*/
private void doVerbe() {
boolean isParticipe = false;
@@ -983,7 +983,7 @@
return; // infinitif: do not process further.
- // ??? s\xFBr de ne rien perdre ?
+ // ??? sûr de ne rien perdre ?
case 'i':
addIDREF(MODE_INDICATIF);
@@ -1068,11 +1068,11 @@
break;
- case '\xE9':
- addIDREF(TEMPS_IMPERATIF);
+// case 'é':
+// addIDREF(TEMPS_IMPERATIF);
+//
+// break;
- break;
-
case 'a':
addIDREF(TEMPS_PARTICIPE_PASSE);
@@ -1101,8 +1101,8 @@
/*
Y p w ponctuation finale pour une fin de phrase
Y p s ponctuation de pause pour une virgule ou une ponctuation de pause
- Y p o ponctuation d\x92insertion pour une parenth\xE8se ouverte ou signe voisin
- Y p c ponctuation de fin d\x92insertion pour une parenth\xE8se ferm\xE9e ou signe voisin
+ Y p o ponctuation d’insertion pour une parenthèse ouverte ou signe voisin
+ Y p c ponctuation de fin d’insertion pour une parenthèse fermée ou signe voisin
Y s s ponctuation pour un autre type de ponctuation
*/
private void doPonctuation() {
Modified: trunk/corpusreader/src/main/java/tei/cr/utils/syntex/SyntexBuffer.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/utils/syntex/SyntexBuffer.java 2008-12-20 21:23:44 UTC (rev 209)
+++ trunk/corpusreader/src/main/java/tei/cr/utils/syntex/SyntexBuffer.java 2008-12-21 11:13:51 UTC (rev 210)
@@ -1,9 +1,15 @@
package tei.cr.utils.syntex;
+import java.io.BufferedReader;
import java.io.FileNotFoundException;
+import java.io.FileReader;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
+
+import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
@@ -11,39 +17,66 @@
import tei.cr.utils.stax.StAXBuffer;
import tei.cr.utils.stax.StAXBufferImpl;
+/**
+ * Get an iterator over sentences of a Syntex file.
+ *
+ * @author sloiseau
+ *
+ */
public class SyntexBuffer {
- private final StAXBuffer syntex;
+ private XMLEventReader reader;
public SyntexBuffer(String uri) throws FileNotFoundException {
- syntex = new StAXBufferImpl(uri);
+ XMLInputFactory f = XMLInputFactory.newInstance();
+ f.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
+ f.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE);
+ f.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
+ try {
+ reader = f.createXMLEventReader(new BufferedReader(new FileReader(uri)));
+ } catch (XMLStreamException xsE) {
+ IllegalStateException iSE = new IllegalStateException("Unable to create reader: " +
+ xsE.getMessage());
+ iSE.initCause(xsE);
+ throw iSE;
+ }
}
public SyntexSentenceIterator getSentenceIterator() {
- return new SyntexSentenceIterator(syntex);
+ return new SyntexSentenceIterator(reader);
}
public class SyntexSentenceIterator {
-
- private int currentEventIndex = 0;
- private List<XMLEvent> events;
- public SyntexSentenceIterator(StAXBuffer syntex) {
- events = syntex.getXMLEvents();
+ private List<List<XMLEvent>> buffer;
+ private XMLEventReader reader;
+ /** does the XMLReader iterator has next element */
+ private boolean hasNextElement = true;
+ /** does the Sentence iterator has next element */
+ private boolean hasNextSentence = true;
+ private final static int BUFFER_SIZE = 100;
+
+ public SyntexSentenceIterator(XMLEventReader reader) {
+ if (reader == null) {
+ throw new IllegalArgumentException("");
+ }
+ this.reader = reader;
+ buffer = new ArrayList<List<XMLEvent>>();
+ goToNextSentence();
}
/**
- * Return the complete (<code>qualified</code>) id of the next sentence.
+ * Return the complete (qualified) id of the next sentence.
*
* @return the complete id.
+ *
* @throws NoSuchElementException if there is no more sentence.
*/
- public String getNextSentenceQualifiedId() {
- int index = getNextSentenceIndex();
- if (index == -1) {
+ public String peekNextSentenceQualifiedId() {
+ if (!hasNextSentence()) {
throw new NoSuchElementException("No more sentence");
}
- Attribute a = events.get(index).asStartElement().getAttributeByName(SyntexVocabulary.SEQ_ELEMENT_ID_ATTRIBUTE);
+ Attribute a = buffer.get(0).get(0).asStartElement().getAttributeByName(SyntexVocabulary.SEQ_ELEMENT_ID_ATTRIBUTE);
return a.getValue();
}
@@ -53,11 +86,11 @@
* @return the complete id.
* @throws NoSuchElementException if there is no more sentence.
*/
- public String getNextSentenceDocumentId() {
- String qualifiedId = getNextSentenceQualifiedId();
+ public String peekNextSentenceDocumentId() {
+ String qualifiedId = peekNextSentenceQualifiedId();
return getDocumentPart(qualifiedId);
}
-
+
/**
* Return the next sentence
*
@@ -66,64 +99,139 @@
* @throws NoSuchElementException if there is no more sentence.
*/
public List<XMLEvent> getNextSentence() {
- List<XMLEvent> sentence = new ArrayList<XMLEvent>();
- int start = getNextSentenceIndex();
- if (start == -1) {
+ if (!hasNextSentence()) {
throw new NoSuchElementException("No more sentence");
}
- int end = getNextSentenceEndIndex(start);
- if (end == -1) {
- throw new IllegalStateException("No EndElement for this sentence. Bug somewhere.");
- }
- List<XMLEvent> subList = events.subList(start, end);
- sentence.addAll(subList);
- currentEventIndex = end;
- return sentence;
+ return buffer.remove(0);
}
+ // -----------------------
+
+ /**
+ * Fill the buffer and test the wrapped reader. Must be called by all
+ * methods needing to test for availability of sentences.
+ */
public boolean hasNextSentence() {
- int index = getNextSentenceIndex();
- return index != -1;
+ if (!buffer.isEmpty()) {
+ return true;
+ } else {
+ fillBuffer();
+ return hasNextSentence;
+ }
}
+ // protected
+
+ protected String getDocumentPart(String qualifiedId) {
+ return qualifiedId.substring(0, qualifiedId.lastIndexOf("_"));
+ }
+
/**
- * Get the index of the StartElement of the next sentence.
- * @return
+ * This method set the <code>hasNextSentence</code> flag.
*/
- protected int getNextSentenceIndex() {
- for (int i = currentEventIndex; i+1 < events.size(); i++) {
- XMLEvent e = events.get(i);
- if (e.isStartElement()) {
- StartElement se = e.asStartElement();
- if (se.getName().getLocalPart().equals(SyntexVocabulary.SENTENCE)) {
- return i;
- }
+ protected void fillBuffer() {
+ while (buffer.size() < BUFFER_SIZE && hasNextElement) {
+ List<XMLEvent> sentence = getOneSentence();
+ if (sentence != null) {
+ buffer.add(sentence);
+ } else {
+ break;
}
}
- return -1;
+ if (!hasNextElement && buffer.size() == 0) {
+ hasNextSentence = false;
+ }
}
- /**
- * Get the index of the EndElement of the next sentence.
- * @return
- */
- protected int getNextSentenceEndIndex(int nextSentenceStartIndex) {
- int end = nextSentenceStartIndex + 1;
- for (; end < events.size(); end++) {
- XMLEvent e = events.get(end);
- if (e.isEndElement()) {
- EndElement se = e.asEndElement();
- if (se.getName().getLocalPart().equals(SyntexVocabulary.SENTENCE)) {
- return end;
+ // the for methods below access the reader, deal with XMLStreamException, and set hasNextElement.
+
+ protected List<XMLEvent> getOneSentence() {
+ // skip the blank char between two sentences.
+ goToNextSentence();
+
+ if (!hasNextElement) {
+ return null;
+ }
+// XMLEvent ev = null;
+// try {
+// ev = (XMLEvent) reader.peek();
+// } catch (XMLStreamException e1) {
+// // TODO Auto-generated catch block
+// e1.printStackTrace();
+// }
+// System.out.println("---" + ev + "----");
+ if (!isOnSentenceStart()) {
+ try {
+ if (reader.peek() == null) {
+ hasNextElement = false;
+ return null;
+ } else if (reader.peek().isEndElement() && reader.peek().asEndElement().getName().getLocalPart().equals("syntex")) {
+ hasNextElement = false;
+ return null;
}
+ } catch (XMLStreamException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
}
+ try {
+ throw new IllegalStateException("Next element is not a start sentence: " + reader.peek());
+ } catch (XMLStreamException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
}
- return -1;
+
+ List<XMLEvent> sentence = new ArrayList<XMLEvent>();
+ while (!isOnSentenceEnd()) {
+ sentence.add((XMLEvent)reader.next());
+ }
+ XMLEvent last = (XMLEvent)reader.next();
+// System.out.println("last: " + last);
+ sentence.add(last);
+ return sentence;
}
- protected String getDocumentPart(String qualifiedId) {
- return qualifiedId.substring(0, qualifiedId.lastIndexOf("_"));
+ protected boolean isOnSentenceStart() {
+ try {
+ if (reader.peek() == null) {
+ hasNextElement = false;
+ return false;
+ }
+// System.out.println(reader.peek());
+ return
+ reader.peek().isStartElement()
+ &&
+ reader.peek().asStartElement().getName().getLocalPart().equals(SyntexVocabulary.SENTENCE);
+ } catch (XMLStreamException e) {
+ throw new IllegalStateException(e);
+ }
}
+ protected boolean isOnSentenceEnd() {
+ try {
+ if (reader.peek() == null) {
+ hasNextElement = false;
+ return false;
+ }
+ return
+ reader.peek().isEndElement()
+ &&
+ reader.peek().asEndElement().getName().getLocalPart().equals(SyntexVocabulary.SENTENCE);
+ } catch (XMLStreamException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+
+ /** used for positionning the reader on the first sentence. */
+ protected void goToNextSentence() {
+ while (!isOnSentenceStart()) {
+ if (!reader.hasNext()) {
+ hasNextElement = false;
+ return;
+ }
+ XMLEvent e = (XMLEvent)reader.next();
+// System.out.println(">" + e + "<");
+ }
+ }
}
}
Modified: trunk/corpusreader/src/test/java/tei/cr/utils/syntex/TestSyntexBuffer.java
===================================================================
--- trunk/corpusreader/src/test/java/tei/cr/utils/syntex/TestSyntexBuffer.java 2008-12-20 21:23:44 UTC (rev 209)
+++ trunk/corpusreader/src/test/java/tei/cr/utils/syntex/TestSyntexBuffer.java 2008-12-21 11:13:51 UTC (rev 210)
@@ -14,6 +14,7 @@
protected void setUp() {
try {
+ // the test file must have 2 sentences in order for the tests to work.
syntexBuffer = new SyntexBuffer(System.getProperty("user.dir") + "/src/test/resources/syntex.xml");
} catch (FileNotFoundException e) {
e.printStackTrace();
@@ -24,38 +25,51 @@
syntexBuffer = null;
}
- public void testGetSentenceIterator() {
- syntexBuffer.getSentenceIterator();
- }
-
- public void testNextSentenceIndex() {
+ public void testGoToNextSentence() {
SyntexSentenceIterator it = syntexBuffer.getSentenceIterator();
- assertEquals(3, it.getNextSentenceIndex());
+ it.goToNextSentence();
+ assertTrue(it.isOnSentenceStart());
}
- public void testNextSentenceEndIndex() {
+// public void testIsOnSentenceStart() {
+// SyntexSentenceIterator it = syntexBuffer.getSentenceIterator()
+// it.goToNextSentence();
+// List<XMLEvent> sentence = it.getNextSentence();
+//
+// }
+
+ public void testGetOneSentence() {
SyntexSentenceIterator it = syntexBuffer.getSentenceIterator();
- int start = it.getNextSentenceIndex();
- assertEquals(464, it.getNextSentenceEndIndex(start));
+ it.goToNextSentence();
+
+ // since there is two sentences in the test file.
+ List<XMLEvent> sentence = it.getOneSentence();
+ assertTrue(sentence != null);
+
+ sentence = it.getOneSentence();
+ assertTrue(sentence != null);
+
+ sentence = it.getOneSentence();
+ assertTrue(sentence == null);
}
- public void testNextSentenceDocumentId() {
+ public void testPeekNextSentenceDocumentId() {
SyntexSentenceIterator it = syntexBuffer.getSentenceIterator();
- assertEquals("300001_1782665", it.getNextSentenceDocumentId());
+ assertEquals("300001_1782665", it.peekNextSentenceDocumentId());
}
- public void testNextSentenceQualifiedId() {
+ public void testPeekNextSentenceQualifiedId() {
SyntexSentenceIterator it = syntexBuffer.getSentenceIterator();
- assertEquals("300001_1782665_1", it.getNextSentenceQualifiedId());
+ assertEquals("300001_1782665_1", it.peekNextSentenceQualifiedId());
}
public void testHasSentence() {
SyntexSentenceIterator it = syntexBuffer.getSentenceIterator();
assertTrue(it.hasNextSentence());
-// List<XMLEvent> sent1 = it.getNextSentence();
+ List<XMLEvent> sent1 = it.getNextSentence();
// System.out.println(sent1);
assertTrue(it.hasNextSentence());
-// List<XMLEvent> sent2 = it.getNextSentence();
+ List<XMLEvent> sent2 = it.getNextSentence();
// System.out.println(sent2);
assertFalse(it.hasNextSentence());
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2008-12-20 21:23:48
|
Revision: 209
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=209&view=rev
Author: sylvainloiseau
Date: 2008-12-20 21:23:44 +0000 (Sat, 20 Dec 2008)
Log Message:
-----------
Checking argument in FileUtils
Modified Paths:
--------------
trunk/corpusreader/src/main/java/tei/cr/utils/FileUtils.java
trunk/corpusreader/src/main/java/tei/cr/utils/sax/StAX2SAXEvent.java
Modified: trunk/corpusreader/src/main/java/tei/cr/utils/FileUtils.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/utils/FileUtils.java 2008-12-20 17:24:08 UTC (rev 208)
+++ trunk/corpusreader/src/main/java/tei/cr/utils/FileUtils.java 2008-12-20 21:23:44 UTC (rev 209)
@@ -7,6 +7,9 @@
final public class FileUtils {
public final static List<File> getFilesOfDirRec (String dirName) {
+ if (dirName == null) {
+ throw new IllegalArgumentException("Dir name may not be null.");
+ }
List<File> list = new ArrayList<File>();
File dir = new File(dirName);
if (!dir.isDirectory()) {
Modified: trunk/corpusreader/src/main/java/tei/cr/utils/sax/StAX2SAXEvent.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/utils/sax/StAX2SAXEvent.java 2008-12-20 17:24:08 UTC (rev 208)
+++ trunk/corpusreader/src/main/java/tei/cr/utils/sax/StAX2SAXEvent.java 2008-12-20 21:23:44 UTC (rev 209)
@@ -116,14 +116,13 @@
default:}
}
-
+
public final static Attributes createSAXAttributes(StartElement s) {
AttributesImpl att = new AttributesImpl();
Iterator i = s.getAttributes();
while (i.hasNext()) {
Attribute a = (Attribute) i.next();
QName aQN = a.getName();
-
String uri = QNameUtils.getNamespaceURI(aQN);
String lname = aQN.getLocalPart();
String qname = QNameUtils.getQName(aQN);
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2008-12-20 17:24:14
|
Revision: 208
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=208&view=rev
Author: sylvainloiseau
Date: 2008-12-20 17:24:08 +0000 (Sat, 20 Dec 2008)
Log Message:
-----------
classes for handling syntex files + various utilities
Modified Paths:
--------------
trunk/corpusreader/src/main/java/tei/cr/Version.java
trunk/corpusreader/src/main/java/tei/cr/filters/MarkBalanced.java
trunk/corpusreader/src/main/java/tei/cr/utils/sax/StAX2SAXEvent.java
trunk/corpusreader/src/test/java/tei/cr/teiScheme/TestAGG.java
Added Paths:
-----------
trunk/corpusreader/src/main/java/tei/cr/utils/FileUtils.java
trunk/corpusreader/src/main/java/tei/cr/utils/syntex/
trunk/corpusreader/src/main/java/tei/cr/utils/syntex/SyntexBuffer.java
trunk/corpusreader/src/main/java/tei/cr/utils/syntex/SyntexVocabulary.java
trunk/corpusreader/src/test/java/tei/cr/utils/syntex/
trunk/corpusreader/src/test/java/tei/cr/utils/syntex/TestSyntexBuffer.java
trunk/corpusreader/src/test/resources/syntex.xml
Modified: trunk/corpusreader/src/main/java/tei/cr/Version.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/Version.java 2008-12-15 21:41:48 UTC (rev 207)
+++ trunk/corpusreader/src/main/java/tei/cr/Version.java 2008-12-20 17:24:08 UTC (rev 208)
@@ -11,7 +11,7 @@
// don't change this lines; regexp are used during build for updating the fields.
private static final String VERSION = "0.1"; // "8.2";
- private static final String RELEASE_DATE = "20081120-1938";
+ private static final String RELEASE_DATE = "20081219-1918";
private static final String PRODUCT_NAME = "CorpusReader";
private Version() {
Modified: trunk/corpusreader/src/main/java/tei/cr/filters/MarkBalanced.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/filters/MarkBalanced.java 2008-12-15 21:41:48 UTC (rev 207)
+++ trunk/corpusreader/src/main/java/tei/cr/filters/MarkBalanced.java 2008-12-20 17:24:08 UTC (rev 208)
@@ -25,7 +25,7 @@
// ************************************** //
private Stack openedElement = new Stack();
- private char[] codePoint = { '(', ')', '\xAB', '\xBB' };
+ private char[] codePoint = { '(', ')', '«', '»' };
private boolean[] isOpening = { true, false, true, false };
private String[] markup = { "parenthesis", "PRE laquo POST raquo" };
Added: trunk/corpusreader/src/main/java/tei/cr/utils/FileUtils.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/utils/FileUtils.java (rev 0)
+++ trunk/corpusreader/src/main/java/tei/cr/utils/FileUtils.java 2008-12-20 17:24:08 UTC (rev 208)
@@ -0,0 +1,28 @@
+package tei.cr.utils;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+
+final public class FileUtils {
+
+ public final static List<File> getFilesOfDirRec (String dirName) {
+ List<File> list = new ArrayList<File>();
+ File dir = new File(dirName);
+ if (!dir.isDirectory()) {
+ throw new IllegalArgumentException("\"" + dirName + "\" is not a directory.");
+ }
+ String[] files = dir.list();
+ for (int i = 0; i < files.length; i++) {
+ String fileName = dirName + "/" + files[i];
+ File f = new File(fileName);
+ if (f.isDirectory()) {
+ list.addAll(getFilesOfDirRec(fileName));
+ } else if (f.isFile()) {
+ list.add(f);
+ }
+ }
+ return list;
+ }
+
+}
Modified: trunk/corpusreader/src/main/java/tei/cr/utils/sax/StAX2SAXEvent.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/utils/sax/StAX2SAXEvent.java 2008-12-15 21:41:48 UTC (rev 207)
+++ trunk/corpusreader/src/main/java/tei/cr/utils/sax/StAX2SAXEvent.java 2008-12-20 17:24:08 UTC (rev 208)
@@ -6,6 +6,7 @@
import org.xml.sax.helpers.AttributesImpl;
import tei.cr.utils.xml.QNameUtils;
import java.util.Iterator;
+import java.util.List;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.events.Attribute;
@@ -17,6 +18,13 @@
*/
final public class StAX2SAXEvent {
+ public static void toContentHandler(List<XMLEvent> events, ContentHandler target)
+ throws SAXException {
+ for (int i = 0; i < events.size(); i++) {
+ toContentHandler(events.get(i), target);
+ }
+ }
+
public static void toContentHandler(XMLEvent event, ContentHandler target)
throws SAXException {
switch (event.getEventType()) {
@@ -87,7 +95,7 @@
case XMLStreamConstants.SPACE:
- // ??? diff\xE9rent de "characters.isSpace" ?
+ // ??? différent de "characters.isSpace" ?
break;
case XMLStreamConstants.START_DOCUMENT:
Added: trunk/corpusreader/src/main/java/tei/cr/utils/syntex/SyntexBuffer.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/utils/syntex/SyntexBuffer.java (rev 0)
+++ trunk/corpusreader/src/main/java/tei/cr/utils/syntex/SyntexBuffer.java 2008-12-20 17:24:08 UTC (rev 208)
@@ -0,0 +1,129 @@
+package tei.cr.utils.syntex;
+
+import java.io.FileNotFoundException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.NoSuchElementException;
+import javax.xml.stream.events.Attribute;
+import javax.xml.stream.events.EndElement;
+import javax.xml.stream.events.StartElement;
+import javax.xml.stream.events.XMLEvent;
+import tei.cr.utils.stax.StAXBuffer;
+import tei.cr.utils.stax.StAXBufferImpl;
+
+public class SyntexBuffer {
+
+ private final StAXBuffer syntex;
+
+ public SyntexBuffer(String uri) throws FileNotFoundException {
+ syntex = new StAXBufferImpl(uri);
+ }
+
+ public SyntexSentenceIterator getSentenceIterator() {
+ return new SyntexSentenceIterator(syntex);
+ }
+
+ public class SyntexSentenceIterator {
+
+ private int currentEventIndex = 0;
+ private List<XMLEvent> events;
+
+ public SyntexSentenceIterator(StAXBuffer syntex) {
+ events = syntex.getXMLEvents();
+ }
+
+ /**
+ * Return the complete (<code>qualified</code>) id of the next sentence.
+ *
+ * @return the complete id.
+ * @throws NoSuchElementException if there is no more sentence.
+ */
+ public String getNextSentenceQualifiedId() {
+ int index = getNextSentenceIndex();
+ if (index == -1) {
+ throw new NoSuchElementException("No more sentence");
+ }
+ Attribute a = events.get(index).asStartElement().getAttributeByName(SyntexVocabulary.SEQ_ELEMENT_ID_ATTRIBUTE);
+ return a.getValue();
+ }
+
+ /**
+ * Return the document part of the next sentence id.
+ *
+ * @return the complete id.
+ * @throws NoSuchElementException if there is no more sentence.
+ */
+ public String getNextSentenceDocumentId() {
+ String qualifiedId = getNextSentenceQualifiedId();
+ return getDocumentPart(qualifiedId);
+ }
+
+ /**
+ * Return the next sentence
+ *
+ * @return a <code>List</code> of <code>XMLEvent</code> representing a complete sentence analysis by syntex.
+ *
+ * @throws NoSuchElementException if there is no more sentence.
+ */
+ public List<XMLEvent> getNextSentence() {
+ List<XMLEvent> sentence = new ArrayList<XMLEvent>();
+ int start = getNextSentenceIndex();
+ if (start == -1) {
+ throw new NoSuchElementException("No more sentence");
+ }
+ int end = getNextSentenceEndIndex(start);
+ if (end == -1) {
+ throw new IllegalStateException("No EndElement for this sentence. Bug somewhere.");
+ }
+ List<XMLEvent> subList = events.subList(start, end);
+ sentence.addAll(subList);
+ currentEventIndex = end;
+ return sentence;
+ }
+
+ public boolean hasNextSentence() {
+ int index = getNextSentenceIndex();
+ return index != -1;
+ }
+
+ /**
+ * Get the index of the StartElement of the next sentence.
+ * @return
+ */
+ protected int getNextSentenceIndex() {
+ for (int i = currentEventIndex; i+1 < events.size(); i++) {
+ XMLEvent e = events.get(i);
+ if (e.isStartElement()) {
+ StartElement se = e.asStartElement();
+ if (se.getName().getLocalPart().equals(SyntexVocabulary.SENTENCE)) {
+ return i;
+ }
+ }
+ }
+ return -1;
+ }
+
+ /**
+ * Get the index of the EndElement of the next sentence.
+ * @return
+ */
+ protected int getNextSentenceEndIndex(int nextSentenceStartIndex) {
+ int end = nextSentenceStartIndex + 1;
+ for (; end < events.size(); end++) {
+ XMLEvent e = events.get(end);
+ if (e.isEndElement()) {
+ EndElement se = e.asEndElement();
+ if (se.getName().getLocalPart().equals(SyntexVocabulary.SENTENCE)) {
+ return end;
+ }
+ }
+ }
+ return -1;
+ }
+
+ protected String getDocumentPart(String qualifiedId) {
+ return qualifiedId.substring(0, qualifiedId.lastIndexOf("_"));
+ }
+
+ }
+}
Added: trunk/corpusreader/src/main/java/tei/cr/utils/syntex/SyntexVocabulary.java
===================================================================
--- trunk/corpusreader/src/main/java/tei/cr/utils/syntex/SyntexVocabulary.java (rev 0)
+++ trunk/corpusreader/src/main/java/tei/cr/utils/syntex/SyntexVocabulary.java 2008-12-20 17:24:08 UTC (rev 208)
@@ -0,0 +1,13 @@
+package tei.cr.utils.syntex;
+
+import javax.xml.namespace.QName;
+
+public class SyntexVocabulary {
+
+ public static final String SENTENCE = "SEQ";
+ public final static QName SEQ_ELEMENT_ID_ATTRIBUTE;
+ static {
+ SEQ_ELEMENT_ID_ATTRIBUTE = new QName("", "id");
+ }
+
+}
Modified: trunk/corpusreader/src/test/java/tei/cr/teiScheme/TestAGG.java
===================================================================
--- trunk/corpusreader/src/test/java/tei/cr/teiScheme/TestAGG.java 2008-12-15 21:41:48 UTC (rev 207)
+++ trunk/corpusreader/src/test/java/tei/cr/teiScheme/TestAGG.java 2008-12-20 17:24:08 UTC (rev 208)
@@ -23,13 +23,13 @@
File f = new File(".");
path = f.getAbsolutePath() + "/src/main/odd/ggx";
}
-
+
protected void setUp() {
}
protected void tearDown() throws Exception {
}
-
+
public void testModules() {
BaseFactory bf = BaseFactory.theFactory();
GraGra graGra = bf.createGraGra();
Added: trunk/corpusreader/src/test/java/tei/cr/utils/syntex/TestSyntexBuffer.java
===================================================================
--- trunk/corpusreader/src/test/java/tei/cr/utils/syntex/TestSyntexBuffer.java (rev 0)
+++ trunk/corpusreader/src/test/java/tei/cr/utils/syntex/TestSyntexBuffer.java 2008-12-20 17:24:08 UTC (rev 208)
@@ -0,0 +1,68 @@
+package tei.cr.utils.syntex;
+
+import java.io.FileNotFoundException;
+import java.util.List;
+
+import javax.xml.stream.events.XMLEvent;
+
+import tei.cr.utils.syntex.SyntexBuffer.SyntexSentenceIterator;
+import junit.framework.TestCase;
+
+public class TestSyntexBuffer extends TestCase {
+
+ SyntexBuffer syntexBuffer;
+
+ protected void setUp() {
+ try {
+ syntexBuffer = new SyntexBuffer(System.getProperty("user.dir") + "/src/test/resources/syntex.xml");
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ }
+ }
+
+ protected void tearDown() {
+ syntexBuffer = null;
+ }
+
+ public void testGetSentenceIterator() {
+ syntexBuffer.getSentenceIterator();
+ }
+
+ public void testNextSentenceIndex() {
+ SyntexSentenceIterator it = syntexBuffer.getSentenceIterator();
+ assertEquals(3, it.getNextSentenceIndex());
+ }
+
+ public void testNextSentenceEndIndex() {
+ SyntexSentenceIterator it = syntexBuffer.getSentenceIterator();
+ int start = it.getNextSentenceIndex();
+ assertEquals(464, it.getNextSentenceEndIndex(start));
+ }
+
+ public void testNextSentenceDocumentId() {
+ SyntexSentenceIterator it = syntexBuffer.getSentenceIterator();
+ assertEquals("300001_1782665", it.getNextSentenceDocumentId());
+ }
+
+ public void testNextSentenceQualifiedId() {
+ SyntexSentenceIterator it = syntexBuffer.getSentenceIterator();
+ assertEquals("300001_1782665_1", it.getNextSentenceQualifiedId());
+ }
+
+ public void testHasSentence() {
+ SyntexSentenceIterator it = syntexBuffer.getSentenceIterator();
+ assertTrue(it.hasNextSentence());
+// List<XMLEvent> sent1 = it.getNextSentence();
+// System.out.println(sent1);
+ assertTrue(it.hasNextSentence());
+// List<XMLEvent> sent2 = it.getNextSentence();
+// System.out.println(sent2);
+ assertFalse(it.hasNextSentence());
+ }
+
+ public void testGetDocumentPart() {
+ SyntexSentenceIterator it = syntexBuffer.getSentenceIterator();
+ String doc = it.getDocumentPart("doc_sentence");
+ assertEquals("doc", doc);
+ }
+}
Added: trunk/corpusreader/src/test/resources/syntex.xml
===================================================================
--- trunk/corpusreader/src/test/resources/syntex.xml (rev 0)
+++ trunk/corpusreader/src/test/resources/syntex.xml 2008-12-20 17:24:08 UTC (rev 208)
@@ -0,0 +1,332 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<syntex>
+ <SEQ id="300001_1782665_1">
+ <TXT>Apr\xE8s l' op\xE9ration " Margarita " lanc\xE9e lundi matin 27 juin dans la r\xE9gion parisienne , \xE0 Marseille , Limoges et Montpellier , Bordeaux et Lille , contre les int\xE9r\xEAts strat\xE9giques et financiers des cartels colombiens en France ( " le Monde " du 28 juin ) , cinquante-sept personnes ont \xE9t\xE9 plac\xE9es en garde \xE0 vue , dont un quart de Colombiens install\xE9s dans l' Hexagone .</TXT>
+ <tokens>
+ <t i="1" l="apr\xE8s" f="Apr\xE8s" c="Prep" p="O"/>
+ <t i="2" l="le" f="l'" c="Det??" p="D"/>
+ <t i="3" l="op\xE9ration" f="op\xE9ration" c="Nom?S" p="N"/>
+ <t i="4" l=""" f=""" c="Typo" p="T"/>
+ <t i="5" l="Margarita" f="Margarita" c="NomPrXXInc" p="NP"/>
+ <t i="6" l=""" f=""" c="Typo" p="T"/>
+ <t i="7" l="lanc\xE9e" f="lanc\xE9e" c="Nom?S" p="N"/>
+ <t i="8" l="lundi" f="lundi" c="NomXXDate" p="N"/>
+ <t i="9" l="matin" f="matin" c="NomXXDate" p="N"/>
+ <t i="10" l="27 juin" f="27 juin" c="NomXXDate" p="N"/>
+ <t i="11" l="dans" f="dans" c="Prep" p="O"/>
+ <t i="12" l="le" f="la" c="Det??" p="D"/>
+ <t i="13" l="r\xE9gion" f="r\xE9gion" c="NomFS" p="N"/>
+ <t i="14" l="parisien" f="parisienne" c="Adj??" p="A"/>
+ <t i="15" l="," f="," c="Typo" p="T"/>
+ <t i="16" l="\xE0" f="\xE0" c="Prep" p="O"/>
+ <t i="17" l="Marseille" f="Marseille" c="NomPr" p="NP"/>
+ <t i="18" l="," f="," c="TypoCoordNomPr" p="T"/>
+ <t i="19" l="Limoges" f="Limoges" c="NomPr" p="NP"/>
+ <t i="20" l="et" f="et" c="CCoordNomPr" p="Cc"/>
+ <t i="21" l="Montpellier" f="Montpellier" c="NomPr" p="NP"/>
+ <t i="22" l="," f="," c="TypoCoordNomPr" p="T"/>
+ <t i="23" l="Bordeaux" f="Bordeaux" c="NomPr" p="NP"/>
+ <t i="24" l="et" f="et" c="CCoordNomPr" p="Cc"/>
+ <t i="25" l="Lille" f="Lille" c="NomPr" p="NP"/>
+ <t i="26" l="," f="," c="Typo" p="T"/>
+ <t i="27" l="contre" f="contre" c="Prep" p="O"/>
+ <t i="28" l="le" f="les" c="Det??" p="D"/>
+ <t i="29" l="int\xE9r\xEAt" f="int\xE9r\xEAts" c="Nom?P" p="N"/>
+ <t i="30" l="strat\xE9gique" f="strat\xE9giques" c="Adj??" p="A"/>
+ <t i="31" l="et" f="et" c="CCoordAdj" p="Cc"/>
+ <t i="32" l="financier" f="financiers" c="Adj??" p="A"/>
+ <t i="33" l="de" f="des" c="Prep" p="O"/>
+ <t i="34" l="cartel" f="cartels" c="Nom?P" p="N"/>
+ <t i="35" l="colombien" f="colombiens" c="NomInc" p="N"/>
+ <t i="36" l="en" f="en" c="Prep" p="O"/>
+ <t i="37" l="France" f="France" c="NomPr" p="NP"/>
+ <t i="38" l="(" f="(" c="Typo" p="T"/>
+ <t i="39" l=""" f=""" c="Typo" p="T"/>
+ <t i="40" l="le" f="le" c="Det??" p="D"/>
+ <t i="41" l="monde" f="Monde" c="NomMS" p="N"/>
+ <t i="42" l=""" f=""" c="Typo" p="T"/>
+ <t i="43" l="de" f="du" c="PrepDet" p="O"/>
+ <t i="44" l="28 juin" f="28 juin" c="NomXXDate" p="N"/>
+ <t i="45" l=")" f=")" c="Typo" p="T"/>
+ <t i="46" l="," f="," c="Typo" p="T"/>
+ <t i="47" l="cinquante-sept" f="cinquante-sept" c="Adj?P" p="A"/>
+ <t i="48" l="personne" f="personnes" c="Nom?P" p="N"/>
+ <t i="49" l="avoir" f="ont" c="VCONJP" p="V"/>
+ <t i="50" l="\xEAtre" f="\xE9t\xE9" c="PpaMSp" p="V"/>
+ <t i="51" l="placer" f="plac\xE9es" c="PpaFP" p="V"/>
+ <t i="52" l="en" f="en" c="Prep" p="O"/>
+ <t i="53" l="garde" f="garde" c="Nom?S" p="N"/>
+ <t i="54" l="\xE0 vue" f="\xE0 vue" c="AdvGP" p="R"/>
+ <t i="55" l="," f="," c="Typo" p="T"/>
+ <t i="56" l="dont" f="dont" c="ProRel" p="P"/>
+ <t i="57" l="un" f="un" c="Det??" p="D"/>
+ <t i="58" l="quart" f="quart" c="NomMS" p="N"/>
+ <t i="59" l="de" f="de" c="Prep" p="O"/>
+ <t i="60" l="Colombien" f="Colombiens" c="NomPrXXInc" p="NP"/>
+ <t i="61" l="installer" f="install\xE9s" c="PpaMP" p="V"/>
+ <t i="62" l="dans" f="dans" c="Prep" p="O"/>
+ <t i="63" l="le" f="l'" c="Det??" p="D"/>
+ <t i="64" l="hexagone" f="Hexagone" c="Nom?S" p="N"/>
+ <t i="65" l="." f="." c="Typo" p="T"/>
+ </tokens>
+ <dependances>
+ <g r="NOMPREP" s="1" c="3"/>
+ <d r="DET" s="2" c="3"/>
+ <g r="DET" s="3" c="2"/>
+ <g r="EPI" s="3" c="5"/>
+ <d r="NOMPREP" s="3" c="1"/>
+ <d r="EPI" s="5" c="3"/>
+ <g r="EPI" s="7" c="8"/>
+ <g r="PREP" s="7" c="11"/>
+ <g r="EPI" s="8" c="9"/>
+ <d r="EPI" s="8" c="7"/>
+ <g r="EPI" s="9" c="10"/>
+ <d r="EPI" s="9" c="8"/>
+ <d r="EPI" s="10" c="9"/>
+ <g r="NOMPREP" s="11" c="13"/>
+ <d r="PREP" s="11" c="7"/>
+ <d r="DET" s="12" c="13"/>
+ <g r="DET" s="13" c="12"/>
+ <g r="ADJ" s="13" c="14"/>
+ <d r="NOMPREP" s="13" c="11"/>
+ <d r="ADJ" s="14" c="13"/>
+ <g r="NOMPREP" s="16" c="20"/>
+ <d r="CC" s="17" c="20"/>
+ <d r="CC" s="19" c="20"/>
+ <g r="CC" s="20" c="17"/>
+ <g r="CC" s="20" c="19"/>
+ <g r="CC" s="20" c="24"/>
+ <d r="NOMPREP" s="20" c="16"/>
+ <d r="CC" s="21" c="24"/>
+ <d r="CC" s="23" c="24"/>
+ <g r="CC" s="24" c="21"/>
+ <g r="CC" s="24" c="23"/>
+ <g r="CC" s="24" c="25"/>
+ <d r="CC" s="24" c="20"/>
+ <d r="CC" s="25" c="24"/>
+ <g r="NOMPREP" s="27" c="29"/>
+ <d r="DET" s="28" c="29"/>
+ <g r="DET" s="29" c="28"/>
+ <g r="ADJ" s="29" c="31"/>
+ <g r="PREP" s="29" c="33"/>
+ <g r="PREP" s="29" c="36"/>
+ <d r="NOMPREP" s="29" c="27"/>
+ <d r="CC" s="30" c="31"/>
+ <g r="CC" s="31" c="30"/>
+ <g r="CC" s="31" c="32"/>
+ <d r="ADJ" s="31" c="29"/>
+ <d r="CC" s="32" c="31"/>
+ <g r="NOMPREP" s="33" c="34"/>
+ <d r="PREP" s="33" c="29"/>
+ <g r="EPI" s="34" c="35"/>
+ <d r="NOMPREP" s="34" c="33"/>
+ <d r="EPI" s="35" c="34"/>
+ <g r="NOMPREP" s="36" c="37"/>
+ <d r="PREP" s="36" c="29"/>
+ <d r="NOMPREP" s="37" c="36"/>
+ <g r="PAR" s="38" c="45"/>
+ <d r="DET" s="40" c="41"/>
+ <g r="DET" s="41" c="40"/>
+ <g r="NOMPREP" s="43" c="44"/>
+ <d r="NOMPREP" s="44" c="43"/>
+ <d r="PAR" s="45" c="38"/>
+ <d r="ADJ" s="47" c="48"/>
+ <g r="ADJ" s="48" c="47"/>
+ <d r="SUJ" s="48" c="49"/>
+ <g r="SUJ" s="49" c="48"/>
+ <g r="AUX" s="49" c="50"/>
+ <g r="AUX" s="50" c="51"/>
+ <d r="AUX" s="50" c="49"/>
+ <g r="PREP" s="51" c="52"/>
+ <d r="AUX" s="51" c="50"/>
+ <g r="NOMPREP" s="52" c="53"/>
+ <d r="PREP" s="52" c="51"/>
+ <d r="NOMPREP" s="53" c="52"/>
+ <d r="DET" s="57" c="58"/>
+ <g r="DET" s="58" c="57"/>
+ <g r="PREP" s="58" c="59"/>
+ <g r="NOMPREP" s="59" c="60"/>
+ <d r="PREP" s="59" c="58"/>
+ <d r="NOMPREP" s="60" c="59"/>
+ <g r="PREP" s="61" c="62"/>
+ <g r="NOMPREP" s="62" c="64"/>
+ <d r="PREP" s="62" c="61"/>
+ <d r="DET" s="63" c="64"/>
+ <g r="DET" s="64" c="63"/>
+ <d r="NOMPREP" s="64" c="62"/>
+ </dependances>
+ </SEQ>
+ <SEQ id="300001_1782665_2">
+ <TXT>Le commissaire Ren\xE9 Wack , directeur de l' Office central pour la r\xE9pression de la grande d\xE9linquance financi\xE8re , a fait , lundi 27 juin \xE0 Nanterre ( Hauts-de-Seine ) , un premier bilan de cette op\xE9ration au cours de laquelle plus de 40 de coca\xEFne et plus dE 1 million de francs ont \xE9t\xE9 saisis.MARCH\xC9S . L' op\xE9ration \xE9tait men\xE9e simultan\xE9ment en France , dans plusieurs pays d' Europe et aux Etats-Unis .</TXT>
+ <tokens>
+ <t i="1" l="le" f="Le" c="Det??" p="D"/>
+ <t i="2" l="commissaire" f="commissaire" c="NomMS" p="N"/>
+ <t i="3" l="Ren\xE9" f="Ren\xE9" c="NomPrXXPrenom" p="NP"/>
+ <t i="4" l="Wack" f="Wack" c="NomPrXXInc" p="NP"/>
+ <t i="5" l="," f="," c="Typo" p="T"/>
+ <t i="6" l="directeur" f="directeur" c="Nom?S" p="N"/>
+ <t i="7" l="de" f="de" c="Prep" p="O"/>
+ <t i="8" l="le" f="l'" c="Det??" p="D"/>
+ <t i="9" l="office" f="Office" c="Nom?S" p="N"/>
+ <t i="10" l="central" f="central" c="Adj??" p="A"/>
+ <t i="11" l="pour" f="pour" c="Prep" p="O"/>
+ <t i="12" l="le" f="la" c="Det??" p="D"/>
+ <t i="13" l="r\xE9pression" f="r\xE9pression" c="NomFS" p="N"/>
+ <t i="14" l="de" f="de" c="Prep" p="O"/>
+ <t i="15" l="le" f="la" c="Det??" p="D"/>
+ <t i="16" l="grand" f="grande" c="Adj??" p="A"/>
+ <t i="17" l="d\xE9linquance" f="d\xE9linquance" c="NomFS" p="N"/>
+ <t i="18" l="financier" f="financi\xE8re" c="Adj??" p="A"/>
+ <t i="19" l="," f="," c="Typo" p="T"/>
+ <t i="20" l="avoir" f="a" c="VCONJS" p="V"/>
+ <t i="21" l="faire" f="fait" c="PpaMS" p="V"/>
+ <t i="22" l="," f="," c="Typo" p="T"/>
+ <t i="23" l="lundi 27 juin" f="lundi 27 juin" c="NomXXDate" p="N"/>
+ <t i="24" l="\xE0" f="\xE0" c="Prep" p="O"/>
+ <t i="25" l="Nanterre" f="Nanterre" c="NomPrXXInc" p="NP"/>
+ <t i="26" l="(" f="(" c="Typo" p="T"/>
+ <t i="27" l="Hauts-de-seine" f="Hauts-de-Seine" c="NomPrXXInc" p="NP"/>
+ <t i="28" l=")" f=")" c="Typo" p="T"/>
+ <t i="29" l="," f="," c="Typo" p="T"/>
+ <t i="30" l="un" f="un" c="Det??" p="D"/>
+ <t i="31" l="premier" f="premier" c="AdjMS" p="A"/>
+ <t i="32" l="bilan" f="bilan" c="NomMS" p="N"/>
+ <t i="33" l="de" f="de" c="Prep" p="O"/>
+ <t i="34" l="ce" f="cette" c="DetFS" p="D"/>
+ <t i="35" l="op\xE9ration" f="op\xE9ration" c="NomFS" p="N"/>
+ <t i="36" l="au cours de" f="au cours de" c="Prep" p="O"/>
+ <t i="37" l="lequel" f="laquelle" c="ProRel" p="P"/>
+ <t i="38" l="plus de" f="plus de" c="Det??" p="D"/>
+ <t i="39" l="40" f="40" c="NomXXNum" p="N"/>
+ <t i="40" l="de" f="de" c="Prep" p="O"/>
+ <t i="41" l="coca\xEFne" f="coca\xEFne" c="Nom?S" p="N"/>
+ <t i="42" l="et" f="et" c="CCoord" p="Cc"/>
+ <t i="43" l="plus de" f="plus dE" c="Det??" p="D"/>
+ <t i="44" l="1" f="1" c="Adj?P" p="A"/>
+ <t i="45" l="million" f="million" c="Nom?S" p="N"/>
+ <t i="46" l="de" f="de" c="Prep" p="O"/>
+ <t i="47" l="francs" f="francs" c="NomXXMon" p="N"/>
+ <t i="48" l="avoir" f="ont" c="VCONJP" p="V"/>
+ <t i="49" l="\xEAtre" f="\xE9t\xE9" c="PpaMS" p="V"/>
+ <t i="50" l="Saisis.march\xE9s" f="saisis.MARCH\xC9S" c="NomPrXXInc" p="NP"/>
+ <t i="51" l="." f="." c="Typo" p="T"/>
+ <t i="52" l="le" f="L'" c="Det??" p="D"/>
+ <t i="53" l="op\xE9ration" f="op\xE9ration" c="Nom?S" p="N"/>
+ <t i="54" l="\xEAtre" f="\xE9tait" c="VCONJSp" p="V"/>
+ <t i="55" l="mener" f="men\xE9e" c="PpaFS" p="V"/>
+ <t i="56" l="simultan\xE9ment" f="simultan\xE9ment" c="Adv" p="R"/>
+ <t i="57" l="en" f="en" c="Prep" p="O"/>
+ <t i="58" l="France" f="France" c="NomPr" p="NP"/>
+ <t i="59" l="," f="," c="TypoCoordPrep" p="T"/>
+ <t i="60" l="dans" f="dans" c="Prep" p="O"/>
+ <t i="61" l="plusieurs" f="plusieurs" c="Det" p="D"/>
+ <t i="62" l="pays" f="pays" c="Nom?P" p="N"/>
+ <t i="63" l="de" f="d'" c="Prep" p="O"/>
+ <t i="64" l="Europe" f="Europe" c="NomPr" p="NP"/>
+ <t i="65" l="et" f="et" c="CCoordPrep" p="Cc"/>
+ <t i="66" l="\xE0" f="aux" c="Prep" p="O"/>
+ <t i="67" l="\xE9tats-unis" f="Etats-Unis" c="NomPr" p="NP"/>
+ <t i="68" l="." f="." c="Typo" p="T"/>
+ </tokens>
+ <dependances>
+ <d r="DET" s="1" c="2"/>
+ <g r="DET" s="2" c="1"/>
+ <g r="EPI" s="2" c="4"/>
+ <d r="SUJ" s="2" c="20"/>
+ <d r="NNPR" s="3" c="4"/>
+ <g r="NNPR" s="4" c="3"/>
+ <d r="EPI" s="4" c="2"/>
+ <g r="PREP" s="6" c="7"/>
+ <g r="NOMPREP" s="7" c="9"/>
+ <d r="PREP" s="7" c="6"/>
+ <d r="DET" s="8" c="9"/>
+ <g r="DET" s="9" c="8"/>
+ <g r="ADJ" s="9" c="10"/>
+ <g r="PREP" s="9" c="11"/>
+ <d r="NOMPREP" s="9" c="7"/>
+ <d r="ADJ" s="10" c="9"/>
+ <g r="NOMPREP" s="11" c="13"/>
+ <d r="PREP" s="11" c="9"/>
+ <d r="DET" s="12" c="13"/>
+ <g r="DET" s="13" c="12"/>
+ <g r="PREP" s="13" c="14"/>
+ <d r="NOMPREP" s="13" c="11"/>
+ <g r="NOMPREP" s="14" c="17"/>
+ <d r="PREP" s="14" c="13"/>
+ <d r="DET" s="15" c="17"/>
+ <d r="ADJ" s="16" c="17"/>
+ <g r="DET" s="17" c="15"/>
+ <g r="ADJ" s="17" c="16"/>
+ <g r="ADJ" s="17" c="18"/>
+ <d r="NOMPREP" s="17" c="14"/>
+ <d r="ADJ" s="18" c="17"/>
+ <g r="SUJ" s="20" c="2"/>
+ <g r="AUX" s="20" c="21"/>
+ <g r="OBJ" s="21" c="32"/>
+ <d r="AUX" s="21" c="20"/>
+ <g r="NOMPREP" s="24" c="25"/>
+ <d r="NOMPREP" s="25" c="24"/>
+ <g r="PAR" s="26" c="28"/>
+ <d r="PAR" s="28" c="26"/>
+ <d r="DET" s="30" c="32"/>
+ <d r="ADJ" s="31" c="32"/>
+ <g r="DET" s="32" c="30"/>
+ <g r="ADJ" s="32" c="31"/>
+ <g r="PREP" s="32" c="33"/>
+ <d r="OBJ" s="32" c="21"/>
+ <g r="NOMPREP" s="33" c="35"/>
+ <d r="PREP" s="33" c="32"/>
+ <d r="DET" s="34" c="35"/>
+ <g r="DET" s="35" c="34"/>
+ <g r="REL" s="35" c="37"/>
+ <d r="NOMPREP" s="35" c="33"/>
+ <g r="NOMPREP" s="36" c="37"/>
+ <d r="NOMPREP" s="37" c="36"/>
+ <d r="REL" s="37" c="35"/>
+ <d r="DET" s="38" c="39"/>
+ <g r="DET" s="39" c="38"/>
+ <g r="PREP" s="39" c="40"/>
+ <g r="NOMPREP" s="40" c="41"/>
+ <d r="PREP" s="40" c="39"/>
+ <d r="NOMPREP" s="41" c="40"/>
+ <d r="DET" s="43" c="45"/>
+ <d r="ADJ" s="44" c="45"/>
+ <g r="DET" s="45" c="43"/>
+ <g r="ADJ" s="45" c="44"/>
+ <g r="PREP" s="45" c="46"/>
+ <g r="NOMPREP" s="46" c="47"/>
+ <d r="PREP" s="46" c="45"/>
+ <d r="NOMPREP" s="47" c="46"/>
+ <g r="AUX" s="48" c="49"/>
+ <g r="ATTS" s="49" c="50"/>
+ <d r="AUX" s="49" c="48"/>
+ <d r="ATTS" s="50" c="49"/>
+ <d r="DET" s="52" c="53"/>
+ <g r="DET" s="53" c="52"/>
+ <d r="SUJ" s="53" c="54"/>
+ <g r="SUJ" s="54" c="53"/>
+ <g r="AUX" s="54" c="55"/>
+ <g r="PREP" s="55" c="65"/>
+ <d r="AUX" s="55" c="54"/>
+ <g r="NOMPREP" s="57" c="58"/>
+ <d r="CC" s="57" c="65"/>
+ <d r="NOMPREP" s="58" c="57"/>
+ <g r="NOMPREP" s="60" c="62"/>
+ <d r="CC" s="60" c="65"/>
+ <d r="DET" s="61" c="62"/>
+ <g r="DET" s="62" c="61"/>
+ <g r="PREP" s="62" c="63"/>
+ <d r="NOMPREP" s="62" c="60"/>
+ <g r="NOMPREP" s="63" c="64"/>
+ <d r="PREP" s="63" c="62"/>
+ <d r="NOMPREP" s="64" c="63"/>
+ <g r="CC" s="65" c="57"/>
+ <g r="CC" s="65" c="60"/>
+ <g r="CC" s="65" c="66"/>
+ <d r="PREP" s="65" c="55"/>
+ <g r="NOMPREP" s="66" c="67"/>
+ <d r="CC" s="66" c="65"/>
+ <d r="NOMPREP" s="67" c="66"/>
+ </dependances>
+ </SEQ>
+</syntex>
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2008-12-15 22:21:08
|
Revision: 206
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=206&view=rev
Author: sylvainloiseau
Date: 2008-12-15 21:40:31 +0000 (Mon, 15 Dec 2008)
Log Message:
-----------
New implementation of specificites
Added Paths:
-----------
trunk/corpusreader/src/main/R/specificites.R
Added: trunk/corpusreader/src/main/R/specificites.R
===================================================================
--- trunk/corpusreader/src/main/R/specificites.R (rev 0)
+++ trunk/corpusreader/src/main/R/specificites.R 2008-12-15 21:40:31 UTC (rev 206)
@@ -0,0 +1,106 @@
+specificites <- function(corpus, souscorpus) {
+  ## Sylvain Loiseau
+  ## Last edited Saturday 13 December 2008, 19:38:22 (UTC+0100)
+  ##
+  ## Specificity index of the forms of a sub-corpus with respect to a corpus,
+  ## measured with the hypergeometric distribution.
+  ##
+  ## Arguments:
+  ##   corpus     - either a data frame of counts, whose column sums are used
+  ##                as the corpus frequencies, or an already computed
+  ##                frequency list, which is used as is.
+  ##   souscorpus - table of counts for the sub-corpus; its column sums are
+  ##                used as the sub-corpus frequencies.
+  ##
+  ## Returns the table built by specificitesFrequencyLists():
+  ##   one row per form of the sub-corpus,
+  ##   columns: specificity index, sub-frequency, frequency.
+  ##
+
+  ## Frequencies in the corpus.
+  ## inherits() also accepts objects whose class vector has several entries,
+  ## where `class(x) == "data.frame"` would yield a condition of length > 1.
+  if (inherits(corpus, "data.frame")) {
+    frequencesCorpus <- colSums(corpus)
+    print("Le corpus est un data frame")
+  } else {
+    frequencesCorpus <- corpus
+  }
+
+  print(paste("Nombre de variables dans le corpus : ", length(frequencesCorpus)))
+
+  ## The parameter is spelled `souscorpus`; bind it to the local name before
+  ## use, otherwise `sousCorpus` is undefined (or silently taken from the
+  ## calling environment).
+  sousCorpus <- souscorpus
+
+  ## Clean the sub-corpus: remove forms (columns) and parts (rows) that hold
+  ## no occurrence at all. drop = FALSE keeps a two-dimensional table even
+  ## when a single column or row remains, so colSums() still applies.
+  sousCorpus <- sousCorpus[, colSums(sousCorpus) > 0, drop = FALSE]
+  sousCorpus <- sousCorpus[rowSums(sousCorpus) > 0, , drop = FALSE]
+
+  ## Frequencies in the sub-corpus
+  sousFrequences <- colSums(sousCorpus)
+
+  specificitesFrequencyLists(frequencesCorpus, sousFrequences)
+}
+
+specificitesFrequencyLists <- function(frequencesCorpus, sousFrequences) {
+  ## Specificity index of each form of a sub-corpus, computed from two
+  ## frequency lists with the hypergeometric distribution (phyper).
+  ##
+  ## Arguments:
+  ##   frequencesCorpus - named numeric vector, frequency of each form in the
+  ##                      whole corpus.
+  ##   sousFrequences   - named numeric vector, frequency of each form in the
+  ##                      sub-corpus; its names are looked up in
+  ##                      frequencesCorpus.
+  ##
+  ## Returns a numeric matrix with one row per form of the sub-corpus and
+  ## three columns (specificity index, sub-frequency, corpus frequency),
+  ## sorted by decreasing index.
+  ## Stops if either list contains NA, or if the index cannot be computed
+  ## for some form.
+  if (any(is.na(frequencesCorpus))) {
+    stop("Valeurs non num\xE9riques dans la liste des frequences du corpus.")
+  }
+  if (any(is.na(sousFrequences))) {
+    stop("Valeurs non num\xE9riques dans la liste des fr\xE9quences du sous corpus.")
+  }
+
+  ## Number of forms to process
+  nbrFormes <- length(sousFrequences)
+  print(paste("Nombre de formes dans le sous corpus :", nbrFormes))
+
+  ## Keep, from the frequencies of the main corpus, only the forms that
+  ## occur in the sub-corpus. Looking up a missing name yields an NA entry
+  ## rather than a shorter vector, so missing forms are detected through NA
+  ## as well as through the length.
+  frequencesTotales <- frequencesCorpus[names(sousFrequences)]
+  if (length(frequencesTotales) < nbrFormes || any(is.na(frequencesTotales))) {
+    warning("Attention : toutes les variables du sous corpus n'ont pas \xE9t\xE9 trouv\xE9es dans le corpus.")
+  }
+
+  ## Number of occurrences in the corpus and in its sub-corpus
+  longueurCorpus <- sum(frequencesCorpus)
+  longueurSousCorpus <- sum(sousFrequences)
+  print(paste("Longueur du corpus :", longueurCorpus))
+  print(paste("Longueur du sous corpus :", longueurSousCorpus))
+
+  ## For each total frequency, the number of occurrences of the corpus that
+  ## are *not* this form (third argument of phyper).
+  longueurCorpusMoinsFrequencesTotales <- longueurCorpus - frequencesTotales
+
+  ## Cumulative probability, for each form, of observing at most its
+  ## sub-frequency in the sub-corpus, given its frequency in the main corpus
+  ## and the sizes of the two corpora.
+  cumulative <- phyper(sousFrequences, frequencesTotales, longueurCorpusMoinsFrequencesTotales, longueurSousCorpus)
+
+  ## The index starts from the cumulative probability. It was previously left
+  ## as a vector of zeros, which made every index either 0 or 1.
+  specificiteIndex <- cumulative
+  if (any(is.na(specificiteIndex))) {
+    stop("NA in specificiteIndex")
+  }
+
+  ## A cumulative probability below 0.5 means *fewer* occurrences than chance
+  ## would predict and is kept as is; from 0.5 upwards there are more
+  ## occurrences than expected and the complementary probability is used.
+  ## NOTE(review): the original comment speaks of inverting the signs, but no
+  ## sign is applied to the index - confirm the intended scale.
+  surRepresente <- cumulative >= 0.5
+  specificiteIndex[surRepresente] <- 1 - specificiteIndex[surRepresente]
+
+  m <- matrix(
+    c(specificiteIndex, sousFrequences, frequencesTotales),
+    nrow = nbrFormes,
+    ncol = 3,
+    dimnames = list(names(sousFrequences), c("Indice de sp\xE9cificit\xE9", "Sous fr\xE9quence", "Fr\xE9quence absolue"))
+  )
+
+  ## drop = FALSE keeps the result a matrix when there is a single form.
+  m[order(m[, 1], decreasing = TRUE), , drop = FALSE]
+}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <syl...@us...> - 2008-12-15 22:20:56
|
Revision: 207
http://corpusreader.svn.sourceforge.net/corpusreader/?rev=207&view=rev
Author: sylvainloiseau
Date: 2008-12-15 21:41:48 +0000 (Mon, 15 Dec 2008)
Log Message:
-----------
doc
Modified Paths:
--------------
trunk/corpusreader/src/main/R/specificites.R
Modified: trunk/corpusreader/src/main/R/specificites.R
===================================================================
--- trunk/corpusreader/src/main/R/specificites.R 2008-12-15 21:40:31 UTC (rev 206)
+++ trunk/corpusreader/src/main/R/specificites.R 2008-12-15 21:41:48 UTC (rev 207)
@@ -1,5 +1,6 @@
specificites <- function(corpus, souscorpus) {
## Sylvain Loiseau
+ ## 2006-2008
## Derni\xE8re \xE9dition samedi 13 d\xE9cembre 2008, 19:38:22 (UTC+0100)
##
## Indice de sp\xE9cificit\xE9 des formes d'un sous corpus par rapport \xE0 un corpus.
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|