[Pzfilereader-svn] SF.net SVN: pzfilereader: [151] trunk/PZFileReader/src

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 151
          http://svn.sourceforge.net/pzfilereader/?rev=151&view=rev
Author:   benoitx
Date:     2006-10-30 08:46:27 -0800 (Mon, 30 Oct 2006)

Log Message:
-----------
first cut at reg expressions...

Added Paths:
-----------
    trunk/PZFileReader/src/main/java/net/sf/pzfilereader/util/RegExParser.java
    trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/RegExParserTest.java

Added: trunk/PZFileReader/src/main/java/net/sf/pzfilereader/util/RegExParser.java
===================================================================

--- trunk/PZFileReader/src/main/java/net/sf/pzfilereader/util/RegExParser.java	                        (rev 0)
+++ trunk/PZFileReader/src/main/java/net/sf/pzfilereader/util/RegExParser.java	2006-10-30 16:46:27 UTC (rev 151)
@@ -0,0 +1,114 @@
+/**
+ * 
+ */
+package net.sf.pzfilereader.util;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * @author xhensevb
+ * 
+ */
+public class RegExParser {
+
+    // ///////////////////////////
+    /**
+     * The rather involved pattern used to match CSV's consists of three
+     * alternations: the first matches a quoted field, the second unquoted, the
+     * third a null field.
+     */
+    public static final String CSV_PATTERN = "\"(.*?)\",|(\\w+),|\"(.*?)\"|(\\w+),|,";
+    public static final String ORIGINAL_CSV_PATTERN = "\"([^\"]+?)\",?|([^,]+),?|,";
+
+    // public static final String CSV_PATTERN = "\"([^\"]+?)\",?|([^,]+),?|,";
+
+//    private static Pattern csvRE = Pattern.compile(CSV_PATTERN);
+
+    public static List splitLine(String line, final char delimiter, char qualifier) {
+        StringBuilder patternBuilder = new StringBuilder();
+
+        if (qualifier == 0) {
+            qualifier = '\"';
+        }
+
+        String qualif = escapeIfRequired(qualifier);
+        String delim = escapeIfRequired(delimiter);
+
+        // first Pattern
+        if (qualifier != 0) {
+            patternBuilder.append(qualif);
+        }
+        patternBuilder.append("(.*?)");
+        if (qualifier != 0) {
+            patternBuilder.append(qualif);
+        }
+        patternBuilder.append(delim);
+
+        // second Pattern
+        patternBuilder.append("|(\\w+)");
+        patternBuilder.append(delim);
+
+        // Third Pattern
+        patternBuilder.append("|");
+        if (qualifier != 0) {
+            patternBuilder.append(qualif);
+        }
+        patternBuilder.append("(.*?)");
+        if (qualifier != 0) {
+            patternBuilder.append(qualif);
+        }
+        patternBuilder.append("|(\\w+)");
+        patternBuilder.append(delim);
+
+        // Fourth Pattern
+        patternBuilder.append("|").append(delim);
+
+        String pat = patternBuilder.toString();
+
+        System.out.println(pat);
+
+        Pattern pattern = Pattern.compile(pat);
+
+        return parse(pattern, line, String.valueOf(delimiter), String.valueOf(qualifier));
+    }
+
+    private static String escapeIfRequired(final char c) {
+        if (c == 0) {
+            return "";
+        }
+        if ("([{\\^-$|]})?*+.\"\'".indexOf(c) >= 0) {
+            return "\\" + c;
+        }
+        return String.valueOf(c);
+    }
+
+    /**
+     * Parse one line.
+     * 
+     * @return List of Strings, minus their double quotes
+     */
+    public static List parse(Pattern pattern, String line, String delimiter, String qualifier) {
+        List list = new ArrayList();
+        Matcher m = pattern.matcher(line);
+        // For each field
+        while (m.find()) {
+            String match = m.group();
+            if (match == null)
+                break;
+            if (match.endsWith(delimiter)) { // trim trailing ,
+                match = match.substring(0, match.length() - 1);
+            }
+            if (match.startsWith(qualifier)) { // assume also ends with
+                match = match.substring(1, match.length() - 1);
+            }
+            if (match.length() == 0)
+                match = null;
+            list.add(match);
+        }
+        return list;
+    }
+
+}

Added: trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/RegExParserTest.java
===================================================================
--- trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/RegExParserTest.java	                        (rev 0)
+++ trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/RegExParserTest.java	2006-10-30 16:46:27 UTC (rev 151)
@@ -0,0 +1,144 @@
+package net.sf.pzfilereader.parserutils;
+
+import java.util.List;
+
+import junit.framework.TestCase;
+import net.sf.pzfilereader.util.ParserUtils;
+import net.sf.pzfilereader.util.RegExParser;
+import net.sf.pzfilereader.utilities.UnitTestUtils;
+
+/**
+ * Test the functionality of the splitLine method. This method returns a List of
+ * Strings. Each element of the list represents a column created by the parser
+ * from the delimited String.
+ * 
+ * @author Paul Zepernick
+ */
+public class RegExParserTest extends TestCase {
+    private static final String[] DELIMITED_DATA_NO_BREAKS = { "Column 1", "Column 2", "Column 3", "Column 4", "Column 5" };
+
+    private static final String[] DELIMITED_DATA_WITH_BREAKS = { "Column 1 \r\n\r\n Test After Break \r\n Another Break",
+            "Column 2", "Column 3 \r\n\r\n Test After Break", "Column 4", "Column 5 \r\n\r\n Test After Break\r\n Another Break" };
+
+    // TODO think of a situation that actually breaks the parse. This still
+    // works because of the way it is coded
+    // to handle the excel CSV. Excel CSV has some elements qualified and others
+    // not
+    private static final String DELIMITED_BAD_DATA = "\"column 1\",\"column 2 ,\"column3\"";
+
+    // 0 = delimiter
+    // 1 = qualifier
+    private static final char[][] DELIM_QUAL_PAIR = { { ',', '\"' }, { '\t', '\"' }, { '|', '\"' }, { '_', '\"' }, { ',', 0 },
+            { '|', 0 }, { '\t', 0 } };
+
+    /**
+     * Test without any line breaks
+     * 
+     */
+    public void testNoLineBreaks() {
+        // loop down all delimiter qualifier pairs to test
+        for (int i = 0; i < DELIM_QUAL_PAIR.length; i++) {
+            final char d = DELIM_QUAL_PAIR[i][0];
+            final char q = DELIM_QUAL_PAIR[i][1];
+
+            final String txtToParse = UnitTestUtils.buildDelimString(DELIMITED_DATA_NO_BREAKS, d, q);
+
+            final List splitLineResults = RegExParser.splitLine(txtToParse, d, q);
+
+            // check to make sure we have the same amount of elements which were
+            // expected
+            assertEquals("Expected size (d = [" + d + "] q = [" + (q != 0 ? String.valueOf(q) : "") + "] txt [" + txtToParse
+                    + "])", DELIMITED_DATA_NO_BREAKS.length, splitLineResults.size());
+
+            // loop through each value and compare what came back
+            for (int j = 0; j < DELIMITED_DATA_NO_BREAKS.length; j++) {
+                assertEquals("Data Element Value Does Not Match (d = [" + d + "] q = [" + q + "] txt [" + txtToParse + "])",
+                        DELIMITED_DATA_NO_BREAKS[j], (String) splitLineResults.get(j));
+            }
+        }
+
+    }
+
+    /**
+     * Test with line breaks embedded in the data elements
+     * 
+     */
+    public void testLineBreaks() {
+        // loop down all delimiter qualifier pairs to test
+        for (int i = 0; i < DELIM_QUAL_PAIR.length; i++) {
+            final char d = DELIM_QUAL_PAIR[i][0];
+            final char q = DELIM_QUAL_PAIR[i][1];
+
+            final String txtToParse = UnitTestUtils.buildDelimString(DELIMITED_DATA_WITH_BREAKS, d, q);
+
+            final List splitLineResults = RegExParser.splitLine(txtToParse, d, q);
+
+            // check to make sure we have the same amount of elements which were
+            // expected
+            assertEquals("Did Not Get Amount Of Elements Expected (d = " + d + " q = " + q + ")",
+                    DELIMITED_DATA_WITH_BREAKS.length, splitLineResults.size());
+
+            // loop through each value and compare what came back
+            for (int j = 0; j < DELIMITED_DATA_WITH_BREAKS.length; j++) {
+                assertEquals("Data Element Value Does Not Match (d = " + d + " q = " + q + ")", DELIMITED_DATA_WITH_BREAKS[j],
+                        (String) splitLineResults.get(j));
+            }
+        }
+    }
+
+    /**
+     * Test to make sure we get the correct number of elements for malformed
+     * data
+     */
+    public void testMalformedData() {
+        final List splitLineResults = ParserUtils.splitLine(DELIMITED_BAD_DATA, ',', '\"');
+
+        assertEquals("Expecting 2 Data Elements From The Malformed Data", 2, splitLineResults.size());
+    }
+
+    /**
+     * Test some extreme cases
+     */
+    public void testSomeExtremeCases() {
+        check("\"a,b,c\"", ',', '\"', new String[] { "a,b,c" });
+        check("\"a,b\",\"c\"", ',', '\"', new String[] { "a,b", "c" });
+        check("a,b,c", ',', '\"', new String[] { "a", "b", "c" });
+        check("  a,b,c", ',', '\"', new String[] { "a", "b", "c" });
+        check("  a,b,c", ',', '\"', new String[] { "a", "b", "c" });
+
+        // what would you expect of these ones?
+
+        // +++++The parser allows qualified and unqualified elements to be
+        // contained
+        // on the same line. so it should break the elements down like so
+        // 1 = a" -->" is part of the data since the element did not start with
+        // a qualifier
+        // 2 = b
+        // 3 = c" --> same as #1
+        check("a\",b,c\"", ',', '\"', new String[] { "a\"", "b", "c\"" });
+
+        check("\"  a,b,c\"", ',', '\"', new String[] { "a,b,c" });
+        // check(" a, b ,c ", ',', '\"', new String[] { "a","b","c" });
+        // ++++++I think this should probably generate this
+        check("  a, b ,c ", ',', '\"', new String[] { "a, b ,c" });
+
+        // Paul... please put some more whacky stuff here...
+
+    }
+
+    private void check(final String txtToParse, final char delim, final char qualifier, final String[] expected) {
+        final List splitLineResults = ParserUtils.splitLine(txtToParse, delim, qualifier);
+
+        assertEquals(
+                "Did Not Get Amount Of Elements Expected (d = " + delim + " q = " + qualifier + ") txt [" + txtToParse + "]",
+                expected.length, splitLineResults.size());
+
+        for (int i = 0; i < expected.length; i++) {
+            assertEquals("expecting...", expected[i], splitLineResults.get(i));
+        }
+    }
+
+    public static void main(final String[] args) {
+        junit.textui.TestRunner.run(RegExParserTest.class);
+    }
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.