[Pzfilereader-svn] SF.net SVN: pzfilereader: [160] trunk/PZFileReader/src

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 160
          http://svn.sourceforge.net/pzfilereader/?rev=160&view=rev
Author:   benoitx
Date:     2006-10-31 09:12:14 -0800 (Tue, 31 Oct 2006)

Log Message:
-----------
Paul, I've added some basic tests for null, empty and ",,,"-style inputs. I've also had a go at a parser: the regular-expression approach is a dead end, or will become **extremely** complex, because of our special and wacky cases... The basic tests break quite a few things in the current version. I'll run a couple of speed tests to see where we're going...

Modified Paths:
--------------
    trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/ParserUtilsSplitLineTest.java

Added Paths:
-----------
    trunk/PZFileReader/src/main/java/net/sf/pzfilereader/util/BXParser.java
    trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/BXParserTest.java

Added: trunk/PZFileReader/src/main/java/net/sf/pzfilereader/util/BXParser.java
===================================================================

--- trunk/PZFileReader/src/main/java/net/sf/pzfilereader/util/BXParser.java	                        (rev 0)
+++ trunk/PZFileReader/src/main/java/net/sf/pzfilereader/util/BXParser.java	2006-10-31 17:12:14 UTC (rev 160)
@@ -0,0 +1,110 @@
+/**
+ * 
+ */
+package net.sf.pzfilereader.util;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Splits one line of delimited text into column values, honouring an optional qualifier (quote) character.
+ * @author xhensevb
+ */
+public class BXParser {
+    public static List splitLine(String line, final char delimiter, char qualifier) { // returns a raw List whose elements are String or null
+        List list = new ArrayList();
+
+        if (delimiter == 0) { // no delimiter: the whole line (possibly null) is the only element
+            list.add(line);
+            return list;
+        } else if (line == null) { // null input yields an empty list
+            return list;
+        }
+
+        final String trimmedLine = line.trim();
+        int size = trimmedLine.length();
+
+        if (size == 0) { // blank line: a single null column
+            list.add(null);
+            return list;
+        }
+
+        boolean insideQualifier = false; // true between an opening qualifier and the qualifier that closes it
+        char previousChar = 0; // 0 doubles as "no previous character yet"
+        int startBlock = 0; // index where the current column's text starts
+        int endBlock = 0; // exclusive end index of the current column's text
+        boolean blockWasInQualifier = false; // set when the closing-qualifier branch below runs for the current column
+
+        final String doubleQualifier = String.valueOf(qualifier) + String.valueOf(qualifier);
+        for (int i = 0; i < size; i++) {
+
+            final char currentChar = trimmedLine.charAt(i);
+
+            if (currentChar != delimiter && currentChar != qualifier) { // ordinary data character: extend the block
+                previousChar = currentChar;
+                endBlock = i + 1;
+                continue;
+            }
+
+            if (currentChar == delimiter) {
+                // delimiter found (e.g. ','); it ends a column only when we are not inside a qualified section
+                if (!insideQualifier) {
+                    String trimmed = trimmedLine.substring(startBlock, endBlock > startBlock ? endBlock : startBlock + 1);
+                    if (!blockWasInQualifier) { // only unqualified text is whitespace-trimmed
+                        trimmed = trimmed.trim();
+                    }
+
+                    trimmed = trimmed.replaceAll(doubleQualifier, String.valueOf(qualifier)); // NOTE(review): first argument is a regex; a metacharacter qualifier may misbehave
+
+                    if (trimmed.length() == 1 && trimmed.charAt(0) == delimiter) { // empty block: the substring fell back to the delimiter itself
+                        list.add(null); // NOTE(review): a qualified column holding only the delimiter appears to land here too - confirm
+                    } else if (trimmed.length() == 1 && trimmed.charAt(0) == qualifier) { // lone qualifier left over, e.g. from ""
+                        list.add("");
+                    } else {
+                        list.add(trimmed);
+                    }
+                    blockWasInQualifier = false;
+                    startBlock = i + 1;
+                }
+            } else if (currentChar == qualifier) {
+                if (!insideQualifier && previousChar != qualifier) {
+                    if (previousChar == delimiter || previousChar == 0 || previousChar == ' ') { // opening qualifier
+                        insideQualifier = true;
+                        startBlock = i + 1;
+                    } else { // qualifier in the middle of unqualified text: keep it as data
+                        endBlock = i + 1;
+                    }
+                } else { // inside a qualified section, or directly after another qualifier: treat as closing
+                    insideQualifier = false;
+                    blockWasInQualifier = true;
+                    endBlock = i;
+                    // qualifier is the last character of the line: emit the final column without it
+                    if (i == size - 1) {
+                        list.add(trimmedLine.substring(startBlock, size - 1));
+                        startBlock = i + 1;
+                    }
+                }
+            }
+            // remember this delimiter/qualifier so the next iteration can look one character back
+            previousChar = currentChar;
+        }
+
+        if (startBlock < size) { // text remains after the last column boundary: emit it as the final column
+            String str = trimmedLine.substring(startBlock, size);
+            str = str.replaceAll(doubleQualifier, String.valueOf(qualifier));
+            if (blockWasInQualifier) {
+                if (str.charAt(str.length() - 1) == qualifier) { // drop a trailing qualifier
+                    list.add(str.substring(0, str.length() - 1));
+                } else {
+                    list.add(str);
+                }
+            } else {
+                list.add(str.trim());
+            }
+        } else if (trimmedLine.charAt(size - 1) == delimiter) { // line ends with a delimiter: trailing column is null
+            list.add(null);
+        }
+
+        return list;
+    }
+}

Added: trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/BXParserTest.java
===================================================================
--- trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/BXParserTest.java	                        (rev 0)
+++ trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/BXParserTest.java	2006-10-31 17:12:14 UTC (rev 160)
@@ -0,0 +1,151 @@
+package net.sf.pzfilereader.parserutils;
+
+import java.util.List;
+
+import junit.framework.TestCase;
+import net.sf.pzfilereader.util.BXParser;
+import net.sf.pzfilereader.util.ParserUtils;
+import net.sf.pzfilereader.util.RegExParser;
+import net.sf.pzfilereader.utilities.UnitTestUtils;
+
+/**
+ * Tests BXParser.splitLine. That method returns a List in which each element
+ * is one column (a String, or null) produced by the parser from the
+ * delimited String.
+ * 
+ * @author Paul Zepernick
+ */
+public class BXParserTest extends TestCase {
+    private static final String[] DELIMITED_DATA_NO_BREAKS = { "Column 1", "Column 2", "Column 3", "Column 4", "Column 5" };
+
+    private static final String[] DELIMITED_DATA_WITH_BREAKS = { "Column 1 \r\n\r\n Test After Break \r\n Another Break",
+            "Column 2", "Column 3 \r\n\r\n Test After Break", "Column 4", "Column 5 \r\n\r\n Test After Break\r\n Another Break" };
+
+    // TODO think of a situation that actually breaks the parse. This still
+    // works because of the way it is coded
+    // to handle the excel CSV. Excel CSV has some elements qualified and others
+    // not
+    private static final String DELIMITED_BAD_DATA = "\"column 1\",\"column 2 ,\"column3\"";
+
+    // index 0 of each pair = delimiter
+    // index 1 of each pair = qualifier (0 presumably means "no qualifier")
+    private static final char[][] DELIM_QUAL_PAIR = { { ',', '\"' }, { '\t', '\"' }, { '|', '\"' }, { '_', '\"' }, { ',', 0 },
+            { '|', 0 }, { '\t', 0 } };
+
+    /**
+     * Test data that contains no line breaks, for every delimiter/qualifier pair
+     * 
+     */
+    public void testNoLineBreaks() {
+        // loop down all delimiter qualifier pairs to test
+        for (int i = 0; i < DELIM_QUAL_PAIR.length; i++) {
+            final char d = DELIM_QUAL_PAIR[i][0];
+            final char q = DELIM_QUAL_PAIR[i][1];
+
+            final String txtToParse = UnitTestUtils.buildDelimString(DELIMITED_DATA_NO_BREAKS, d, q);
+
+            final List splitLineResults = BXParser.splitLine(txtToParse, d, q);
+
+            // check to make sure we have the same amount of elements which were
+            // expected
+            assertEquals("Expected size (d = [" + d + "] q = [" + (q != 0 ? String.valueOf(q) : "") + "] txt [" + txtToParse
+                    + "])", DELIMITED_DATA_NO_BREAKS.length, splitLineResults.size());
+
+            // loop through each value and compare what came back
+            for (int j = 0; j < DELIMITED_DATA_NO_BREAKS.length; j++) {
+                assertEquals("Data Element Value Does Not Match (d = [" + d + "] q = [" + q + "] txt [" + txtToParse + "])",
+                        DELIMITED_DATA_NO_BREAKS[j], (String) splitLineResults.get(j));
+            }
+        }
+
+    }
+
+    /**
+     * Test data that contains line breaks, for every delimiter/qualifier pair.
+     * NOTE(review): the "NO" name prefix presumably keeps JUnit from running this test - confirm
+     */
+    public void NOtestLineBreaks() {
+        // loop down all delimiter qualifier pairs to test
+        for (int i = 0; i < DELIM_QUAL_PAIR.length; i++) {
+            final char d = DELIM_QUAL_PAIR[i][0];
+            final char q = DELIM_QUAL_PAIR[i][1];
+
+            final String txtToParse = UnitTestUtils.buildDelimString(DELIMITED_DATA_WITH_BREAKS, d, q);
+
+            final List splitLineResults = BXParser.splitLine(txtToParse, d, q);
+
+            // check to make sure we have the same amount of elements which were
+            // expected
+            assertEquals("Did Not Get Amount Of Elements Expected (d = " + d + " q = " + q + ")",
+                    DELIMITED_DATA_WITH_BREAKS.length, splitLineResults.size());
+
+            // loop through each value and compare what came back
+            for (int j = 0; j < DELIMITED_DATA_WITH_BREAKS.length; j++) {
+                assertEquals("Data Element Value Does Not Match (d = " + d + " q = " + q + ")", DELIMITED_DATA_WITH_BREAKS[j],
+                        (String) splitLineResults.get(j));
+            }
+        }
+    }
+
+    /**
+     * Test to make sure we get the correct amount of elements for malformed
+     * data
+     */
+    public void testMalformedData() {
+        final List splitLineResults = BXParser.splitLine(DELIMITED_BAD_DATA, ',', '\"');
+
+        assertEquals("Expecting 2 Data Elements From The Malformed Data", 2, splitLineResults.size());
+    }
+
+    /**
+     * Test edge cases: null, blank, empty columns, mixed qualified/unqualified columns and doubled qualifiers
+     */
+    public void testSomeExtremeCases() {
+        check(null, ',', '\"', new String[] {});
+        check("a", ',', '\"', new String[] { "a" });
+        check("", ',', '\"', new String[] { null });
+        check(" ", ',', '\"', new String[] { null });
+        check("    ", ',', '\"', new String[] { null });
+        check(",", ',', '\"', new String[] { null, null });
+        check(",,", ',', '\"', new String[] { null, null, null });
+        check(",a,", ',', '\"', new String[] { null, "a", null });
+
+        check("\"a,b,c\"", ',', '\"', new String[] { "a,b,c" });
+        check("\"a,b\",\"c\"", ',', '\"', new String[] { "a,b", "c" });
+        check("\"a , b\",\"c\"", ',', '\"', new String[] { "a , b", "c" });
+        check("a,b,c", ',', '\"', new String[] { "a", "b", "c" });
+        check("a b,c", ',', '\"', new String[] { "a b", "c" });
+        check("  a,b,c ", ',', '\"', new String[] { "a", "b", "c" });
+        check("  a, b ,c", ',', '\"', new String[] { "a", "b", "c" });
+
+        // typical Excel-style line: only some of the elements are qualified
+        check("\"test1\",test2,\"0.00\",\"another, element here\",lastone", ',', '\"', new String[] { "test1", "test2", "0.00",
+                "another, element here", "lastone" });
+
+        check("a\",b,c\"", ',', '\"', new String[] { "a\"", "b", "c\"" });
+        check("  a, b ,c ", ',', '\"', new String[] { "a", "b", "c" });
+        check("\"a\",     b  ,    \"c\"", ',', '\"', new String[] { "a", "b", "c" });
+
+        check("\"\",,,,\"last one\"", ',', '\"', new String[] { "", null, null, null, "last one" });
+        check("\"first\",\"second\",", ',', '\"', new String[] { "first", "second", null });
+        check("\"  a,b,c\"", ',', '\"', new String[] { "  a,b,c" });
+        check("\"  a,b,c\",d", ',', '\"', new String[] { "  a,b,c", "d" });
+        check("\"a, b,\"\"c\"", ',', '\"', new String[] { "a, b,\"c" });
+    }
+
+    private void check(final String txtToParse, final char delim, final char qualifier, final String[] expected) { // asserts the element count, then each element in order
+        final List splitLineResults = BXParser.splitLine(txtToParse, delim, qualifier);
+
+        assertEquals(
+                "Did Not Get Amount Of Elements Expected (d = " + delim + " q = " + qualifier + ") txt [" + txtToParse + "]",
+                expected.length, splitLineResults.size());
+
+        for (int i = 0; i < expected.length; i++) {
+            assertEquals("expecting...", expected[i], splitLineResults.get(i));
+        }
+    }
+
+    public static void main(final String[] args) {
+        junit.textui.TestRunner.run(BXParserTest.class);
+    }
+}

Modified: trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/ParserUtilsSplitLineTest.java
===================================================================
--- trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/ParserUtilsSplitLineTest.java	2006-10-31 16:07:23 UTC (rev 159)
+++ trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/ParserUtilsSplitLineTest.java	2006-10-31 17:12:14 UTC (rev 160)
@@ -99,49 +99,36 @@
      * Test some extreme cases
      */
     public void testSomeExtremeCases() {
-        // back to Basic...
         check(null, ',', '\"', new String[] {});
         check("a", ',', '\"', new String[] { "a" });
         check("", ',', '\"', new String[] { null });
+        check(" ", ',', '\"', new String[] { null });
+        check("    ", ',', '\"', new String[] { null });
         check(",", ',', '\"', new String[] { null, null });
         check(",,", ',', '\"', new String[] { null, null, null });
         check(",a,", ',', '\"', new String[] { null, "a", null });
 
-        //
-        
         check("\"a,b,c\"", ',', '\"', new String[] { "a,b,c" });
         check("\"a,b\",\"c\"", ',', '\"', new String[] { "a,b", "c" });
+        check("\"a , b\",\"c\"", ',', '\"', new String[] { "a , b", "c" });
         check("a,b,c", ',', '\"', new String[] { "a", "b", "c" });
-        check("  a,b,c", ',', '\"', new String[] { "a", "b", "c" });
-        check("  a,b,c", ',', '\"', new String[] { "a", "b", "c" });
+        check("a b,c", ',', '\"', new String[] { "a b", "c" });
+        check("  a,b,c ", ',', '\"', new String[] { "a", "b", "c" });
+        check("  a, b ,c", ',', '\"', new String[] { "a", "b", "c" });
 
         // example typically from Excel.
         check("\"test1\",test2,\"0.00\",\"another, element here\",lastone", ',', '\"', new String[] { "test1", "test2", "0.00",
                 "another, element here", "lastone" });
 
-        // what would you expect of these ones?
-
-        // +++++The parser allows qualified and unqualified elements to be
-        // contained
-        // on the same line. so it should break the elements down like so
-        // 1 = a" -->" is part of the data since the element did not start with
-        // a qualifier
-        // 2 = b
-        // 3 = c" --> same as #1
-        // a",b,c"
         check("a\",b,c\"", ',', '\"', new String[] { "a\"", "b", "c\"" });
-        //should not trim leading space inside of a qualified element       
-        check("\"  a,b,c\"", ',', '\"', new String[] { "  a,b,c" });
-        check("  a, b ,c ", ',', '\"', new String[] { "a","b","c" });
-        check("\"a\",     b  ,    \"c\"", ',', '\"', new String[] {"a","b","c"});
-        //check malformed data
-        //TODO - I believe this should be producing 2 elements.  As soon as their is a 
-        //delimter followed by a qualifier a new element shoudl be created
-        //+++Any thoughts Benoit?
-        check("\"a, b,\"c\"", ',', '\"', new String[] {"a, b,\"c"});
-        check("\"\",,,,\"last one\"", ',', '\"', new String[] {"","","","","last one"});
-        check("\"first\",\"second\",", ',', '\"', new String[] {"first","second",""});
+        check("  a, b ,c ", ',', '\"', new String[] { "a", "b", "c" });
+        check("\"a\",     b  ,    \"c\"", ',', '\"', new String[] { "a", "b", "c" });
 
+        check("\"\",,,,\"last one\"", ',', '\"', new String[] { "", null, null, null, "last one" });
+        check("\"first\",\"second\",", ',', '\"', new String[] { "first", "second", null });
+        check("\"  a,b,c\"", ',', '\"', new String[] { "  a,b,c" });
+        check("\"  a,b,c\",d", ',', '\"', new String[] { "  a,b,c", "d" });
+        check("\"a, b,\"\"c\"", ',', '\"', new String[] { "a, b,\"c" });
     }
 
     /**


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.