From: <be...@us...> - 2006-10-31 17:12:27
|
Revision: 160 http://svn.sourceforge.net/pzfilereader/?rev=160&view=rev Author: benoitx Date: 2006-10-31 09:12:14 -0800 (Tue, 31 Oct 2006) Log Message: ----------- Paul, I've added some basic tests for null, empty, ",,," kind of things. I've also had a go at a parser, the regular expression is a dead-end or will become **extremely** complex due to our special and whacky cases... The basic tests make quite a few things break in the current version. I'll run a couple of speed tests to see where we're going... Modified Paths: -------------- trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/ParserUtilsSplitLineTest.java Added Paths: ----------- trunk/PZFileReader/src/main/java/net/sf/pzfilereader/util/BXParser.java trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/BXParserTest.java Added: trunk/PZFileReader/src/main/java/net/sf/pzfilereader/util/BXParser.java =================================================================== --- trunk/PZFileReader/src/main/java/net/sf/pzfilereader/util/BXParser.java (rev 0) +++ trunk/PZFileReader/src/main/java/net/sf/pzfilereader/util/BXParser.java 2006-10-31 17:12:14 UTC (rev 160) @@ -0,0 +1,110 @@ +/** + * + */ +package net.sf.pzfilereader.util; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author xhensevb + * + */ +public class BXParser { + public static List splitLine(String line, final char delimiter, char qualifier) { + List list = new ArrayList(); + + if (delimiter == 0) { + list.add(line); + return list; + } else if (line == null) { + return list; + } + + final String trimmedLine = line.trim(); + int size = trimmedLine.length(); + + if (size == 0) { + list.add(null); + return list; + } + + boolean insideQualifier = false; + char previousChar = 0; + int startBlock = 0; + int endBlock = 0; + boolean blockWasInQualifier = false; + + final String doubleQualifier = String.valueOf(qualifier) + String.valueOf(qualifier); + for (int i = 0; i < size; i++) { + + final char currentChar = trimmedLine.charAt(i); + + if (currentChar != delimiter && currentChar != qualifier) { + previousChar = currentChar; + endBlock = i + 1; + continue; + } + + if (currentChar == delimiter) { + // we've found the delimiter (eg ,) + if (!insideQualifier) { + String trimmed = trimmedLine.substring(startBlock, endBlock > startBlock ? endBlock : startBlock + 1); + if (!blockWasInQualifier) { + trimmed = trimmed.trim(); + } + + trimmed = trimmed.replaceAll(doubleQualifier, String.valueOf(qualifier)); + + if (trimmed.length() == 1 && trimmed.charAt(0) == delimiter) { + list.add(null); + } else if (trimmed.length() == 1 && trimmed.charAt(0) == qualifier) { + list.add(""); + } else { + list.add(trimmed); + } + blockWasInQualifier = false; + startBlock = i + 1; + } + } else if (currentChar == qualifier) { + if (!insideQualifier && previousChar != qualifier) { + if (previousChar == delimiter || previousChar == 0 || previousChar == ' ') { + insideQualifier = true; + startBlock = i + 1; + } else { + endBlock = i + 1; + } + } else { + insideQualifier = false; + blockWasInQualifier = true; + endBlock = i; + // last column (e.g. finishes with ") + if (i == size - 1) { + list.add(trimmedLine.substring(startBlock, size - 1)); + startBlock = i + 1; + } + } + } + // antepenultimateChar = previousChar; + previousChar = currentChar; + } + + if (startBlock < size) { + String str = trimmedLine.substring(startBlock, size); + str = str.replaceAll(doubleQualifier, String.valueOf(qualifier)); + if (blockWasInQualifier) { + if (str.charAt(str.length() - 1) == qualifier) { + list.add(str.substring(0, str.length() - 1)); + } else { + list.add(str); + } + } else { + list.add(str.trim()); + } + } else if (trimmedLine.charAt(size - 1) == delimiter) { + list.add(null); + } + + return list; + } +} Added: trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/BXParserTest.java =================================================================== --- trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/BXParserTest.java (rev 0) +++ trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/BXParserTest.java 2006-10-31 17:12:14 UTC (rev 160) @@ -0,0 +1,151 @@ +package net.sf.pzfilereader.parserutils; + +import java.util.List; + +import junit.framework.TestCase; +import net.sf.pzfilereader.util.BXParser; +import net.sf.pzfilereader.util.ParserUtils; +import net.sf.pzfilereader.util.RegExParser; +import net.sf.pzfilereader.utilities.UnitTestUtils; + +/** + * Test the functionality of the splitLine method. This method returns a List of + * Strings. Each element of the list represents a column created by the parser + * from the delimited String. + * + * @author Paul Zepernick + */ +public class BXParserTest extends TestCase { + private static final String[] DELIMITED_DATA_NO_BREAKS = { "Column 1", "Column 2", "Column 3", "Column 4", "Column 5" }; + + private static final String[] DELIMITED_DATA_WITH_BREAKS = { "Column 1 \r\n\r\n Test After Break \r\n Another Break", + "Column 2", "Column 3 \r\n\r\n Test After Break", "Column 4", "Column 5 \r\n\r\n Test After Break\r\n Another Break" }; + + // TODO think of a situation that actually breaks the parse. This still + // works because of the way it is coded + // to handle the excel CSV. Excel CSV has some elements qualified and others + // not + private static final String DELIMITED_BAD_DATA = "\"column 1\",\"column 2 ,\"column3\""; + + // 0 = delimiter + // 1 = qualifier + private static final char[][] DELIM_QUAL_PAIR = { { ',', '\"' }, { '\t', '\"' }, { '|', '\"' }, { '_', '\"' }, { ',', 0 }, + { '|', 0 }, { '\t', 0 } }; + + /** + * Test without any line breaks + * + */ + public void testNoLineBreaks() { + // loop down all delimiter qualifier pairs to test + for (int i = 0; i < DELIM_QUAL_PAIR.length; i++) { + final char d = DELIM_QUAL_PAIR[i][0]; + final char q = DELIM_QUAL_PAIR[i][1]; + + final String txtToParse = UnitTestUtils.buildDelimString(DELIMITED_DATA_NO_BREAKS, d, q); + + final List splitLineResults = BXParser.splitLine(txtToParse, d, q); + + // check to make sure we have the same amount of elements which were + // expected + assertEquals("Expected size (d = [" + d + "] q = [" + (q != 0 ? String.valueOf(q) : "") + "] txt [" + txtToParse + + "])", DELIMITED_DATA_NO_BREAKS.length, splitLineResults.size()); + + // loop through each value and compare what came back + for (int j = 0; j < DELIMITED_DATA_NO_BREAKS.length; j++) { + assertEquals("Data Element Value Does Not Match (d = [" + d + "] q = [" + q + "] txt [" + txtToParse + "])", + DELIMITED_DATA_NO_BREAKS[j], (String) splitLineResults.get(j)); + } + } + + } + + /** + * Test with any line breaks + * + */ + public void NOtestLineBreaks() { + // loop down all delimiter qualifier pairs to test + for (int i = 0; i < DELIM_QUAL_PAIR.length; i++) { + final char d = DELIM_QUAL_PAIR[i][0]; + final char q = DELIM_QUAL_PAIR[i][1]; + + final String txtToParse = UnitTestUtils.buildDelimString(DELIMITED_DATA_WITH_BREAKS, d, q); + + final List splitLineResults = BXParser.splitLine(txtToParse, d, q); + + // check to make sure we have the same amount of elements which were + // expected + assertEquals("Did Not Get Amount Of Elements Expected (d = " + d + " q = " + q + ")", + DELIMITED_DATA_WITH_BREAKS.length, splitLineResults.size()); + + // loop through each value and compare what came back + for (int j = 0; j < DELIMITED_DATA_WITH_BREAKS.length; j++) { + assertEquals("Data Element Value Does Not Match (d = " + d + " q = " + q + ")", DELIMITED_DATA_WITH_BREAKS[j], + (String) splitLineResults.get(j)); + } + } + } + + /** + * Test to make sure we get the correct amount of elements for malformed + * data + */ + public void testMalformedData() { + final List splitLineResults = BXParser.splitLine(DELIMITED_BAD_DATA, ',', '\"'); + + assertEquals("Expecting 2 Data Elements From The Malformed Data", 2, splitLineResults.size()); + } + + /** + * Test some extreme cases + */ + public void testSomeExtremeCases() { + check(null, ',', '\"', new String[] {}); + check("a", ',', '\"', new String[] { "a" }); + check("", ',', '\"', new String[] { null }); + check(" ", ',', '\"', new String[] { null }); + check(" ", ',', '\"', new String[] { null }); + check(",", ',', '\"', new String[] { null, null }); + check(",,", ',', '\"', new String[] { null, null, null }); + check(",a,", ',', '\"', new String[] { null, "a", null }); + + check("\"a,b,c\"", ',', '\"', new String[] { "a,b,c" }); + check("\"a,b\",\"c\"", ',', '\"', new String[] { "a,b", "c" }); + check("\"a , b\",\"c\"", ',', '\"', new String[] { "a , b", "c" }); + check("a,b,c", ',', '\"', new String[] { "a", "b", "c" }); + check("a b,c", ',', '\"', new String[] { "a b", "c" }); + check(" a,b,c ", ',', '\"', new String[] { "a", "b", "c" }); + check(" a, b ,c", ',', '\"', new String[] { "a", "b", "c" }); + + // example typically from Excel. + check("\"test1\",test2,\"0.00\",\"another, element here\",lastone", ',', '\"', new String[] { "test1", "test2", "0.00", + "another, element here", "lastone" }); + + check("a\",b,c\"", ',', '\"', new String[] { "a\"", "b", "c\"" }); + check(" a, b ,c ", ',', '\"', new String[] { "a", "b", "c" }); + check("\"a\", b , \"c\"", ',', '\"', new String[] { "a", "b", "c" }); + + check("\"\",,,,\"last one\"", ',', '\"', new String[] { "", null, null, null, "last one" }); + check("\"first\",\"second\",", ',', '\"', new String[] { "first", "second", null }); + check("\" a,b,c\"", ',', '\"', new String[] { " a,b,c" }); + check("\" a,b,c\",d", ',', '\"', new String[] { " a,b,c", "d" }); + check("\"a, b,\"\"c\"", ',', '\"', new String[] { "a, b,\"c" }); + } + + private void check(final String txtToParse, final char delim, final char qualifier, final String[] expected) { + final List splitLineResults = BXParser.splitLine(txtToParse, delim, qualifier); + + assertEquals( + "Did Not Get Amount Of Elements Expected (d = " + delim + " q = " + qualifier + ") txt [" + txtToParse + "]", + expected.length, splitLineResults.size()); + + for (int i = 0; i < expected.length; i++) { + assertEquals("expecting...", expected[i], splitLineResults.get(i)); + } + } + + public static void main(final String[] args) { + junit.textui.TestRunner.run(BXParserTest.class); + } +} Modified: trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/ParserUtilsSplitLineTest.java =================================================================== --- trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/ParserUtilsSplitLineTest.java 2006-10-31 16:07:23 UTC (rev 159) +++ trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/ParserUtilsSplitLineTest.java 2006-10-31 17:12:14 UTC (rev 160) @@ -99,49 +99,36 @@ * Test some extreme cases */ public void testSomeExtremeCases() { - // back to Basic... check(null, ',', '\"', new String[] {}); check("a", ',', '\"', new String[] { "a" }); check("", ',', '\"', new String[] { null }); + check(" ", ',', '\"', new String[] { null }); + check(" ", ',', '\"', new String[] { null }); check(",", ',', '\"', new String[] { null, null }); check(",,", ',', '\"', new String[] { null, null, null }); check(",a,", ',', '\"', new String[] { null, "a", null }); - // - check("\"a,b,c\"", ',', '\"', new String[] { "a,b,c" }); check("\"a,b\",\"c\"", ',', '\"', new String[] { "a,b", "c" }); + check("\"a , b\",\"c\"", ',', '\"', new String[] { "a , b", "c" }); check("a,b,c", ',', '\"', new String[] { "a", "b", "c" }); - check(" a,b,c", ',', '\"', new String[] { "a", "b", "c" }); - check(" a,b,c", ',', '\"', new String[] { "a", "b", "c" }); + check("a b,c", ',', '\"', new String[] { "a b", "c" }); + check(" a,b,c ", ',', '\"', new String[] { "a", "b", "c" }); + check(" a, b ,c", ',', '\"', new String[] { "a", "b", "c" }); // example typically from Excel. check("\"test1\",test2,\"0.00\",\"another, element here\",lastone", ',', '\"', new String[] { "test1", "test2", "0.00", "another, element here", "lastone" }); - // what would you expect of these ones? - - // +++++The parser allows qualified and unqualified elements to be - // contained - // on the same line. so it should break the elements down like so - // 1 = a" -->" is part of the data since the element did not start with - // a qualifier - // 2 = b - // 3 = c" --> same as #1 - // a",b,c" check("a\",b,c\"", ',', '\"', new String[] { "a\"", "b", "c\"" }); - //should not trim leading space inside of a qualified element - check("\" a,b,c\"", ',', '\"', new String[] { " a,b,c" }); - check(" a, b ,c ", ',', '\"', new String[] { "a","b","c" }); - check("\"a\", b , \"c\"", ',', '\"', new String[] {"a","b","c"}); - //check malformed data - //TODO - I believe this should be producing 2 elements. As soon as their is a - //delimter followed by a qualifier a new element shoudl be created - //+++Any thoughts Benoit? - check("\"a, b,\"c\"", ',', '\"', new String[] {"a, b,\"c"}); - check("\"\",,,,\"last one\"", ',', '\"', new String[] {"","","","","last one"}); - check("\"first\",\"second\",", ',', '\"', new String[] {"first","second",""}); + check(" a, b ,c ", ',', '\"', new String[] { "a", "b", "c" }); + check("\"a\", b , \"c\"", ',', '\"', new String[] { "a", "b", "c" }); + check("\"\",,,,\"last one\"", ',', '\"', new String[] { "", null, null, null, "last one" }); + check("\"first\",\"second\",", ',', '\"', new String[] { "first", "second", null }); + check("\" a,b,c\"", ',', '\"', new String[] { " a,b,c" }); + check("\" a,b,c\",d", ',', '\"', new String[] { " a,b,c", "d" }); + check("\"a, b,\"\"c\"", ',', '\"', new String[] { "a, b,\"c" }); } /** This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |