From: <be...@us...> - 2006-10-30 16:46:45
|
Revision: 151 http://svn.sourceforge.net/pzfilereader/?rev=151&view=rev Author: benoitx Date: 2006-10-30 08:46:27 -0800 (Mon, 30 Oct 2006) Log Message: ----------- first cut at reg expressions... Added Paths: ----------- trunk/PZFileReader/src/main/java/net/sf/pzfilereader/util/RegExParser.java trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/RegExParserTest.java Added: trunk/PZFileReader/src/main/java/net/sf/pzfilereader/util/RegExParser.java =================================================================== --- trunk/PZFileReader/src/main/java/net/sf/pzfilereader/util/RegExParser.java (rev 0) +++ trunk/PZFileReader/src/main/java/net/sf/pzfilereader/util/RegExParser.java 2006-10-30 16:46:27 UTC (rev 151) @@ -0,0 +1,114 @@ +/** + * + */ +package net.sf.pzfilereader.util; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * @author xhensevb + * + */ +public class RegExParser { + + // /////////////////////////// + /** + * The rather involved pattern used to match CSV's consists of three + * alternations: the first matches a quoted field, the second unquoted, the + * third a null field. 
+ */ + public static final String CSV_PATTERN = "\"(.*?)\",|(\\w+),|\"(.*?)\"|(\\w+),|,"; + public static final String ORIGINAL_CSV_PATTERN = "\"([^\"]+?)\",?|([^,]+),?|,"; + + // public static final String CSV_PATTERN = "\"([^\"]+?)\",?|([^,]+),?|,"; + +// private static Pattern csvRE = Pattern.compile(CSV_PATTERN); + + public static List splitLine(String line, final char delimiter, char qualifier) { + StringBuilder patternBuilder = new StringBuilder(); + + if (qualifier == 0) { + qualifier = '\"'; + } + + String qualif = escapeIfRequired(qualifier); + String delim = escapeIfRequired(delimiter); + + // first Pattern + if (qualifier != 0) { + patternBuilder.append(qualif); + } + patternBuilder.append("(.*?)"); + if (qualifier != 0) { + patternBuilder.append(qualif); + } + patternBuilder.append(delim); + + // second Pattern + patternBuilder.append("|(\\w+)"); + patternBuilder.append(delim); + + // Third Pattern + patternBuilder.append("|"); + if (qualifier != 0) { + patternBuilder.append(qualif); + } + patternBuilder.append("(.*?)"); + if (qualifier != 0) { + patternBuilder.append(qualif); + } + patternBuilder.append("|(\\w+)"); + patternBuilder.append(delim); + + // Fourth Pattern + patternBuilder.append("|").append(delim); + + String pat = patternBuilder.toString(); + + System.out.println(pat); + + Pattern pattern = Pattern.compile(pat); + + return parse(pattern, line, String.valueOf(delimiter), String.valueOf(qualifier)); + } + + private static String escapeIfRequired(final char c) { + if (c == 0) { + return ""; + } + if ("([{\\^-$|]})?*+.\"\'".indexOf(c) >= 0) { + return "\\" + c; + } + return String.valueOf(c); + } + + /** + * Parse one line. 
+ * + * @return List of Strings, minus their double quotes + */ + public static List parse(Pattern pattern, String line, String delimiter, String qualifier) { + List list = new ArrayList(); + Matcher m = pattern.matcher(line); + // For each field + while (m.find()) { + String match = m.group(); + if (match == null) + break; + if (match.endsWith(delimiter)) { // trim trailing , + match = match.substring(0, match.length() - 1); + } + if (match.startsWith(qualifier)) { // assume also ends with + match = match.substring(1, match.length() - 1); + } + if (match.length() == 0) + match = null; + list.add(match); + } + return list; + } + +} Added: trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/RegExParserTest.java =================================================================== --- trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/RegExParserTest.java (rev 0) +++ trunk/PZFileReader/src/test/java/net/sf/pzfilereader/parserutils/RegExParserTest.java 2006-10-30 16:46:27 UTC (rev 151) @@ -0,0 +1,144 @@ +package net.sf.pzfilereader.parserutils; + +import java.util.List; + +import junit.framework.TestCase; +import net.sf.pzfilereader.util.ParserUtils; +import net.sf.pzfilereader.util.RegExParser; +import net.sf.pzfilereader.utilities.UnitTestUtils; + +/** + * Test the functionality of the splitLine method. This method returns a List of + * Strings. Each element of the list represents a column created by the parser + * from the delimited String. 
+ * + * @author Paul Zepernick + */ +public class RegExParserTest extends TestCase { + private static final String[] DELIMITED_DATA_NO_BREAKS = { "Column 1", "Column 2", "Column 3", "Column 4", "Column 5" }; + + private static final String[] DELIMITED_DATA_WITH_BREAKS = { "Column 1 \r\n\r\n Test After Break \r\n Another Break", + "Column 2", "Column 3 \r\n\r\n Test After Break", "Column 4", "Column 5 \r\n\r\n Test After Break\r\n Another Break" }; + + // TODO think of a situation that actually breaks the parse. This still + // works because of the way it is coded + // to handle the excel CSV. Excel CSV has some elements qualified and others + // not + private static final String DELIMITED_BAD_DATA = "\"column 1\",\"column 2 ,\"column3\""; + + // 0 = delimiter + // 1 = qualifier + private static final char[][] DELIM_QUAL_PAIR = { { ',', '\"' }, { '\t', '\"' }, { '|', '\"' }, { '_', '\"' }, { ',', 0 }, + { '|', 0 }, { '\t', 0 } }; + + /** + * Test without any line breaks + * + */ + public void testNoLineBreaks() { + // loop down all delimiter qualifier pairs to test + for (int i = 0; i < DELIM_QUAL_PAIR.length; i++) { + final char d = DELIM_QUAL_PAIR[i][0]; + final char q = DELIM_QUAL_PAIR[i][1]; + + final String txtToParse = UnitTestUtils.buildDelimString(DELIMITED_DATA_NO_BREAKS, d, q); + + final List splitLineResults = RegExParser.splitLine(txtToParse, d, q); + + // check to make sure we have the same amount of elements which were + // expected + assertEquals("Expected size (d = [" + d + "] q = [" + (q != 0 ? 
String.valueOf(q) : "") + "] txt [" + txtToParse + + "])", DELIMITED_DATA_NO_BREAKS.length, splitLineResults.size()); + + // loop through each value and compare what came back + for (int j = 0; j < DELIMITED_DATA_NO_BREAKS.length; j++) { + assertEquals("Data Element Value Does Not Match (d = [" + d + "] q = [" + q + "] txt [" + txtToParse + "])", + DELIMITED_DATA_NO_BREAKS[j], (String) splitLineResults.get(j)); + } + } + + } + + /** + * Test with any line breaks + * + */ + public void testLineBreaks() { + // loop down all delimiter qualifier pairs to test + for (int i = 0; i < DELIM_QUAL_PAIR.length; i++) { + final char d = DELIM_QUAL_PAIR[i][0]; + final char q = DELIM_QUAL_PAIR[i][1]; + + final String txtToParse = UnitTestUtils.buildDelimString(DELIMITED_DATA_WITH_BREAKS, d, q); + + final List splitLineResults = RegExParser.splitLine(txtToParse, d, q); + + // check to make sure we have the same amount of elements which were + // expected + assertEquals("Did Not Get Amount Of Elements Expected (d = " + d + " q = " + q + ")", + DELIMITED_DATA_WITH_BREAKS.length, splitLineResults.size()); + + // loop through each value and compare what came back + for (int j = 0; j < DELIMITED_DATA_WITH_BREAKS.length; j++) { + assertEquals("Data Element Value Does Not Match (d = " + d + " q = " + q + ")", DELIMITED_DATA_WITH_BREAKS[j], + (String) splitLineResults.get(j)); + } + } + } + + /** + * Test to make sure we get the correct amount of elements for malformed + * data + */ + public void testMalformedData() { + final List splitLineResults = ParserUtils.splitLine(DELIMITED_BAD_DATA, ',', '\"'); + + assertEquals("Expecting 2 Data Elements From The Malformed Data", 2, splitLineResults.size()); + } + + /** + * Test some extreme cases + */ + public void testSomeExtremeCases() { + check("\"a,b,c\"", ',', '\"', new String[] { "a,b,c" }); + check("\"a,b\",\"c\"", ',', '\"', new String[] { "a,b", "c" }); + check("a,b,c", ',', '\"', new String[] { "a", "b", "c" }); + check(" a,b,c", ',', 
'\"', new String[] { "a", "b", "c" }); + check(" a,b,c", ',', '\"', new String[] { "a", "b", "c" }); + + // what would you expect of these ones? + + // +++++The parser allows qualified and unqualified elements to be + // contained + // on the same line. so it should break the elements down like so + // 1 = a" -->" is part of the data since the element did not start with + // a qualifier + // 2 = b + // 3 = c" --> same as #1 + check("a\",b,c\"", ',', '\"', new String[] { "a\"", "b", "c\"" }); + + check("\" a,b,c\"", ',', '\"', new String[] { "a,b,c" }); + // check(" a, b ,c ", ',', '\"', new String[] { "a","b","c" }); + // ++++++I think this should probably generate this + check(" a, b ,c ", ',', '\"', new String[] { "a, b ,c" }); + + // Paul... please put some more whacky stuff here... + + } + + private void check(final String txtToParse, final char delim, final char qualifier, final String[] expected) { + final List splitLineResults = ParserUtils.splitLine(txtToParse, delim, qualifier); + + assertEquals( + "Did Not Get Amount Of Elements Expected (d = " + delim + " q = " + qualifier + ") txt [" + txtToParse + "]", + expected.length, splitLineResults.size()); + + for (int i = 0; i < expected.length; i++) { + assertEquals("expecting...", expected[i], splitLineResults.get(i)); + } + } + + public static void main(final String[] args) { + junit.textui.TestRunner.run(RegExParserTest.class); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |