#290 getting values

Status: closed
Labels: jdbc (34)
Priority: 1
Updated: 2013-05-20
Created: 2013-05-16
Creator: Kunal
Private: No

Hi,
I would like to use HSQLDB as a back-end database for a data mining application because of its reported high speed.
Unfortunately I'm having problems importing the large CSV file (15 million rows, 103 columns, 10.5 GB) that holds the data.
The HSQLDB data file size limit is reached within a few hours:
data file reached maximum size C:\opt\icedb.data
java.sql.BatchUpdateException: Data File size limit is reached
at org.hsqldb.jdbc.JDBCPreparedStatement.executeBatch(Unknown Source)
at InsertTextFileIntoDB.insertValueIntoDB(InsertTextFileIntoDB.java:242)

System information:
2 GB RAM, 32-bit OS, 200 GB hard disk

My code is as follows:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.LineNumberReader;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Statement;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;
import org.hsqldb.*;

public class InsertTextFileIntoDB
{
    public static int batchSize = 1000;
    public static SimpleDateFormat sdffullOrg = new SimpleDateFormat("dd-MMM-yyyy");

    public static long estimateBestSizeOfBlocks(File filetobesorted)
    {
        long sizeoffile = filetobesorted.length();
        System.out.println("sizeoffile : " + (sizeoffile/1024)/1024 + " MB");

        final int MAXTEMPFILES = 1024;
        long blocksize = sizeoffile / MAXTEMPFILES;
        System.out.println("blocksize : " + (blocksize/1024)/1024 + " MB");
        Runtime runtime = Runtime.getRuntime();
        long freemem = runtime.freeMemory();
        System.out.println("max memory: " + (runtime.maxMemory()/1024)/1024 + " MB");
        System.out.println("free memory : " + (freemem/1024)/1024 + " MB");
        System.out.println("allocated memory:: " + (runtime.totalMemory()/1024)/1024 + " MB");
        System.out.println("freememory : " + (freemem/1024)/1024 + " MB");
        if (blocksize < freemem/4)
            blocksize = freemem/4;
        else {
            if (blocksize >= freemem)
                System.err.println("We expect to run out of memory. ");
        }
        System.out.println("After blocksize(Chunks Files Size ) : " + (blocksize/1024)/1024 + " MB");
        return blocksize;
    }

    public static void main(String args[])
    {

        System.out.println("Start Sorting......................");
        long t1 = new Date().getTime();
        Map<Integer,String> colDataType = new HashMap<Integer, String>();
        String colPosition = args[0];
        String dataType = "Date".equalsIgnoreCase(args[1].toString()) ? "DATE," : "Number".equalsIgnoreCase(args[1].toString()) ? "INTEGER," : " VARCHAR(4000),";
        /*colDataType.put(8, "DATE");
        colDataType.put(16, "DATE");
        colDataType.put(24, "DATE");
        colDataType.put(32, "DATE");
        colDataType.put(40, "DATE");
        colDataType.put(48, "DATE");
        colDataType.put(56, "DATE");
        colDataType.put(64, "DATE");
        colDataType.put(72, "DATE");
        colDataType.put(80, "DATE");
        colDataType.put(88, "DATE");
        colDataType.put(96, "DATE");
        colDataType.put(104, "DATE");
        */
        Connection con = null;
        String fileName = "DB_10GB";
        File inputfile = new File("F:\\kunal-backup\\New folder\\ABYSS\\BigData\\" + fileName + ".txt");
        long sizeoffile = inputfile.length();
        System.out.println("sizeoffile : " + sizeoffile/1024 + " KB( " + (sizeoffile/1024)/1024 + " MB)");
        //long blocksize = 1024*1024*5;
        long blocksize = estimateBestSizeOfBlocks(inputfile);
        try {

            Class.forName("org.hsqldb.jdbcDriver");
            con = DriverManager.getConnection("jdbc:hsqldb:file:C:\\opt\\icedb", "ice", "ice");
            con.setAutoCommit(false);
            //execute(con, "SET FILES CACHE SIZE 1000000");
            execute(con, "SET FILES LOG FALSE");
            // execute(con, "SET FILES SCALE 128");

            ArrayList<Map<String,Object>> tmplist = new ArrayList<Map<String,Object>>();
            int count = 0;
            int increaseCount = 0;
            boolean createTableFlag = false;
            String fieldsNameWithDataType = "";
            String fieldsName = "";
            String fieldsChar = "";
            LineNumberReader _lineReader = new LineNumberReader(new FileReader(inputfile));
            String line = "";
            Map<String, Object> map = new HashMap<String, Object>();
            String[] columnsData = null;
            while (line != null)
            {

                long currentblocksize = 0; // in bytes
                while ((currentblocksize < blocksize) && ((line = _lineReader.readLine()) != null))
                {
                    int index = 0;
                    map = new HashMap<String, Object>();
                    columnsData = line.split(",");
                    count = columnsData.length;
                    for (String str : columnsData)
                    {
                        index++;
                        map.put("COL" + index, str.trim());
                        if (!createTableFlag)
                        {
                            fieldsNameWithDataType += "COL" + index + " " + (colDataType.containsKey(index) ? colDataType.get(index) + "," : "VARCHAR(4000),");
                            fieldsName += "COL" + index + ", ";
                            fieldsChar += "?,";
                        }
                    }
                    if (!createTableFlag)
                    {
                        createTable(fileName, fieldsNameWithDataType, con);
                    }
                    createTableFlag = true;
                    increaseCount = count;
                    tmplist.add(map);
                    currentblocksize += line.length();

                }
                insertValueIntoDB(tmplist, fieldsNameWithDataType, fieldsName, fieldsChar, fileName, colDataType, con);
                tmplist.clear();
            }

        } catch (Exception ee) {
            ee.printStackTrace();
        } finally {
            try {
                con.commit();
                con.setAutoCommit(true);
                execute(con, "SET FILES LOG TRUE");
                execute(con, "CHECKPOINT");
                execute(con, "SHUTDOWN IMMEDIATELY");
                con.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        long t2 = new Date().getTime();
        System.out.println("Total Time taken = " + ((t2-t1)/1000) + " Sec(" + ((t2-t1)/1000)/60 + " Minutes)");
        String query = "SELECT * FROM " + fileName;
        //processBaseResultset(query);
        System.out.println("End Sorting......................");
    }

    public static boolean createTable(String fileName, String fieldsNameWithDataType, Connection con)
    {
        Statement statement = null;
        boolean res = false;

        try {
            statement = con.createStatement();
            String query = "CREATE CACHED TABLE " + fileName + "(" + fieldsNameWithDataType + "COL VARCHAR(4000))";
            System.out.println("query : " + query);
            int res1 = statement.executeUpdate(query);
            System.out.println("Table Created................. " + res1);
        } catch (SQLException e) {
            e.printStackTrace();
        } catch (Exception ee) {
            ee.printStackTrace();
        } finally {
            try {
                statement.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return res;
    }

    public static boolean insertValueIntoDB(ArrayList<Map<String,Object>> tmplist, String fieldsNameWithDataType, String fieldsName,
            String fieldsChar, String fileName, Map<Integer,String> colDataType, Connection con)
    {
        boolean res = false;
        PreparedStatement ps = null;
        int index = 0;

        try {

            con.setAutoCommit(false);

            String query = "INSERT INTO " + fileName + "(" + fieldsName + "COL) VALUES(" + fieldsChar + "?)";

            ps = con.prepareStatement(query);
            int count = countChars(fieldsChar, '?');
            for (int i = 0; i < tmplist.size(); i++)
            {
                index++;
                for (int j = 0; j < count; j++) {
                    ps.setObject(j+1, tmplist.get(i).get("COL" + (j+1)));
                }
                ps.setObject(count+1, tmplist.get(i).get("COL" + (count+1)));
                ps.addBatch();

                if (index <= batchSize)
                {
                    ps.executeBatch();
                    con.commit();
                } else {
                    index = 0;
                }
                System.out.print(".");

            }

        } catch (SQLException e) {
            e.printStackTrace();
        } catch (Exception ee) {
            ee.printStackTrace();
        } finally {
            try {
                con.setAutoCommit(true);
                con.commit();
                ps.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        return res;
    }

    public static int countChars(String input, char find) {
        if (input.indexOf(find) != -1) {
            return countChars(input.substring(0, input.indexOf(find)), find) +
                   countChars(input.substring(input.indexOf(find)+1), find) + 1;
        } else {
            return 0;
        }
    }

    public static void execute(Connection conn, String command) throws SQLException
    {
        Statement st = conn.createStatement();
        st.execute(command);
        st.close();
    }
}

Does anyone have any suggestions or workarounds?

Kind regards,
Kunal

Discussion

  • Kunal

    Kunal - 2013-05-16
     
  • Kunal

    Kunal - 2013-05-16
    • priority: 5 --> 9
     
  • Kunal

    Kunal - 2013-05-17
    • assigned_to: nobody --> fredt
     
  • Fred Toussi

    Fred Toussi - 2013-05-17

    Will be checked and replied to in the next few days.

     
  • Kunal

    Kunal - 2013-05-17

    thanks

     
  • Kunal

    Kunal - 2013-05-17
     
  • Kunal

    Kunal - 2013-05-17

    attached my script file

     
  • Fred Toussi

    Fred Toussi - 2013-05-19

    The program relies on the CSV file to run. Please provide a small CSV with 1000 rows of data as a zip file.

     
  • Fred Toussi

    Fred Toussi - 2013-05-19

    What is the size of your .data file after the program terminates with the error?

    If the size of your .data file reaches 64 GB, then this is the limit. You can increase this limit by uncommenting this line.

    // execute(con, "SET FILES SCALE 128");

    If the size reached is much less than 64 GB, then provide the CSV.

    There may actually be an error in your code and rows may be inserted more than once.
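
    For illustration, here is a minimal sketch of where that statement sits in the posted program, reusing its execute helper (this assumes HSQLDB 2.x and, if I read the documentation correctly, that the scale must be set before the CACHED tables contain data):

        // Sketch: raise the .data file size limit right after opening the connection,
        // before the CACHED table is created and populated.
        con = DriverManager.getConnection("jdbc:hsqldb:file:C:\\opt\\icedb", "ice", "ice");
        con.setAutoCommit(false);
        execute(con, "SET FILES LOG FALSE");  // already in the program: skip the .log file during bulk load
        execute(con, "SET FILES SCALE 128");  // uncommented: a larger scale raises the maximum .data file size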

     
  • Kunal

    Kunal - 2013-05-20

    Hi fredt,
    Thanks for your help, the problem is solved.
    But another error appears after inserting the data (15 million rows, 103 columns, 10.5 GB), when getting the values with the SQL query (select * from db_10gb order by col3 limit 10 offset 0). I get the error message below:
    Exception in thread "Thread-4" java.lang.OutOfMemoryError: Java heap space
    at javax.swing.text.GapContent.getChars(Unknown Source)
    at javax.swing.text.GapContent.getString(Unknown Source)
    at javax.swing.text.AbstractDocument.getText(Unknown Source)
    at javax.swing.text.JTextComponent.getText(Unknown Source)
    at org.hsqldb.util.DatabaseManagerSwing$7.run(Unknown Source)
    at java.lang.Thread.run(Unknown Source)

     
  • Kunal

    Kunal - 2013-05-20
    • summary: Bulk Data Insert Problem --> getting values
     
  • Fred Toussi

    Fred Toussi - 2013-05-20
    • priority: 9 --> 1
    • status: open --> closed
     
  • Fred Toussi

    Fred Toussi - 2013-05-20

    Your table does not have any indexes. Therefore any SELECT with ORDER BY and LIMIT will load the full set of data into memory to sort it, and you will run out of memory.

    Before you insert into the table, add an index on the column you will use for ORDER BY. You can add more than one index (on different columns), but the data insert will get slower when there are indexes.

    Do not add indexes after inserting the data. That will take a very long time.

    See the Data Access and Change chapter of the Guide, under the Ordering and Slicing sections.

    You can use SELECT statements that create very large result sets that do not fit in memory. See SET SESSION RESULT MEMORY ROWS in the Sessions and Transactions chapter.
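
    As a rough sketch, again reusing the execute helper from the posted program (the index name and the row-count value below are only illustrative):

        // Sketch: create the index on the ORDER BY column after the table is created
        // but before the bulk insert, so the ORDER BY ... LIMIT query can read rows
        // in index order instead of sorting the whole table in memory.
        execute(con, "CREATE INDEX idx_col3 ON DB_10GB(COL3)");

        // Optionally let large result sets spill to disk instead of being held
        // entirely in memory (10000 is an illustrative row count).
        execute(con, "SET SESSION RESULT MEMORY ROWS 10000");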

     

Log in to post a comment.