Parsing empty cells in csv file and storing into mysql - java

I have a CSV file in which a few cells in some columns are empty. Modifying the CSV file is not an option, as it has around 50k records in total.
Whenever I execute the code below, it throws the error "Incorrect integer value: '' for column 'ParentId' at row 1". My database table has Null set to YES for the 'ParentId' column, but it still throws the error. How can I edit this code so that it fills in the correct values without the error?
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.util.Date;
import org.apache.commons.lang.StringUtils;
import au.com.bytecode.opencsv.CSVReader;
public class CSVLoader {
static int count;
private static final
//String SQL_INSERT = "INSERT INTO ${table}(${keys}) VALUES(${values})";
String SQL_INSERT = "INSERT INTO ${table} VALUES(${values})";
private static final String TABLE_REGEX = "\\$\\{table\\}";
//private static final String KEYS_REGEX = "\\$\\{keys\\}";
private static final String VALUES_REGEX = "\\$\\{values\\}";
private Connection connection;
private char seprator;
/**
* Public constructor to build CSVLoader object with
* Connection details. The connection is closed on success
* or failure.
* @param connection
*/
public CSVLoader(Connection connection) {
this.connection = connection;
//Set default separator
this.seprator = ',';
}
/**
* Parse CSV file using OpenCSV library and load in
* given database table.
* @param csvFile Input CSV file
* @param tableName Database table name to import data
* @param truncateBeforeLoad Truncate the table before inserting
* new records.
* @throws Exception
*/
public void loadCSV(String csvFile, String tableName,
boolean truncateBeforeLoad) throws Exception {
CSVReader csvReader = null;
if(null == this.connection) {
throw new Exception("Not a valid connection.");
}
try {
csvReader = new CSVReader(new FileReader(csvFile), this.seprator);
} catch (Exception e) {
e.printStackTrace();
throw new Exception("Error occurred while reading file. "
+ e.getMessage());
}
//String[] headerRow = csvReader.readNext();
String[] headerRow = csvReader.readNext();
count++;
if (null == headerRow) {
throw new FileNotFoundException(
"No columns defined in given CSV file." +
"Please check the CSV file format.");
}
/*String questionmarks = StringUtils.repeat("?,", headerRow.length);
System.out.println(headerRow.length);
questionmarks = (String) questionmarks.subSequence(0, questionmarks
.length() - 1);
System.out.println(SQL_INSERT);
String query = SQL_INSERT.replaceFirst(TABLE_REGEX, tableName);
//query = query
// .replaceFirst(KEYS_REGEX, StringUtils.join(headerRow, ","));
query = query.replaceFirst(VALUES_REGEX, questionmarks);
System.out.println("Query: " + query);
*/
//String str1 = "insert into posts values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
String[] nextLine;
Connection con = null;
PreparedStatement ps = null;
try {
con = this.connection;
con.setAutoCommit(false);
ps = con.prepareStatement("insert into posts (Id,PostTypeId,AcceptedAnswerId,ParentId,CreationDate,Score,ViewCount,Body,OwnerUserId,OwnerDisplayName,LastEditorUserId,LastEditorDisplayName,LastEditDate,LastActivityDate,Title,Tags,AnswerCount,CommentCount,FavoriteCount,ClosedDate,CommunityOwnedDate,RowNum) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)");
if(truncateBeforeLoad) {
//delete data from table before loading csv
con.createStatement().execute("DELETE FROM " + tableName);
}
final int batchSize = 1000;
int count = 0;
Date date = null;
while ((nextLine = csvReader.readNext()) != null) {
if (null != nextLine) {
int index = 1;
for (String string : nextLine) {
date = DateUtil.convertToDate(string);
if (null != date) {
ps.setDate(index++, new java.sql.Date(date
.getTime()));
} else {
ps.setString(index++, string);
}
}
System.out.println(count);
ps.addBatch();
System.out.println(count);
}
if (++count % batchSize == 0) {
System.out.println(count);
ps.executeBatch();
}
}
ps.executeBatch(); // insert remaining records
con.commit();
} catch (Exception e) {
con.rollback();
e.printStackTrace();
throw new Exception(
"Error occurred while loading data from file to database."
+ e.getMessage());
} finally {
if (null != ps)
ps.close();
if (null != con)
con.close();
csvReader.close();
}
}
public char getSeprator() {
return seprator;
}
public void setSeprator(char seprator) {
this.seprator = seprator;
}
}

It might be the case that the int column is actually receiving blank strings; in that case you can add a manual check as below and still go ahead. Hope this answers your question.
// check if the value is null or only blank spaces
if (certain_value == null || certain_value.trim().length() == 0) {
ps.setNull(4, java.sql.Types.INTEGER); // this sets a NULL value for the int-typed parameter
}
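Building on that, a minimal self-contained sketch of the check as a reusable helper; the class and method names here are mine, and it assumes you know which CSV positions feed INT columns such as ParentId:

import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Types;

public class NullSafeBinder {
    // Binds a CSV cell destined for an INT column; empty/blank cells become SQL NULL.
    public static void setIntOrNull(PreparedStatement ps, int paramIndex, String raw)
            throws SQLException {
        if (raw == null || raw.trim().isEmpty()) {
            ps.setNull(paramIndex, Types.INTEGER);
        } else {
            ps.setInt(paramIndex, Integer.parseInt(raw.trim()));
        }
    }
}

In the column loop you would then call NullSafeBinder.setIntOrNull(ps, index++, string) for the integer positions instead of ps.setString, so an empty cell becomes a database NULL rather than the empty string '' that MySQL rejects.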

Related

Bulk insert from a csv to a table in an Oracle DB using Java

I am attempting to insert into a table in an Oracle DB using Java. I am reading a CSV file line by line using OpenCSV. The CSV is about 50000 rows and 9 columns. Here is some of my code:
/* Create Connection objects */
Class.forName ("oracle.jdbc.OracleDriver");
Connection conn = DriverManager.getConnection("jdbc:oracle:thin:@HoSt", "UsErNaMe", "PaSsWoRd");
PreparedStatement sql_statement = null;
String jdbc_insert_sql = "INSERT INTO METATADA_AUTOSYS"
+ "(MACH_NAME,JOB_NAME,SCRIPT_COMMAND,APPLICATION_NAME,JOB_ID,STATUS,CREATE_DATE,LAST_START_DT,LAST_END_DT) VALUES"
+ "(?,?,?,?,?,?,?,?,?)";
sql_statement = conn.prepareStatement(jdbc_insert_sql);
/* Read CSV file in OpenCSV */
String inputCSVFile = "C:/Users/conwacx/Desktop/meta_auto_v3/Autosys_Metadata.csv";
CSVReader reader = new CSVReader(new FileReader(inputCSVFile));
String [] nextLine;
int lnNum = 0;
int batchSize = 5000;
//loop file , add records to batch
try{
while ((nextLine = reader.readNext()) != null) {
lnNum++;
/* Bind CSV file input to table columns */
sql_statement.setString(1, nextLine[0]);
sql_statement.setString(2,nextLine[1]);
sql_statement.setString(3,nextLine[2]);
sql_statement.setString(4,nextLine[3]);
sql_statement.setString(5,nextLine[4]); //setInt(Integer.parseInt(nextLine[4].trim());
sql_statement.setString(6,nextLine[5]);
sql_statement.setObject(7, nextLine[5]);
sql_statement.setString(8,nextLine[7]);
sql_statement.setString(9,nextLine[8]);
sql_statement.addBatch();
// Add the record to batch
if (++batchSize % 5000 == 0){
sql_statement.executeBatch();
}
}
sql_statement.executeBatch();
}
catch(SQLException e){
e.printStackTrace();
}
//Perform a bulk batch insert
int[] totalRecords = new int[7];
try {
totalRecords = sql_statement.executeBatch();
} catch(BatchUpdateException e) {
//handle exception for failed records here
totalRecords = e.getUpdateCounts();
} catch(SQLException ex){
ex.printStackTrace();
}
System.out.println ("Total records inserted in bulk from CSV file " + totalRecords.length);
/* Close prepared statement */
sql_statement.close();
/* COMMIT transaction */
conn.commit();
/* Close connection */
conn.close();
}
I am not receiving an error while running this. It is printing:
Total records inserted in bulk from CSV file 0
The table is not being updated with the new values in Oracle. Any suggestions?
Execute sql_statement.executeBatch() only when your batch size is reached.
executeBatch() returns an array with the results (how many rows were affected).
So you have to add up each element of the array to compute the total count.
The condition for executing the batch is also wrong.
I can't test it, but I would change your example like this (only the changed section):
public void insertData() throws ClassNotFoundException, SQLException, IOException {
/* Create Connection objects */
Class.forName("oracle.jdbc.OracleDriver");
String jdbc_insert_sql = "INSERT INTO METATADA_AUTOSYS"
+ "(MACH_NAME,JOB_NAME,SCRIPT_COMMAND,APPLICATION_NAME,JOB_ID,STATUS,CREATE_DATE,LAST_START_DT,LAST_END_DT) VALUES"
+ "(?,?,?,?,?,?,?,?,?)";
int totalRecords = 0;
final int batchSize = 5000;
/* Read CSV file in OpenCSV */
String inputCSVFile = "C:/Users/conwacx/Desktop/meta_auto_v3/Autosys_Metadata.csv";
try (CSVReader reader = new CSVReader(new FileReader(inputCSVFile))) {
try (Connection conn = DriverManager.getConnection("jdbc:oracle:thin:@HoSt", "UsErNaMe", "PaSsWoRd")) {
try (PreparedStatement sql_statement = conn.prepareStatement(jdbc_insert_sql);) {
String[] nextLine;
int lnNum = 0;
// loop file , add records to batch
try {
while ((nextLine = reader.readNext()) != null) {
lnNum++;
/* Bind CSV file input to table columns */
sql_statement.setString(1, nextLine[0]);
sql_statement.setString(2, nextLine[1]);
sql_statement.setString(3, nextLine[2]);
sql_statement.setString(4, nextLine[3]);
sql_statement.setString(5, nextLine[4]);
sql_statement.setString(6, nextLine[5]);
sql_statement.setObject(7, nextLine[6]); // was nextLine[5], which bound the STATUS field twice and skipped this column
sql_statement.setString(8, nextLine[7]);
sql_statement.setString(9, nextLine[8]);
sql_statement.addBatch();
// Add the record to batch
if (lnNum >= batchSize) {
// Perform a bulk batch insert
totalRecords += doExecute(sql_statement);
lnNum = 0;
}
}
// insert the last rows
if (lnNum > 0) {
totalRecords += doExecute(sql_statement);
}
} catch (SQLException e) {
e.printStackTrace();
}
}
System.out.println("Total records inserted in bulk from CSV file " + totalRecords);
/* COMMIT transaction */
conn.commit();
}
}
}
private int doExecute(PreparedStatement sql_statement) {
int totalRecords = 0;
int[] results = null;
try {
results = sql_statement.executeBatch();
for (int i = 0; i < results.length; i++) {
totalRecords += results[i];
}
} catch (BatchUpdateException e) {
// handle exception for failed records here
results = e.getUpdateCounts();
for (int i = 0; i < results.length; i++) {
totalRecords += results[i];
}
} catch (SQLException ex) {
ex.printStackTrace();
}
return totalRecords;
}

Error in inserting data into Database from CSV

I'm trying to import data from csv to database. Below is my code.
The problem is while importing some files it shows below error and other gets loaded successfully.
java.sql.SQLException: syntax error, unexpected STYLE_, expecting identifier_chain2 or OFFSET_ [line 1, column 303] (Session: 1596182455914086215)
at com.exasol.jdbc.ExceptionFactory.createSQLException(ExceptionFactory.java:164)
at com.exasol.jdbc.ExceptionFactory.createSQLException(ExceptionFactory.java:21)
at com.exasol.jdbc.AbstractEXAPreparedStatement.<init>(AbstractEXAPreparedStatement.java:62)
at com.exasol.jdbc.AbstractEXAPreparedStatement_14.<init>(AbstractEXAPreparedStatement_14.java:14)
at com.exasol.jdbc.EXAPreparedStatement.<init>(EXAPreparedStatement.java:12)
at com.exasol.jdbc.DialectGeneric.createPreparedStatement(DialectGeneric.java:10)
at com.exasol.jdbc.AbstractEXAConnection.prepareStatement(AbstractEXAConnection.java:578)
Code:
import java.io.FileReader;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.Statement;
import com.opencsv.CSVReader;
public class ImportCsv
{
public static void main(String[] args)
{
readCsv();
readCsvUsingLoad();
}
private static void readCsv()
{
try (CSVReader reader = new CSVReader(new FileReader("upload.csv"), ',');
Connection connection = DBConnection.getConnection();)
{
String insertQuery = "Insert into txn_tbl (txn_id,txn_amount, card_number, terminal_id) values (null,?,?,?)";
PreparedStatement pstmt = connection.prepareStatement(insertQuery);
String[] rowData = null;
int i = 0;
while((rowData = reader.readNext()) != null)
{
for (String data : rowData)
{
pstmt.setString((i % 3) + 1, data);
if (++i % 3 == 0)
pstmt.addBatch();// add batch
if (i % 30 == 0)// insert when the batch size is 10
pstmt.executeBatch();
}
}
System.out.println("Data Successfully Uploaded");
}
catch (Exception e)
{
e.printStackTrace();
}
}
private static void readCsvUsingLoad()
{
try (Connection connection = DBConnection.getConnection())
{
String loadQuery = "LOAD DATA LOCAL INFILE '" + "C:\\upload.csv" + "' INTO TABLE txn_tbl FIELDS TERMINATED BY ','" + " LINES TERMINATED BY '\n' (txn_amount, card_number, terminal_id) ";
System.out.println(loadQuery);
Statement stmt = connection.createStatement();
stmt.execute(loadQuery);
}
catch (Exception e)
{
e.printStackTrace();
}
}
}
May I know what this error is trying to say? It doesn't seem to be a data problem, because when I compare with other files that load successfully there is no difference.
Any suggestions? For this error, what needs to be looked at?

Doesn't insert csv file records in java (jdbc)

I have an error where my CSVLoader does not insert the data into my database. This is what I get in my console:
FieldName (UploadDownloadFileServlet) = fileName
FileName (UploadDownloadFileServlet) = prueba2.csv
ContentType (UploadDownloadFileServlet) = application/vnd.ms-excel
Size in bytes (UploadDownloadFileServlet) = 47
Absolute Path at server (UploadDownloadFileServlet) = C:\Users\SISTEMAS\workspaceEclipse\.metadata\.plugins\org.eclipse.wst.server.core\tmp0\wtpwebapps\SAC-sin-impresion\uploadedCsvFiles\prueba2.csv
Query: INSERT INTO MATRICULA(C:UsersSISTEMASworkspaceEclipse.metadata.pluginsorg.eclipse.wst.server.coretmp0wtpwebappsSAC-sin-impresionuploadedCsvFilesprueba2.csv) VALUES(?)
[CSVLoader]: 0 records loaded into MATRICULA DB table
THIS IS MY CODE:
public void loadCSV(InputStream csvFile, String tableName,
boolean truncateBeforeLoad) throws Exception {
CSVReader csvReader = null;
if(null == this.connection) {
throw new Exception("Not a valid connection.");
}
try {
/* Modified by rammar.
*
* I was having issues with the CSVReader using the "\" to escape characters.
* A MySQL CSV file contains quote-enclosed fields and non-quote-enclosed NULL
* values written as "\N". The CSVReader was removing the "\". To detect "\N"
* I must remove the escape character, and the only character you can replace
* it with that you are pretty much guaranteed will not be used to escape
* text is '\0'.
* I read this on:
* http://stackoverflow.com/questions/6008395/opencsv-in-java-ignores-backslash-in-a-field-value
* based on:
* http://sourceforge.net/p/opencsv/support-requests/5/
*/
// PREVIOUS VERSION: csvReader = new CSVReader(new FileReader(csvFile), this.seprator);
csvReader = new CSVReader(new InputStreamReader(csvFile), this.seprator, '"', '\0');
} catch (Exception e) {
e.printStackTrace();
throw new Exception("Error occurred while reading file. "
+ e.getMessage());
}
String[] headerRow = csvReader.readNext();
if (null == headerRow) {
throw new FileNotFoundException(
"No columns defined in given CSV file." +
"Please check the CSV file format.");
}
String questionmarks = StringUtils.repeat("?,", headerRow.length);
questionmarks = (String) questionmarks.subSequence(0, questionmarks
.length() - 1);
/* NOTE from Ron: Header column names must match SQL table fields */
String query = SQL_INSERT.replaceFirst(TABLE_REGEX, tableName);
query = query
.replaceFirst(KEYS_REGEX, StringUtils.join(headerRow, ","));
query = query.replaceFirst(VALUES_REGEX, questionmarks);
System.out.println("Query: " + query); // Modified by rammar to suppress output
String[] nextLine;
Connection con = null;
PreparedStatement ps = null;
try {
con = this.connection;
con.setAutoCommit(false);
ps = con.prepareStatement(query);
if(truncateBeforeLoad) {
//delete data from table before loading csv
con.createStatement().execute("DELETE FROM " + tableName);
}
final int batchSize = 1000;
int count = 0;
Date date = null;
while ((nextLine = csvReader.readNext()) != null) {
if (null != nextLine) {
int index = 1;
for (String string : nextLine) {
date = DateUtil.convertToDate(string);
if (null != date) {
ps.setDate(index++, new java.sql.Date(date
.getTime()));
} else {
/* Section modified by rammar to allow NULL values
* to be input into the DB. */
if (string.length() > 0 && !string.equals("\\N")) {
ps.setString(index++, string);
} else {
ps.setNull(index++, Types.VARCHAR);
//ps.setString(index++, null); // can use this syntax also - not sure which is better
}
}
}
ps.addBatch();
}
if (++count % batchSize == 0) {
ps.executeBatch();
}
}
ps.executeBatch(); // insert remaining records
System.out.println("[" + this.getClass().getSimpleName() + "]: " +
count + " records loaded into " + tableName + " DB table");
con.commit();
} catch (Exception e) {
con.rollback();
e.printStackTrace();
throw new Exception(
"Error occurred while loading data from file to database."
+ e.getMessage());
} finally {
/*if (null != ps)
ps.close();
*/
/*if (null != con)
con.close();*/
csvReader.close();
}
}

Java CSVReader ignore commas in double quotes

I have a CSV file that I am having trouble parsing. I am using the opencsv library. Here is what my data looks like and what I am trying to achieve.
RPT_PE,CLASS,RPT_MKT,PROV_CTRCT,CENTER_NM,GK_TY,MBR_NM,MBR_PID
"20150801","NULL","33612","00083249P PCP602","JOE SMITH ARNP","NULL","FRANK, LUCAS E","50004655200"
The issue I am having is that the member name ("FRANK, LUCAS E") is being split into two columns when it should be one. Again, I'm using opencsv with a comma as the separator. Is there any way I can ignore the commas inside the double quotes?
public void loadCSV(String csvFile, String tableName,
boolean truncateBeforeLoad) throws Exception {
CSVReader csvReader = null;
if (null == this.connection) {
throw new Exception("Not a valid connection.");
}
try {
csvReader = new CSVReader(new FileReader(csvFile), this.seprator);
} catch (Exception e) {
e.printStackTrace();
throw new Exception("Error occurred while reading file. "
+ e.getMessage());
}
String[] headerRow = csvReader.readNext();
if (null == headerRow) {
throw new FileNotFoundException(
"No columns defined in given CSV file."
+ "Please check the CSV file format.");
}
String questionmarks = StringUtils.repeat("?,", headerRow.length);
questionmarks = (String) questionmarks.subSequence(0, questionmarks
.length() - 1);
String query = SQL_INSERT.replaceFirst(TABLE_REGEX, tableName);
System.out.println("Base Query: " + query);
String headerRowMod = Arrays.toString(headerRow).replaceAll(", ]", "]");
String[] strArray = headerRowMod.split(",");
query = query
.replaceFirst(KEYS_REGEX, StringUtils.join(strArray, ","));
System.out.println("Add Headers: " + query);
query = query.replaceFirst(VALUES_REGEX, questionmarks);
System.out.println("Add questionmarks: " + query);
String[] nextLine;
Connection con = null;
PreparedStatement ps = null;
try {
con = this.connection;
con.setAutoCommit(false);
ps = con.prepareStatement(query);
if (truncateBeforeLoad) {
//delete data from table before loading csv
con.createStatement().execute("DELETE FROM " + tableName);
}
final int batchSize = 1000;
int count = 0;
Date date = null;
while ((nextLine = csvReader.readNext()) != null) {
System.out.println("Next Line: " + Arrays.toString(nextLine));
if (null != nextLine) {
int index = 1;
for (String string : nextLine) {
date = DateUtil.convertToDate(string);
if (null != date) {
ps.setDate(index++, new java.sql.Date(date
.getTime()));
} else {
ps.setString(index++, string);
}
}
ps.addBatch();
}
if (++count % batchSize == 0) {
ps.executeBatch();
}
}
ps.executeBatch(); // insert remaining records
con.commit();
} catch (SQLException | IOException e) {
con.rollback();
e.printStackTrace();
throw new Exception(
"Error occurred while loading data from file to database."
+ e.getMessage());
} finally {
if (null != ps) {
ps.close();
}
if (null != con) {
con.close();
}
csvReader.close();
}
}
public char getSeprator() {
return seprator;
}
public void setSeprator(char seprator) {
this.seprator = seprator;
}
public char getQuoteChar() {
return quoteChar;
}
public void setQuoteChar(char quoteChar) {
this.quoteChar = quoteChar;
}
}
Did you try the following?
CSVReader reader = new CSVReader(new FileReader("yourfile.csv"), ',');
I wrote the following program and it works for me; I got the following result:
[20150801] [NULL] [33612] [00083249P PCP602] [JOE SMITH ARNP] [NULL]
[FRANK, LUCAS E] [50004655200]
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import au.com.bytecode.opencsv.CSVReader;
public class CVSTest {
/**
* @param args
*/
public static void main(String[] args) {
CSVReader reader = null;
try {
reader = new CSVReader(new FileReader(
"C:/Work/Dev/Projects/Pure_Test/Test/src/cvs"), ',');
} catch (FileNotFoundException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
String[] nextLine;
try {
while ((nextLine = reader.readNext()) != null) {
// nextLine[] is an array of values from the line
System.out.println("[" + nextLine[0] + "] [" + nextLine[1]
+ "] [" + nextLine[2] + "] [" + nextLine[3] + "] ["
+ nextLine[4] + "] [" + nextLine[5] + "] ["
+ nextLine[6] + "] [" + nextLine[7] + "]");
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
According to the documentation, you can supply custom separator and quote characters in the constructor, which should deal with it:
CSVReader(Reader reader, char separator, char quotechar)
Construct your reader with , as separator and " as quotechar.
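For example (the file name is a placeholder):
CSVReader reader = new CSVReader(new FileReader("yourfile.csv"), ',', '"');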
It is simple to load your CSV as an SQL table into HSQLDB, then select rows from the table to insert into another database. HSQLDB handles commas inside quotes. You need to define your text source as "quoted". See this:
http://hsqldb.org/doc/2.0/guide/texttables-chapt.html
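A rough sketch of that approach, assuming the HSQLDB driver is on the classpath; the table layout mirrors the sample header, and the source options (ignore_first, quoted) are the ones described in the guide above:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class HsqldbTextTableSketch {
    public static void main(String[] args) throws Exception {
        try (Connection c = DriverManager.getConnection("jdbc:hsqldb:file:csvdb", "SA", "");
             Statement st = c.createStatement()) {
            // A TEXT table is backed directly by a CSV file on disk.
            st.execute("CREATE TEXT TABLE staging (RPT_PE VARCHAR(20), CLASS VARCHAR(20), "
                    + "RPT_MKT VARCHAR(20), PROV_CTRCT VARCHAR(40), CENTER_NM VARCHAR(60), "
                    + "GK_TY VARCHAR(20), MBR_NM VARCHAR(60), MBR_PID VARCHAR(20))");
            // quoted=true keeps "FRANK, LUCAS E" together; ignore_first skips the header row.
            st.execute("SET TABLE staging SOURCE 'yourfile.csv;ignore_first=true;quoted=true'");
            try (ResultSet rs = st.executeQuery("SELECT MBR_NM FROM staging")) {
                while (rs.next()) {
                    System.out.println(rs.getString(1));
                }
            }
        }
    }
}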
Your case should be handled out of the box with no special configuration required.
If you can't make it work, then just switch to uniVocity-parsers to do this for you - it's twice as fast in comparison to OpenCSV, requires much less code and is packed with features.
CsvParserSettings settings = new CsvParserSettings(); // you have many configuration options here - check the tutorial.
CsvParser parser = new CsvParser(settings);
List<String[]> allRows = parser.parseAll(new FileReader(new File("C:/Work/Dev/Projects/Pure_Test/Test/src/cvs")));
Disclosure: I am the author of this library. It's open-source and free (Apache V2.0 license).

How to improve the speed of this code?

I'm trying to import all googlebooks-1gram files into a postgresql database. I wrote the following Java code for that:
public class ToPostgres {
public static void main(String[] args) throws Exception {
String filePath = "./";
List<String> files = new ArrayList<String>();
for (int i =0; i < 10; i++) {
files.add(filePath+"googlebooks-eng-all-1gram-20090715-"+i+".csv");
}
Connection c = null;
try {
c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
"postgres", "xxxxxx");
} catch (SQLException e) {
e.printStackTrace();
}
if (c != null) {
try {
PreparedStatement wordInsert = c.prepareStatement(
"INSERT INTO words (word) VALUES (?)", Statement.RETURN_GENERATED_KEYS
);
PreparedStatement countInsert = c.prepareStatement(
"INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
"VALUES (?,?,?,?,?)"
);
String lastWord = "";
Long lastId = -1L;
for (String filename: files) {
BufferedReader input = new BufferedReader(new FileReader(new File(filename)));
String line = "";
while ((line = input.readLine()) != null) {
String[] data = line.split("\t");
Long id = -1L;
if (lastWord.equals(data[0])) {
id = lastId;
} else {
wordInsert.setString(1, data[0]);
wordInsert.executeUpdate();
ResultSet resultSet = wordInsert.getGeneratedKeys();
if (resultSet != null && resultSet.next())
{
id = resultSet.getLong(1);
}
}
countInsert.setLong(1, id);
countInsert.setInt(2, Integer.parseInt(data[1]));
countInsert.setInt(3, Integer.parseInt(data[2]));
countInsert.setInt(4, Integer.parseInt(data[3]));
countInsert.setInt(5, Integer.parseInt(data[4]));
countInsert.executeUpdate();
lastWord = data[0];
lastId = id;
}
}
} catch (SQLException e) {
e.printStackTrace();
}
}
}
}
However, after running this for ~3 hours it had only placed 1,000,000 entries in the wordcounts table. The entire 1-gram dataset is 500,000,000 lines, so importing everything would take about 62.5 days. I could accept it importing in about a week, but 2 months? I think I'm doing something seriously wrong here. (I do have a server that runs 24/7, so I can actually run it for this long, but faster would be nice.)
EDIT: This code is how I solved it:
public class ToPostgres {
public static void main(String[] args) throws Exception {
String filePath = "./";
List<String> files = new ArrayList<String>();
for (int i =0; i < 10; i++) {
files.add(filePath+"googlebooks-eng-all-1gram-20090715-"+i+".csv");
}
Connection c = null;
try {
c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
"postgres", "xxxxxx");
} catch (SQLException e) {
e.printStackTrace();
}
if (c != null) {
c.setAutoCommit(false);
try {
PreparedStatement wordInsert = c.prepareStatement(
"INSERT INTO words (id, word) VALUES (?,?)"
);
PreparedStatement countInsert = c.prepareStatement(
"INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
"VALUES (?,?,?,?,?)"
);
String lastWord = "";
Long id = 0L;
for (String filename: files) {
BufferedReader input = new BufferedReader(new FileReader(new File(filename)));
String line = "";
int i = 0;
while ((line = input.readLine()) != null) {
String[] data = line.split("\t");
if (!lastWord.equals(data[0])) {
id++;
wordInsert.setLong(1, id);
wordInsert.setString(2, data[0]);
wordInsert.executeUpdate();
}
countInsert.setLong(1, id);
countInsert.setInt(2, Integer.parseInt(data[1]));
countInsert.setInt(3, Integer.parseInt(data[2]));
countInsert.setInt(4, Integer.parseInt(data[3]));
countInsert.setInt(5, Integer.parseInt(data[4]));
countInsert.executeUpdate();
lastWord = data[0];
if (i % 10000 == 0) {
c.commit();
}
if (i % 100000 == 0) {
System.out.println(i+" mark file "+filename);
}
i++;
}
c.commit();
}
} catch (SQLException e) {
e.printStackTrace();
}
}
}
}
I reached 1.5 million rows in about 15 minutes now. That's fast enough for me, thanks all!
JDBC connections have autocommit enabled by default, which carries a per-statement overhead. Try disabling it:
c.setAutoCommit(false)
then commit in batches, something along the lines of:
long ops = 0;
for(String filename : files) {
// ...
while ((line = input.readLine()) != null) {
// insert some stuff...
ops ++;
if(ops % 1000 == 0) {
c.commit();
}
}
}
c.commit();
If your table has indexes, it might be faster to delete them, insert the data, and recreate the indexes later.
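For instance, as a fragment against the connection c in your code (the index name and definition are assumptions, not taken from your schema):

// Drop the index before the bulk load and recreate it afterwards.
try (Statement st = c.createStatement()) {
    st.execute("DROP INDEX IF EXISTS wordcounts_word_id_idx");
    // ... run the whole bulk load here ...
    st.execute("CREATE INDEX wordcounts_word_id_idx ON wordcounts (word_id)");
}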
Setting autocommit off, and doing a manual commit every 10,000 records or so (look into the documentation for a reasonable value - there is some limit), could speed things up as well.
Generating the index/foreign key yourself, and keeping track of it, should be faster than wordInsert.getGeneratedKeys(), but I'm not sure whether it is possible with your content.
There is an approach called 'bulk insert'. I don't remember the details, but it's a starting point for a search.
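On PostgreSQL, the usual bulk-insert route from JDBC is the COPY protocol exposed by the driver's CopyManager; a sketch, assuming a staging table whose columns match the tab-separated 1-gram files:

import java.io.FileReader;
import java.sql.Connection;
import java.sql.DriverManager;
import org.postgresql.PGConnection;
import org.postgresql.copy.CopyManager;

public class CopyInSketch {
    public static void main(String[] args) throws Exception {
        try (Connection c = DriverManager.getConnection(
                "jdbc:postgresql://localhost/googlebooks", "postgres", "xxxxxx")) {
            CopyManager copy = c.unwrap(PGConnection.class).getCopyAPI();
            // COPY's default text format expects tab-separated lines, which matches
            // the 1-gram files; one command streams the whole file in a single call.
            long rows = copy.copyIn("COPY staging FROM STDIN",
                    new FileReader("googlebooks-eng-all-1gram-20090715-0.csv"));
            System.out.println(rows + " rows copied");
        }
    }
}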
Write it to use threading, running 4 threads at the same time, or split it up into sections (read from a config file) and distribute it to X machines and have them load the data together.
Use batch statements to execute multiple inserts at the same time, rather than one INSERT at a time.
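For instance, the countInsert part of your loop could queue rows and flush periodically (a sketch against the code in the question; pending is a new int counter and 1000 is an arbitrary batch size):

countInsert.setLong(1, id);
countInsert.setInt(2, Integer.parseInt(data[1]));
countInsert.setInt(3, Integer.parseInt(data[2]));
countInsert.setInt(4, Integer.parseInt(data[3]));
countInsert.setInt(5, Integer.parseInt(data[4]));
countInsert.addBatch(); // queue instead of executeUpdate()
if (++pending % 1000 == 0) {
    countInsert.executeBatch(); // one round trip for 1000 queued inserts
}
// after the loop, call countInsert.executeBatch() once more for the remainder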
In addition, I would remove the part of your algorithm that updates the word count after each insert into the words table, and instead calculate all of the word counts once inserting the words is complete.
Another approach would be to do bulk inserts rather than single inserts. See this question Whats the fastest way to do a bulk insert into Postgres? for more information.
Create threads
public class ToPostgres {
// NOTE: these shared fields and PreparedStatements are used by all threads
// without synchronization, so this sketch is not thread-safe as written.
static String lastWord = "";
static Long lastId = -1L;
static PreparedStatement wordInsert;
static PreparedStatement countInsert;
public static void main(String[] args) throws Exception {
String filePath = "./";
List<String> files = new ArrayList<String>();
for (int i =0; i < 10; i++) {
files.add(filePath+"googlebooks-eng-all-1gram-20090715-"+i+".csv");
}
Connection c = null;
try {
c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
"postgres", "xxxxxx");
} catch (SQLException e) {
e.printStackTrace();
}
if (c != null) {
try {
wordInsert = c.prepareStatement(
"INSERT INTO words (word) VALUES (?)", Statement.RETURN_GENERATED_KEYS
);
countInsert = c.prepareStatement(
"INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
"VALUES (?,?,?,?,?)"
);
for (String filename: files) {
new MyThread(filename).start();
}
} catch (SQLException e) {
e.printStackTrace();
}
}
}
static class MyThread extends Thread {
String file;
public MyThread(String file) {
this.file = file;
}
@Override
public void run() {
try {
super.run();
BufferedReader input = new BufferedReader(new FileReader(new File(file)));
String line = "";
while ((line = input.readLine()) != null) {
String[] data = line.split("\t");
Long id = -1L;
if (lastWord.equals(data[0])) {
id = lastId;
} else {
wordInsert.setString(1, data[0]);
wordInsert.executeUpdate();
ResultSet resultSet = wordInsert.getGeneratedKeys();
if (resultSet != null && resultSet.next())
{
id = resultSet.getLong(1);
}
}
countInsert.setLong(1, id);
countInsert.setInt(2, Integer.parseInt(data[1]));
countInsert.setInt(3, Integer.parseInt(data[2]));
countInsert.setInt(4, Integer.parseInt(data[3]));
countInsert.setInt(5, Integer.parseInt(data[4]));
countInsert.executeUpdate();
lastWord = data[0];
lastId = id;
}
} catch (NumberFormatException e) {
e.printStackTrace();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (SQLException e) {
e.printStackTrace();
}
}
}
}
