Java- Cannot insert batch of 1000 to table - java

Let's say I have a program where I am inserting records into a MySQL table in a database in Java.
Instead of inserting row by row, I insert in batches of 1000 records. Using the executeBatch method doesn't seem to work, as it still inserts row by row.
Code(only the snippet):
// Inserts (path, word) rows into fulltext_ltat.indextable using a JDBC batch.
// NOTE(review): a new connection and statement are created on every call, which
// defeats batching when this method is invoked once per word (see main below).
public void readDataBase(String path,String word) throws Exception {
try {
Class.forName("com.mysql.jdbc.Driver");
connect = DriverManager
.getConnection("jdbc:mysql://126.32.3.20/fulltext_ltat?"
+ "user=root&password=root");
// "insert IGNORE" silently skips rows that would violate a unique key.
String sql="insert IGNORE into fulltext_ltat.indextable values (default,?, ?) ";
preparedStatement = connect.prepareStatement(sql);
for(int i=0;i<1000;i++) {
// The same (path, word) pair is bound on every iteration of this loop.
preparedStatement.setString(1, path);
preparedStatement.setString(2, word);
preparedStatement.addBatch();
// BUG: i % 1000 == 0 is true when i == 0, so the batch is flushed on the
// first iteration only (containing a single statement) and the remaining
// 999 queued statements are never executed. Use (i + 1) % 1000 == 0, or
// call executeBatch() once after the loop.
if (i % 1000 == 0) {
preparedStatement.executeBatch();
System.out.print("Add Thousand");
}
}
} catch (SQLException e) {
e.printStackTrace();
} finally {
// NOTE(review): if getConnection() throws, preparedStatement is still null
// here and close() raises a NullPointerException.
try {
preparedStatement.close();
connect.close();
}
catch (SQLException e) {
e.printStackTrace();
}
}
}
Code: Main method calling the above
public static void main(String[] args) throws Exception {
StopWatch stopwatch = new StopWatch();
stopwatch.start();
File folder = new File("D:\\PDF1");
File[] listOfFiles = folder.listFiles();
for (File file : listOfFiles) {
if (file.isFile()) {
HashSet<String> uniqueWords = new HashSet<>();
String path = "D:\\PDF1\\" + file.getName();
try (PDDocument document = PDDocument.load(new File(path))) {
if (!document.isEncrypted()) {
PDFTextStripper tStripper = new PDFTextStripper();
String pdfFileInText = tStripper.getText(document);
String lines[] = pdfFileInText.split("\\r?\\n");
for (String line : lines) {
String[] words = line.split(" ");
for (String word : words) {
uniqueWords.add(word)
;
}
}
// System.out.println(uniqueWords);
}
} catch (IOException e) {
System.err.println("Exception while trying to read pdf document - " + e);
}
Object[] words = uniqueWords.toArray();
MysqlAccessIndex connection = new MysqlAccessIndex();
for(int i = 1 ; i <= words.length - 1 ; i++ ) {
connection.readDataBase(path, words[i].toString());
}
System.out.println("Completed");
}
}
The moment I run the program, the if statement executes immediately, rather than waiting until 1000 records have been added before inserting into the db.
Am I doing anything wrong?

i % 1000 == 0 is true when i==0, so you only execute the batch in the first iteration of the loop.
You should execute the batch after the loop:
for (int i=0;i<1000;i++) {
preparedStatement.setString(1, path);
preparedStatement.setString(2, word);
preparedStatement.addBatch();
}
preparedStatement.executeBatch();
System.out.print("Add Thousand");
Now, if you had 10000 records, and you wanted to execute batch insert every 1000, you could write:
for (int i=0;i<10000;i++) {
preparedStatement.setString(1, path);
preparedStatement.setString(2, word);
preparedStatement.addBatch();
if ((i + 1) % 1000 == 0) {
preparedStatement.executeBatch();
System.out.print("Add Thousand");
}
}
EDIT: In order not to insert the same word multiple times to the table, pass an array to your method:
Change
for(int i = 1 ; i <= words.length - 1 ; i++ ) {
connection.readDataBase(path, words[i].toString());
}
to
connection.readDataBase(path, words);
and
public void readDataBase(String path,String word) throws Exception {
to
public void readDataBase(String path,String[] words) throws Exception {
and finally the batch insert loop would become:
for (int i=0;i<words.length;i++) {
preparedStatement.setString(1, path);
preparedStatement.setString(2, words[i]);
preparedStatement.addBatch();
if ((i + 1) % 1000 == 0) {
preparedStatement.executeBatch();
System.out.print("Add Thousand");
}
}
if (words.length % 1000 > 0) {
preparedStatement.executeBatch();
System.out.print("Add Remaining");
}

In the configuration property url add: allowMultiQueries=true

Related

Bulk insert from a csv to a table in an Oracle DB using Java

I am attempting to insert rows into a table in an Oracle DB using Java. I am reading a CSV file line by line using OpenCSV. The CSV has about 50000 rows and 9 columns. Here is some of my code:
/* Create Connection objects */
Class.forName ("oracle.jdbc.OracleDriver");
Connection conn = DriverManager.getConnection("jdbc:oracle:thin:#HoSt", "UsErNaMe", "PaSsWoRd");
PreparedStatement sql_statement = null;
String jdbc_insert_sql = "INSERT INTO METATADA_AUTOSYS"
+ "(MACH_NAME,JOB_NAME,SCRIPT_COMMAND,APPLICATION_NAME,JOB_ID,STATUS,CREATE_DATE,LAST_START_DT,LAST_END_DT) VALUES"
+ "(?,?,?,?,?,?,?,?,?)";
sql_statement = conn.prepareStatement(jdbc_insert_sql);
/* Read CSV file in OpenCSV */
String inputCSVFile = "C:/Users/conwacx/Desktop/meta_auto_v3/Autosys_Metadata.csv";
CSVReader reader = new CSVReader(new FileReader(inputCSVFile));
String [] nextLine;
int lnNum = 0;
int batchSize = 5000;
//loop file , add records to batch
try{
while ((nextLine = reader.readNext()) != null) {
lnNum++;
/* Bind CSV file input to table columns */
sql_statement.setString(1, nextLine[0]);
sql_statement.setString(2,nextLine[1]);
sql_statement.setString(3,nextLine[2]);
sql_statement.setString(4,nextLine[3]);
sql_statement.setString(5,nextLine[4]); //setInt(Integer.parseInt(nextLine[4].trim());
sql_statement.setString(6,nextLine[5]);
sql_statement.setObject(7, nextLine[5]);
sql_statement.setString(8,nextLine[7]);
sql_statement.setString(9,nextLine[8]);
sql_statement.addBatch();
// Add the record to batch
if (++batchSize % 5000 == 0){
sql_statement.executeBatch();
}
}
sql_statement.executeBatch();
}
catch(SQLException e){
e.printStackTrace();
}
//Perform a bulk batch insert
int[] totalRecords = new int[7];
try {
totalRecords = sql_statement.executeBatch();
} catch(BatchUpdateException e) {
//handle exception for failed records here
totalRecords = e.getUpdateCounts();
} catch(SQLException ex){
ex.printStackTrace();
}
System.out.println ("Total records inserted in bulk from CSV file " + totalRecords.length);
/* Close prepared statement */
sql_statement.close();
/* COMMIT transaction */
conn.commit();
/* Close connection */
conn.close();
}
I am not receiving an error while running this. It is printing:
Total records inserted in bulk from CSV file 0
The table is not being updated with the new values in Oracle. Any suggestions?
Execute sql_statement.executeBatch() only once if your batch size is reached.
executeBatch() is returning an array with the results (how many rows are affected).
So you have to add each element of the array to compute the total count.
The condition for executing the batch is also wrong.
I can't prove it, but I would change your example like this (only the changed section):
// Bulk-loads an Autosys metadata CSV into Oracle using JDBC batch inserts,
// executing a batch every batchSize rows and committing once at the end.
// Reader, connection and statement are managed with try-with-resources.
public void insertData() throws ClassNotFoundException, SQLException, IOException {
/* Create Connection objects */
Class.forName("oracle.jdbc.OracleDriver");
String jdbc_insert_sql = "INSERT INTO METATADA_AUTOSYS"
+ "(MACH_NAME,JOB_NAME,SCRIPT_COMMAND,APPLICATION_NAME,JOB_ID,STATUS,CREATE_DATE,LAST_START_DT,LAST_END_DT) VALUES"
+ "(?,?,?,?,?,?,?,?,?)";
int totalRecords = 0;
final int batchSize = 5000;
/* Read CSV file in OpenCSV */
String inputCSVFile = "C:/Users/conwacx/Desktop/meta_auto_v3/Autosys_Metadata.csv";
try (CSVReader reader = new CSVReader(new FileReader(inputCSVFile))) {
try (Connection conn = DriverManager.getConnection("jdbc:oracle:thin:#HoSt", "UsErNaMe", "PaSsWoRd")) {
try (PreparedStatement sql_statement = conn.prepareStatement(jdbc_insert_sql);) {
String[] nextLine;
int lnNum = 0;
// loop file , add records to batch
try {
while ((nextLine = reader.readNext()) != null) {
lnNum++;
/* Bind CSV file input to table columns */
sql_statement.setString(1, nextLine[0]);
sql_statement.setString(2, nextLine[1]);
sql_statement.setString(3, nextLine[2]);
sql_statement.setString(4, nextLine[3]);
sql_statement.setString(5, nextLine[4]);
sql_statement.setString(6, nextLine[5]);
// NOTE(review): column 7 is bound to nextLine[5] again and nextLine[6]
// is never used — looks like a copy-paste slip carried over from the
// question; verify the CSV-to-column mapping.
sql_statement.setObject(7, nextLine[5]);
sql_statement.setString(8, nextLine[7]);
sql_statement.setString(9, nextLine[8]);
sql_statement.addBatch();
// Add the record to batch
if (lnNum >= batchSize) {
// Perform a bulk batch insert
totalRecords += doExecute(sql_statement);
lnNum = 0;
}
}
// insert the last rows
// NOTE(review): lnNum >= 0 is always true, so this may execute an empty
// batch when the row count is an exact multiple of batchSize (harmless,
// but lnNum > 0 expresses the intent).
if ( lnNum >= 0 ) {
totalRecords += doExecute(sql_statement);
}
} catch (SQLException e) {
e.printStackTrace();
}
}
System.out.println("Total records inserted in bulk from CSV file " + totalRecords);
/* COMMIT transaction */
// NOTE(review): setAutoCommit(false) is never called; per the JDBC spec,
// commit() on an auto-commit connection may throw — confirm the intent.
conn.commit();
}
}
}
/**
 * Executes the pending JDBC batch on the given statement and returns the
 * number of rows reported as affected.
 *
 * On a BatchUpdateException the driver still reports per-statement results
 * via getUpdateCounts(), so the rows that did succeed are still counted.
 *
 * Entries in the result array are row counts only when they are >= 0;
 * Statement.SUCCESS_NO_INFO (-2) and Statement.EXECUTE_FAILED (-3) are
 * status markers, not counts. The original summed them raw, which silently
 * decremented the total — here they are skipped.
 *
 * @param sql_statement statement with a prepared batch ready to execute
 * @return total affected-row count of the successfully executed statements
 */
private int doExecute(PreparedStatement sql_statement) {
int totalRecords = 0;
int[] results = null;
try {
results = sql_statement.executeBatch();
} catch (BatchUpdateException e) {
// Some statements may still have executed; their counts are reported here.
results = e.getUpdateCounts();
} catch (SQLException ex) {
ex.printStackTrace();
}
if (results != null) {
for (int count : results) {
if (count >= 0) { // skip SUCCESS_NO_INFO / EXECUTE_FAILED markers
totalRecords += count;
}
}
}
return totalRecords;
}

Java - executebatch is not working when inserting records to db

I am trying to insert records into my table in MYSQL after extracting the words from a file and stored them in a hashset.
I tried using executeBatch() to insert into my db after collecting 500 records, but when the execution finished I checked my table and there were no records inserted at all.
Note: When I use executeUpdate() the records do show up in my table, but not with executeBatch() — and I want to insert in batches, not one by one.
May I know what I did wrong?
Code:
// Called once per word: opens a fresh connection and adds a single row to a
// brand-new batch.
// BUG: 'i' is a local variable re-initialised to 0 on every call, so after the
// single addBatch() below it is 1 and i % 500 == 0 is never true. The
// statement is then closed with the batch still pending, so nothing is ever
// written — which is why executeUpdate() appears to work but executeBatch()
// does not. The connection/statement must outlive the per-word calls (or the
// method must receive all the words at once) for batching to take effect.
public void readDataBase(String path,String word) throws Exception {
try {
int i=0;
// This will load the MySQL driver, each DB has its own driver
Class.forName("com.mysql.jdbc.Driver");
// Setup the connection with the DB
connect = DriverManager
.getConnection("jdbc:mysql://126.32.3.20/fulltext_ltat?"
+ "user=root&password=root");
// Statements allow to issue SQL queries to the database
// statement = connect.createStatement();
System.out.print("Connected");
preparedStatement = connect
.prepareStatement("insert IGNORE into fulltext_ltat.indextable values (default,?, ?) ");
preparedStatement.setString( 1, path);
preparedStatement.setString(2, word);
preparedStatement.addBatch();
i++;
// preparedStatement.executeUpdate();
// Never true in this method: i is always exactly 1 at this point.
if(i%500==0){
preparedStatement.executeBatch();
}
// Closing here discards the pending, never-executed batch.
preparedStatement.close();
// writeResultSet(resultSet);
} catch (Exception e) {
throw e;
} finally {
close();
}
}
This is my loop to call that method(words is just an array that contains the words which is going to be inserted to the table):
for(int i = 1 ; i <= words.length - 1 ; i++ ) {
connection.readDataBase(path, words[i].toString());
}
My main method:
public static void main(String[] args) throws Exception {
StopWatch stopwatch = new StopWatch();
stopwatch.start();
File folder = new File("D:\\PDF1");
File[] listOfFiles = folder.listFiles();
for (File file : listOfFiles) {
if (file.isFile()) {
HashSet<String> uniqueWords = new HashSet<>();
String path = "D:\\PDF1\\" + file.getName();
try (PDDocument document = PDDocument.load(new File(path))) {
if (!document.isEncrypted()) {
PDFTextStripper tStripper = new PDFTextStripper();
String pdfFileInText = tStripper.getText(document);
String lines[] = pdfFileInText.split("\\r?\\n");
for (String line : lines) {
String[] words = line.split(" ");
for (String word : words) {
uniqueWords.add(word)
;
}
}
// System.out.println(uniqueWords);
}
} catch (IOException e) {
System.err.println("Exception while trying to read pdf document - " + e);
}
Object[] words = uniqueWords.toArray();
MysqlAccessIndex connection = new MysqlAccessIndex();
for(int i = 1 ; i <= words.length - 1 ; i++ ) {
connection.readDataBase(path, words[i].toString());
}
System.out.println("Completed");
}
}
Your pattern for doing batch updates is off. You should be opening the connection and preparing the statement only once. Then, iterate multiple times, binding parameters, and add that statement to the batch.
// define a collection of paths and words somewhere
List<String> paths = new ArrayList<>();
List<String> words = new ArrayList<>();
try {
// presumably you only want to insert so many records
int LIMIT = 10000;
Class.forName("com.mysql.jdbc.Driver");
connect = DriverManager
.getConnection("jdbc:mysql://126.32.3.20/fulltext_ltat?"
+ "user=root&password=root");
String sql = "INSERT IGNORE INTO fulltext_ltat.indextable VALUES (default, ?, ?);";
preparedStatement = connect.prepareStatement(sql);
for (int i=0; i < LIMIT; ++i) {
preparedStatement.setString(1, paths.get(i));
preparedStatement.setString(2, word.get(i));
preparedStatement.addBatch();
if (i % 500 == 0) {
preparedStatement.executeBatch();
}
}
// execute remaining batches
preparedStatement.executeBatch();
}
catch (SQLException e) {
e.printStackTrace();
}
finally {
try {
preparedStatement.close();
connect.close();
}
catch (SQLException e) {
e.printStackTrace();
}
}
One key change I made here is to add logic for when you should stop doing inserts. Currently, your code looks to have an infinite loop, which means it would run forever. This is probably not what you were intending to do.
Where is your loop? Try this:
connect = DriverManager
.getConnection("jdbc:mysql://126.32.3.20/fulltext_ltat?"
+ "user=root&password=root&rewriteBatchedStatements=true");

Java - Improving performance for building up index table

I am working on a full text indexing using inverted file method where it extracts all the word in a document, and inserts each word one by one into my table in MYSQL.
So far, my program works perfectly fine but I am stuck in thinking how it could be optimize further to improve the time it takes to insert into db. I am aware inverted file has a disadvantage of slow time for building up the index table.
Here is my code:
public class IndexTest {
public static void main(String[] args) throws Exception {
StopWatch stopwatch = new StopWatch();
stopwatch.start();
File folder = new File("D:\\PDF1");
File[] listOfFiles = folder.listFiles();
for (File file : listOfFiles) {
if (file.isFile()) {
HashSet<String> uniqueWords = new HashSet<>();
String path = "D:\\PDF1\\" + file.getName();
try (PDDocument document = PDDocument.load(new File(path))) {
if (!document.isEncrypted()) {
PDFTextStripper tStripper = new PDFTextStripper();
String pdfFileInText = tStripper.getText(document);
String lines[] = pdfFileInText.split("\\r?\\n");
for (String line : lines) {
String[] words = line.split(" ");
for (String word : words) {
uniqueWords.add(word);
}
}
// System.out.println(uniqueWords);
}
} catch (IOException e) {
System.err.println("Exception while trying to read pdf document - " + e);
}
Object[] words = uniqueWords.toArray();
String unique = uniqueWords.toString();
// System.out.println(words[1].toString());
for(int i = 1 ; i <= words.length - 1 ; i++ ) {
MysqlAccessIndex connection = new MysqlAccessIndex();
connection.readDataBase(path, words[i].toString());
}
System.out.println("Completed");
}
}
stopwatch.stop();
long timeTaken = stopwatch.getTime();
System.out.println(timeTaken);
MYSQL connection:
public class MysqlAccessIndex {
public Connection connect = null;
public Statement statement = null;
public PreparedStatement preparedStatement = null;
public ResultSet resultSet = null;
public MysqlAccessIndex() throws Exception {
Class.forName("com.mysql.jdbc.Driver");
connect = DriverManager
.getConnection("jdbc:mysql://126.32.3.178/fulltext_ltat?"
+ "user=root&password=root123");
// statement = connect.createStatement();
System.out.print("Connected");
}
public void readDataBase(String path,String word) throws Exception {
try {
preparedStatement = connect
.prepareStatement("insert IGNORE into fulltext_ltat.test_text values (?, ?) ");
preparedStatement.setString(1, path);
preparedStatement.setString(2, word);
preparedStatement.executeUpdate();
} catch (Exception e) {
throw e;
} finally {
close();
}
}
Would it be possible to use some sort of multithreading — say, inserting three words into three rows at the same time — to speed up the insert process?
I would appreciate any suggestion.
I think solution to your problem - is to use bulk insert.
You could try to do something like this:
// Answer code: batches one INSERT per unique word for the given document path
// and sends them all to MySQL in a single executeBatch() round trip, instead
// of one executeUpdate() per word.
public void readDataBase(String path, HashSet<String> uniqueWords) throws Exception {
PreparedStatement preparedStatement;
try {
String compiledQuery = "insert IGNORE into fulltext_ltat.test_text values (?, ?) ";
preparedStatement = connect.prepareStatement(compiledQuery);
for(String word : uniqueWords) {
preparedStatement.setString(1, path);
preparedStatement.setString(2, word);
preparedStatement.addBatch();
}
// NOTE(review): 'start' and 'inserted' are assigned but never used, and the
// statement itself is never closed — presumably close() below tears down the
// connection; confirm.
long start = System.currentTimeMillis();
int[] inserted = preparedStatement.executeBatch();
} catch (Exception e) {
throw e;
} finally {
close();
}
}
Modify your readDataBase method to have HashSet<String> uniqueWords in params.
After that you should add preparedStatement.addBatch() call after each item to insert and execute preparedStatement.executeBatch() instead of preparedStatement.executeUpdate() in the end.
I hope it would help.

How can I retry the statements that were not executed after a batch execution fails?

I need to update a table with data from a CSV. All data is validated before the update takes place: a validation method (which is not presented below) checks whether some assumptions are true and "flags" the object as valid or invalid. I've already tested it a lot and it's working exactly as I want.
Even so, I would like to guarantee that all statements will be executed even if there's a failure in a batch, something I was not able to work out. If this happens, I want the batch in which the failed statement sits to be skipped, and the next one to be executed.
// Applies validated merge_parent_id updates to Person in JDBC batches,
// committing after each full batch and once more for the remainder.
public void updateTable(List<PersonBean> personList) {
Connection connection = null;
PreparedStatement ps = null;
String updateDBPersonSQL = "UPDATE Person set merge_parent_id = ? WHERE id = ?";
try {
logger.info("DATA UPDATING STARTED");
input = new FileInputStream("resources/propertiesFiles/applications.properties");
properties.load(input);
final int batchSize = Integer.parseInt(properties.getProperty("batchSize"));
connection = DBConnection.getConnection();
connection.setAutoCommit(false);
int validObj = 0;
ps = connection.prepareStatement(updateDBPersonSQL);
for (int i = 0; i < personList.size(); i++) {
PersonBean person = personList.get(i);
// Only rows pre-flagged "valid" by the (not shown) validation step are batched.
if (person.getValidationStatus().equals("valid")) {
ps.setInt(1, person.getMerge_parent_id());
ps.setInt(2, person.getId());
ps.addBatch();
validObj++;
if (validObj % batchSize == 0 && validObj != 0) {
ps.executeBatch();
connection.commit();
logger.info((batchSize) + " rows updated");
}
}
}
// Flush and commit whatever is left after the loop.
int [] batchCount = ps.executeBatch();
connection.commit();
logger.info(batchCount.length + " rows updated");
writeValidationStatusToCSV(personList);
} catch (BatchUpdateException e) {
int [] updateCount = e.getUpdateCounts();
for (int i = 0; i < updateCount.length; i++) {
if (updateCount[i] >= 0) {
logger.info(updateCount.length + " objects updated.");
} else if (updateCount[i] == Statement.EXECUTE_FAILED) {
// NOTE(review): the asker's literal placeholder (does not compile) —
// this is where the "skip this batch and continue" logic they are
// asking about would have to go.
?????
}
}
logger.error(updateCount.length);
logger.error("BatchUpdateException: " + e);
logger.error("getNextException: " + e.getNextException());
try {
// Rolls back only the uncommitted tail; earlier batches are already committed.
connection.rollback();
} catch (SQLException e1) {
logger.error("Rollback error: " + e1, e1);
}
} finally {
// NOTE(review): the statement is closed here but the connection never is.
if (ps!= null) {
try {
ps.close();
} catch (SQLException e) {
logger.info(e);
}
}
}
logger.info("DATA UPDATING FINISHED");
}
I saw a lot of material about how to handle the exception, but none explained or pointed me to the direction of how to retry the next Statements, it means, how to execute the next batch.
How do I manage to do this?
EDIT: I'm using Postgresql
I managed to retry the next batches by surrounding the batch execution with try and catch statements. This way I'm able to catch the BatchUpdateException and call a continue statement.
try {
ps.executeBatch();
connection.commit();
/*Some more code*/
} catch (BatchUpdateException e) {
connection.rollback();
/*Some more code*/
continue;
}
I also used some control logic to "flag" the statements and batches that were already executed and logged them, making it easier to troubleshoot if some statement fails.
Here is the full code:
// Accepted answer: wrapping each batch execution in its own try/catch lets a
// failed batch be rolled back and skipped while the following batches still
// run. BatchStatus entries and the per-person validationStatus flags record
// what succeeded, for the CSV report written at the end.
public void updateTable(List<PersonBean> personList) throws Exception {
logger.info("TABLE UPDATE STARTED");
List <PersonBean> personListValidated = createValidStmtList(personList);
Connection connection = null;
PreparedStatement ps = null;
String updatePersonSQL = "UPDATE Person SET merge_parent_id = ? WHERE id = ?";
input = new FileInputStream("resources/propertiesFiles/applications.properties");
properties.load(input);
final int batchSize = Integer.parseInt(properties.getProperty("batchSize"));
/*A list was used to "flag" the batches that were already executed. BatchStatus objs have only two parameters, number (incremented as the batches are being executed) and status (success or fail).*/
List <BatchStatus> batchStatusList = new ArrayList<BatchStatus>();
/*This variables will be used to help flag the batches and statements that were already executed.*/
int batchCount = 0;
int stmtAddedToBatchCount = 0;
try {
connection = DBConnection.getConnection();
connection.setAutoCommit(false);
ps = connection.prepareStatement(updatePersonSQL);
/*personListValidated contains the objects that will be updated in the table. Instead of doing the validation on the update method, I decomposed
* this part in other 2 methods, making it easier to control of the statements added to the batch.
*/
for (int i = 0; i < personListValidated.size(); i++) {
PersonBean personValid = personListValidated.get(i);
ps.setInt(1, personValid.getMerge_parent_id());
ps.setInt(2, personValid.getId());
ps.addBatch();
personValid.setToBatch("true");
stmtAddedToBatchCount++;
logger.info("Row added to batch (count: " + stmtAddedToBatchCount + ")");
if (stmtAddedToBatchCount % batchSize == 0) {
batchCount++;
try {
ps.executeBatch();
connection.commit();
// Mark the persons of this (successful) batch.
for (int j = stmtAddedToBatchCount - batchSize; j < stmtAddedToBatchCount; j++){
personValid = personListValidated.get(j);
personValid.setValidationStatus("success");
}
BatchStatus batchStatusObj = new BatchStatus(batchCount, "sucess");
batchStatusList.add(batchStatusObj);
logger.info(batchStatusList.get(batchCount - 1));
} catch (BatchUpdateException e) {
// Failed batch: roll it back, flag its persons, and keep going.
connection.rollback();
for (int j = stmtAddedToBatchCount - batchSize; j < stmtAddedToBatchCount; j++){
personValid = personListValidated.get(j);
personValid.setValidationStatus("fail");
}
BatchStatus batchStatusObj = new BatchStatus(batchCount, "fail");
batchStatusList.add(batchStatusObj);
logger.info(batchStatusList.get(batchCount - 1));
logger.error("Bacth execution fail: " + e, e);
// NOTE(review): redundant — this is already the last statement of the
// loop body. (Typos such as "Bacth" live in string literals, left as-is.)
continue;
}
}
}
} catch (SQLException e) {
logger.error(e, e);
}
int[] lastBatchCount = null;
/*Try and catch to handle the statements executed on the last batch*/
// NOTE(review): batchStatusList.size() * batchSize assumes every earlier
// batch (success or fail) held exactly batchSize statements; connection and
// ps are never closed in this version.
try {
lastBatchCount = ps.executeBatch();
connection.commit();
for (int j = batchStatusList.size() * batchSize; j < stmtAddedToBatchCount; j++){
PersonBean personValid = personListValidated.get(j);
personValid.setValidationStatus("success");
}
logger.info(lastBatchCount.length + " rows inserted on the last batch");
logger.info("Last batch excuted");
} catch (BatchUpdateException e) {
connection.rollback();
for (int j = batchStatusList.size() * batchSize; j < stmtAddedToBatchCount; j++){
PersonBean personValid = personListValidated.get(j);
personValid.setValidationStatus("fail");
}
logger.error("Last batch fail to execute: " + e, e);
}
writeValidationStatusToCSV(personList);
logger.info("TABLE UPDATE FINISHED");
}

How to improve the speed of this code?

I'm trying to import all googlebooks-1gram files into a postgresql database. I wrote the following Java code for that:
// Question code: imports the googlebooks 1-gram CSVs into Postgres one
// executeUpdate() at a time with auto-commit left on — one transaction per
// row, which is the cause of the slowness discussed below.
public class ToPostgres {
public static void main(String[] args) throws Exception {
String filePath = "./";
List<String> files = new ArrayList<String>();
for (int i =0; i < 10; i++) {
files.add(filePath+"googlebooks-eng-all-1gram-20090715-"+i+".csv");
}
Connection c = null;
try {
c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
"postgres", "xxxxxx");
} catch (SQLException e) {
e.printStackTrace();
}
if (c != null) {
try {
PreparedStatement wordInsert = c.prepareStatement(
"INSERT INTO words (word) VALUES (?)", Statement.RETURN_GENERATED_KEYS
);
PreparedStatement countInsert = c.prepareStatement(
"INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
"VALUES (?,?,?,?,?)"
);
String lastWord = "";
Long lastId = -1L;
for (String filename: files) {
// NOTE(review): the BufferedReader is never closed, nor are the
// statements or the connection (resource leaks).
BufferedReader input = new BufferedReader(new FileReader(new File(filename)));
String line = "";
while ((line = input.readLine()) != null) {
String[] data = line.split("\t");
Long id = -1L;
// Consecutive lines for the same word reuse the previous generated
// key instead of re-inserting the word.
if (lastWord.equals(data[0])) {
id = lastId;
} else {
wordInsert.setString(1, data[0]);
wordInsert.executeUpdate();
ResultSet resultSet = wordInsert.getGeneratedKeys();
if (resultSet != null && resultSet.next())
{
id = resultSet.getLong(1);
}
}
countInsert.setLong(1, id);
countInsert.setInt(2, Integer.parseInt(data[1]));
countInsert.setInt(3, Integer.parseInt(data[2]));
countInsert.setInt(4, Integer.parseInt(data[3]));
countInsert.setInt(5, Integer.parseInt(data[4]));
countInsert.executeUpdate();
lastWord = data[0];
lastId = id;
}
}
} catch (SQLException e) {
e.printStackTrace();
}
}
}
}
However, when running this for ~3 hours it only placed 1,000,000 entries in the wordcounts table. When I check the number of lines in the entire 1gram dataset, it's 500,000,000 lines. So importing everything would take about 62.5 days. I could accept it importing in about a week, but 2 months? I think I'm doing something seriously wrong here (I do have a server that runs 24/7, so I can actually run it for this long, but faster would be nice XD)
EDIT: This code is how I solved it:
// Accepted fix: auto-commit disabled, word ids assigned client-side (no
// getGeneratedKeys round trip per word), and a manual commit every 10000
// lines — one transaction per ~10000 rows instead of per statement.
public class ToPostgres {
public static void main(String[] args) throws Exception {
String filePath = "./";
List<String> files = new ArrayList<String>();
for (int i =0; i < 10; i++) {
files.add(filePath+"googlebooks-eng-all-1gram-20090715-"+i+".csv");
}
Connection c = null;
try {
c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
"postgres", "xxxxxx");
} catch (SQLException e) {
e.printStackTrace();
}
if (c != null) {
// Key change versus the question code: commits are now explicit.
c.setAutoCommit(false);
try {
PreparedStatement wordInsert = c.prepareStatement(
"INSERT INTO words (id, word) VALUES (?,?)"
);
PreparedStatement countInsert = c.prepareStatement(
"INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
"VALUES (?,?,?,?,?)"
);
String lastWord = "";
Long id = 0L;
for (String filename: files) {
// NOTE(review): the reader is never closed (leak).
BufferedReader input = new BufferedReader(new FileReader(new File(filename)));
String line = "";
int i = 0;
while ((line = input.readLine()) != null) {
String[] data = line.split("\t");
// New word: allocate the next client-side id and insert it.
// NOTE(review): assumes this importer is the only writer and equal
// words are adjacent in the input — confirm.
if (!lastWord.equals(data[0])) {
id++;
wordInsert.setLong(1, id);
wordInsert.setString(2, data[0]);
wordInsert.executeUpdate();
}
countInsert.setLong(1, id);
countInsert.setInt(2, Integer.parseInt(data[1]));
countInsert.setInt(3, Integer.parseInt(data[2]));
countInsert.setInt(4, Integer.parseInt(data[3]));
countInsert.setInt(5, Integer.parseInt(data[4]));
countInsert.executeUpdate();
lastWord = data[0];
// i == 0 on the first line, so this also issues a (no-op) commit up front.
if (i % 10000 == 0) {
c.commit();
}
if (i % 100000 == 0) {
System.out.println(i+" mark file "+filename);
}
i++;
}
// Commit the tail of each file.
c.commit();
}
} catch (SQLException e) {
e.printStackTrace();
}
}
}
}
I reached 1.5 million rows in about 15 minutes now. That's fast enough for me, thanks all!
JDBC connections have autocommit enabled by default, which carries a per-statement overhead. Try disabling it:
c.setAutoCommit(false)
then commit in batches, something along the lines of:
long ops = 0;
for(String filename : files) {
// ...
while ((line = input.readLine()) != null) {
// insert some stuff...
ops ++;
if(ops % 1000 == 0) {
c.commit();
}
}
}
c.commit();
If your table has indexes, it might be faster to delete them, insert the data, and recreate the indexes later.
Setting autocommit off, and doing a manual commit every 10 000 records or so (look into the documentation for a reasonable value - there is some limit) could speed up as well.
Generating the index/foreign key yourself, and keeping track of it should be faster than wordInsert.getGeneratedKeys(); but I'm not sure, whether it is possible from your content.
There is an approach called 'bulk insert'. I don't remember the details, but its a starting point for a search.
Write it to do threading, running 4 threads at the same time, or split it up into sections (read from a config file) and distribute it to X machines and have them gather the data together.
Use batch statements to execute multiple inserts at the same time, rather than one INSERT at a time.
In addition I would remove the part of your algorithm which updates the word count after each insert into the words table, instead just calculate all of the word counts once inserting the words is complete.
Another approach would be to do bulk inserts rather than single inserts. See this question Whats the fastest way to do a bulk insert into Postgres? for more information.
Create threads
String lastWord = "";
Long lastId = -1L;
PreparedStatement wordInsert;
PreparedStatement countInsert ;
// Answer suggesting one thread per input file.
// NOTE(review): 'main' here is not static, so the JVM will not use it as an
// entry point. The PreparedStatements it assigns (wordInsert/countInsert) are
// declared outside this class in the snippet and are then shared by every
// MyThread without synchronization — a JDBC statement is not safe for
// concurrent use. The threads are never joined and the connection is never
// closed.
public class ToPostgres {
public void main(String[] args) throws Exception {
String filePath = "./";
List<String> files = new ArrayList<String>();
for (int i =0; i < 10; i++) {
files.add(filePath+"googlebooks-eng-all-1gram-20090715-"+i+".csv");
}
Connection c = null;
try {
c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
"postgres", "xxxxxx");
} catch (SQLException e) {
e.printStackTrace();
}
if (c != null) {
try {
wordInsert = c.prepareStatement(
"INSERT INTO words (word) VALUES (?)", Statement.RETURN_GENERATED_KEYS
);
countInsert = c.prepareStatement(
"INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
"VALUES (?,?,?,?,?)"
);
// One thread per file, all sharing the statements prepared above.
for (String filename: files) {
new MyThread(filename). start();
}
} catch (SQLException e) {
e.printStackTrace();
}
}
}
}
class MyThread extends Thread{
String file;
public MyThread(String file) {
this.file = file;
}
#Override
public void run() {
try {
super.run();
BufferedReader input = new BufferedReader(new FileReader(new File(file)));
String line = "";
while ((line = input.readLine()) != null) {
String[] data = line.split("\t");
Long id = -1L;
if (lastWord.equals(data[0])) {
id = lastId;
} else {
wordInsert.setString(1, data[0]);
wordInsert.executeUpdate();
ResultSet resultSet = wordInsert.getGeneratedKeys();
if (resultSet != null && resultSet.next())
{
id = resultSet.getLong(1);
}
}
countInsert.setLong(1, id);
countInsert.setInt(2, Integer.parseInt(data[1]));
countInsert.setInt(3, Integer.parseInt(data[2]));
countInsert.setInt(4, Integer.parseInt(data[3]));
countInsert.setInt(5, Integer.parseInt(data[4]));
countInsert.executeUpdate();
lastWord = data[0];
lastId = id;
}
} catch (NumberFormatException e) {
e.printStackTrace();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (SQLException e) {
e.printStackTrace();
}
}

Categories

Resources