Java - Check if file is successfully indexed and update table status

Let's say I want to index a file. The file is stored in a filequeue table. The table structure looks like below:
UniqueID FilePath Status
1 C:\Folder1\abc.pdf Active
2 C:\Folder1\def.pdf Active
3 C:\Folder1\efg.pdf Error
There are four different statuses: Active, Processing, Success and Error (sketched as an enum below).
Active: the file has been inserted into the table and is pending indexing.
Processing: when the indexing process starts, the status is updated to Processing.
Success: after the indexing process is completed, the status should be updated to Success.
Error: the processing failed for some reason.
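As a side note, this lifecycle could be modelled with a small enum so the status strings are defined in one place. This is only an illustrative sketch (the statuses are stored as plain text in the table, and the code below uses 'Complete' rather than 'Success' for the final state):
public enum FileStatus {
    ACTIVE("Active"),         // inserted into filequeue, waiting to be indexed
    PROCESSING("Processing"), // indexing in progress
    COMPLETE("Complete"),     // indexing finished successfully
    ERROR("Error");           // indexing failed (e.g. the file is missing)

    private final String dbValue;

    FileStatus(String dbValue) {
        this.dbValue = dbValue;
    }

    public String dbValue() {
        return dbValue;
    }
}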
For some reason, let's say abc.pdf does not exist. When I scan the table, it retrieves every file path with status = Active, iterates over each one and runs the index function. During this process it updates the status to Processing and then to Complete if there are no issues.
However, it throws a FileNotFoundException on abc.pdf, which is fine since the file does not exist, but it still updates the status to Complete. It should update the status to Error instead.
I was thinking of using an if else statement and it looks like this:
public void doScan_DB() throws Exception {
boolean fileProcessStatus = false;
try {
Statement statement = con.connect().createStatement();
ResultSet rs = statement.executeQuery("select * from filequeue where Status='Active'");
while (rs.next()) {
// get the filepath of the PDF document
String path1 = rs.getString(2);
// while running the process, update status : Processing
updateProcess_DB();
// call the index function
Indexing conn = new Indexing();
conn.doScan(path1);
fileProcessStatus =true;
// After completing the process, update status: Complete
if(fileProcessStatus == true){
updateComplete_DB();
}else{
//call function to update status to error if index fails
}
}
}catch(SQLException|IOException e){
e.printStackTrace();
}
}
My doScan() method:
public void doScan(String path) throws Exception{
/* File folder = new File("D:\\PDF1");
File[] listOfFiles = folder.listFiles();
for (File file : listOfFiles) {
if (file.isFile()) {
// HashSet<String> uniqueWords = new HashSet<>();
String path = "D:\\PDF1\\" + file.getName();*/
ArrayList<String> list = new ArrayList<String>();
try (PDDocument document = PDDocument.load(new File(path))) {
if (!document.isEncrypted()) {
PDFTextStripper tStripper = new PDFTextStripper();
String pdfFileInText = tStripper.getText(document);
String lines[] = pdfFileInText.split("\\r?\\n");
for (String line : lines) {
String[] words = line.split(" ");
// words.replaceAll("([\\W]+$)|(^[\\W]+)", ""));
for (String word : words) {
// check if one or more special characters at end of string then remove OR
// check special characters in beginning of the string then remove
// uniqueWords.add(word.replaceAll("([\\W]+$)|(^[\\W]+)", ""));
list.add(word.replaceAll("([\\W]+$)|(^[\\W]+)", ""));
// uniqueWords.add(word.replaceAll("([\\W]+$)|(^[\\W]+)", ""));
}
}
}
} catch (IOException e) {
System.err.println("Exception while trying to read pdf document - " + e);
}
String[] words1 =list.toArray(new String[list.size()]);
// String[] words2 =uniqueWords.toArray(new String[uniqueWords.size()]);
// MysqlAccessIndex connection = new MysqlAccessIndex();
index(words1,path);
System.out.println("Completed");
}
updateError_DB():
public void updateError_DB(){
try{
Statement statement = con.connect().createStatement();
statement.execute("update filequeue SET STATUS ='Error' where STATUS ='Processing' ");
}catch(Exception e){
e.printStackTrace();
}
}
updateComplete_DB():
public void updateComplete_DB() {
try {
Statement statement = con.connect().createStatement();
statement.execute("update filequeue SET STATUS ='Complete' where STATUS ='Processing' ");
} catch (Exception e) {
e.printStackTrace();
}
}
However, it doesn't really fix the issue of updating the status correctly.
Is there a way to achieve what I want?

Here is the solution based on my understanding of your scenario.
doScan_DB() method:
public void doScan_DB() throws Exception {
try {
Statement statement = con.connect().createStatement();
ResultSet rs = statement.executeQuery("select * from filequeue where Status='Active'");
while (rs.next()) {
//Get the uniqueID of active filepath
String uniqueID = rs.getString(1);
// get the filepath of the PDF document
String path1 = rs.getString(2);
// while running the process, update status : Processing
updateProcess_DB(uniqueID);
// call the index function
Indexing conn = new Indexing();
if (conn.doScan(path1)) {
updateComplete_DB(uniqueID);
} else {
updateError_DB(uniqueID);
}
}
} catch (SQLException | IOException e) {
e.printStackTrace();
}
}
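Before moving on to doScan(), one optional refinement to the method above (a sketch, assuming con.connect() returns a plain java.sql.Connection as in the other snippets): wrapping the Statement and ResultSet in try-with-resources guarantees they are closed even when an exception is thrown.
public void doScan_DB() {
    String query = "select * from filequeue where Status='Active'";
    try (Statement statement = con.connect().createStatement();
         ResultSet rs = statement.executeQuery(query)) {
        while (rs.next()) {
            String uniqueID = rs.getString(1);
            String path1 = rs.getString(2);
            updateProcess_DB(uniqueID);
            Indexing conn = new Indexing();
            if (conn.doScan(path1)) {
                updateComplete_DB(uniqueID);
            } else {
                updateError_DB(uniqueID);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}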
doScan() method:
public boolean doScan(String path) {
/*
* File folder = new File("D:\\PDF1"); File[] listOfFiles = folder.listFiles();
*
* for (File file : listOfFiles) { if (file.isFile()) { // HashSet<String>
* uniqueWords = new HashSet<>();
*
* String path = "D:\\PDF1\\" + file.getName();
*/
ArrayList<String> list = new ArrayList<String>();
boolean isSuccess = true;
try {
File f = new File(path);
if (!f.exists()) {
isSuccess = false;
} else {
PDDocument document = PDDocument.load(f);
if (!document.isEncrypted()) {
PDFTextStripper tStripper = new PDFTextStripper();
String pdfFileInText = tStripper.getText(document);
String lines[] = pdfFileInText.split("\\r?\\n");
for (String line : lines) {
String[] words = line.split(" ");
// words.replaceAll("([\\W]+$)|(^[\\W]+)", ""));
for (String word : words) {
// check if one or more special characters at end of string then
// remove OR
// check special characters in beginning of the string then
// remove
// uniqueWords.add(word.replaceAll("([\\W]+$)|(^[\\W]+)", ""));
list.add(word.replaceAll("([\\W]+$)|(^[\\W]+)", ""));
// uniqueWords.add(word.replaceAll("([\\W]+$)|(^[\\W]+)", ""));
}
}
}
document.close(); // close the PDF once we're done with it
}
String[] words1 = list.toArray(new String[list.size()]);
// String[] words2 =uniqueWords.toArray(new String[uniqueWords.size()]);
// MysqlAccessIndex connection = new MysqlAccessIndex();
index(words1, path);
} catch (Exception e) {
System.err.println("Exception while trying to read pdf document - " + e);
isSuccess = false;
}
return isSuccess;
}
Now all the update methods. Here I'll be using a JDBC PreparedStatement in place of a JDBC Statement. I'm assuming your UniqueID is of int type.
Note: Make necessary changes according to your environment
public void updateComplete_DB(String uniqueID) {
try {
String sql="UPDATE filequeue SET STATUS ='Complete' WHERE STATUS ='Processing' AND UniqueID=?";
PreparedStatement statement = con.connect().prepareStatement(sql);
statement.setInt(1,Integer.parseInt(uniqueID));
int rows = statement.executeUpdate();
System.out.println("No. of rows updated: " + rows);
} catch (Exception e) {
e.printStackTrace();
}
}
public void updateProcess_DB(String uniqueID) {
try {
String sql="UPDATE filequeue SET STATUS ='Processing' WHERE UniqueID=?";
PreparedStatement statement = con.connect().prepareStatement(sql);
statement.setInt(1,Integer.parseInt(uniqueID));
int rows = statement.executeUpdate();
System.out.println("No. of rows updated: " + rows);
} catch (Exception e) {
e.printStackTrace();
}
}
public void updateError_DB(String uniqueID) {
try {
String sql="UPDATE filequeue SET STATUS ='Error' WHERE STATUS ='Processing' AND UniqueID=?";
PreparedStatement statement = con.connect().prepareStatement(sql);
statement.setInt(1,Integer.parseInt(uniqueID));
int rows = statement.executeUpdate();
System.out.println("No. of rows updated: " + rows);
} catch (Exception e) {
e.printStackTrace();
}
}
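Since these three methods differ only in the target status and whether they require the row to currently be in 'Processing', they can also be collapsed into a single parameterized helper. This is just a sketch, again assuming con.connect() returns a java.sql.Connection and UniqueID is an int column:
public void updateStatus_DB(String uniqueID, String newStatus, boolean requireProcessing) {
    String sql = requireProcessing
            ? "UPDATE filequeue SET STATUS = ? WHERE STATUS = 'Processing' AND UniqueID = ?"
            : "UPDATE filequeue SET STATUS = ? WHERE UniqueID = ?";
    try (PreparedStatement statement = con.connect().prepareStatement(sql)) {
        statement.setString(1, newStatus);
        statement.setInt(2, Integer.parseInt(uniqueID));
        int rows = statement.executeUpdate();
        System.out.println("No. of rows updated: " + rows);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
Usage would then be updateStatus_DB(uniqueID, "Processing", false), updateStatus_DB(uniqueID, "Complete", true) and updateStatus_DB(uniqueID, "Error", true).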

Related

Bulk insert from a csv to a table in an Oracle DB using Java

I am attempting to insert into a table in an Oracle DB using Java. I am reading a CSV file line by line using OpenCSV. The CSV is about 50000 rows and 9 columns. Here is some of my code:
/* Create Connection objects */
Class.forName ("oracle.jdbc.OracleDriver");
Connection conn = DriverManager.getConnection("jdbc:oracle:thin:#HoSt", "UsErNaMe", "PaSsWoRd");
PreparedStatement sql_statement = null;
String jdbc_insert_sql = "INSERT INTO METATADA_AUTOSYS"
+ "(MACH_NAME,JOB_NAME,SCRIPT_COMMAND,APPLICATION_NAME,JOB_ID,STATUS,CREATE_DATE,LAST_START_DT,LAST_END_DT) VALUES"
+ "(?,?,?,?,?,?,?,?,?)";
sql_statement = conn.prepareStatement(jdbc_insert_sql);
/* Read CSV file in OpenCSV */
String inputCSVFile = "C:/Users/conwacx/Desktop/meta_auto_v3/Autosys_Metadata.csv";
CSVReader reader = new CSVReader(new FileReader(inputCSVFile));
String [] nextLine;
int lnNum = 0;
int batchSize = 5000;
//loop file , add records to batch
try{
while ((nextLine = reader.readNext()) != null) {
lnNum++;
/* Bind CSV file input to table columns */
sql_statement.setString(1, nextLine[0]);
sql_statement.setString(2,nextLine[1]);
sql_statement.setString(3,nextLine[2]);
sql_statement.setString(4,nextLine[3]);
sql_statement.setString(5,nextLine[4]); //setInt(Integer.parseInt(nextLine[4].trim());
sql_statement.setString(6,nextLine[5]);
sql_statement.setObject(7, nextLine[5]);
sql_statement.setString(8,nextLine[7]);
sql_statement.setString(9,nextLine[8]);
sql_statement.addBatch();
// Add the record to batch
if (++batchSize % 5000 == 0){
sql_statement.executeBatch();
}
}
sql_statement.executeBatch();
}
catch(SQLException e){
e.printStackTrace();
}
//Perform a bulk batch insert
int[] totalRecords = new int[7];
try {
totalRecords = sql_statement.executeBatch();
} catch(BatchUpdateException e) {
//handle exception for failed records here
totalRecords = e.getUpdateCounts();
} catch(SQLException ex){
ex.printStackTrace();
}
System.out.println ("Total records inserted in bulk from CSV file " + totalRecords.length);
/* Close prepared statement */
sql_statement.close();
/* COMMIT transaction */
conn.commit();
/* Close connection */
conn.close();
}
I am not receiving an error while running this. It is printing:
Total records inserted in bulk from CSV file 0
The table is not being updated with the new values in Oracle. Any suggestions?
Execute sql_statement.executeBatch() only when your batch size is reached.
executeBatch() returns an array with the results (how many rows were affected by each statement).
So you have to add up the elements of the array to compute the total count.
The condition for executing the batch is also wrong.
I can't prove it, but I would change your example like this (only the changed section):
public void insertData() throws ClassNotFoundException, SQLException, IOException {
/* Create Connection objects */
Class.forName("oracle.jdbc.OracleDriver");
String jdbc_insert_sql = "INSERT INTO METATADA_AUTOSYS"
+ "(MACH_NAME,JOB_NAME,SCRIPT_COMMAND,APPLICATION_NAME,JOB_ID,STATUS,CREATE_DATE,LAST_START_DT,LAST_END_DT) VALUES"
+ "(?,?,?,?,?,?,?,?,?)";
int totalRecords = 0;
final int batchSize = 5000;
/* Read CSV file in OpenCSV */
String inputCSVFile = "C:/Users/conwacx/Desktop/meta_auto_v3/Autosys_Metadata.csv";
try (CSVReader reader = new CSVReader(new FileReader(inputCSVFile))) {
try (Connection conn = DriverManager.getConnection("jdbc:oracle:thin:#HoSt", "UsErNaMe", "PaSsWoRd")) {
try (PreparedStatement sql_statement = conn.prepareStatement(jdbc_insert_sql);) {
String[] nextLine;
int lnNum = 0;
// loop file , add records to batch
try {
while ((nextLine = reader.readNext()) != null) {
lnNum++;
/* Bind CSV file input to table columns */
sql_statement.setString(1, nextLine[0]);
sql_statement.setString(2, nextLine[1]);
sql_statement.setString(3, nextLine[2]);
sql_statement.setString(4, nextLine[3]);
sql_statement.setString(5, nextLine[4]);
sql_statement.setString(6, nextLine[5]);
sql_statement.setObject(7, nextLine[5]);
sql_statement.setString(8, nextLine[7]);
sql_statement.setString(9, nextLine[8]);
sql_statement.addBatch();
// Add the record to batch
if (lnNum >= batchSize) {
// Perform a bulk batch insert
totalRecords += doExecute(sql_statement);
lnNum = 0;
}
}
// insert the last rows
if ( lnNum >= 0 ) {
totalRecords += doExecute(sql_statement);
}
} catch (SQLException e) {
e.printStackTrace();
}
}
System.out.println("Total records inserted in bulk from CSV file " + totalRecords);
/* COMMIT transaction */
conn.commit();
}
}
}
private int doExecute(PreparedStatement sql_statement) {
int totalRecords = 0;
int[] results = null;
try {
results = sql_statement.executeBatch();
for (int i = 0; i < results.length; i++) {
totalRecords += results[i];
}
} catch (BatchUpdateException e) {
// handle exception for failed records here
results = e.getUpdateCounts();
for (int i = 0; i < results.length; i++) {
totalRecords += results[i];
}
} catch (SQLException ex) {
ex.printStackTrace();
}
return totalRecords;
}
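One caveat with the version above: conn.commit() at the end only works if auto-commit has been disabled first, since Connection.commit() is specified to throw an SQLException when the connection is still in auto-commit mode (the JDBC default). If you want the whole import to be one transaction, turn it off right after obtaining the connection:
// disable auto-commit so the work is committed once, at the end
conn.setAutoCommit(false);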

Java - executebatch is not working when inserting records to db

I am trying to insert records into my table in MySQL after extracting the words from a file and storing them in a HashSet.
I tried using executeBatch() to insert into my DB after getting 500 records, but when the execution finished I checked my table and there were no records inserted at all.
Note: When I use executeUpdate() the records do show up in my table, but not with executeBatch(), and I want to insert in batches, not one by one.
May I know what I did wrong?
Code:
public void readDataBase(String path,String word) throws Exception {
try {
// Result set get the result of the SQL query
int i=0;
// This will load the MySQL driver, each DB has its own driver
Class.forName("com.mysql.jdbc.Driver");
// Setup the connection with the DB
connect = DriverManager
.getConnection("jdbc:mysql://126.32.3.20/fulltext_ltat?"
+ "user=root&password=root");
// Statements allow to issue SQL queries to the database
// statement = connect.createStatement();
System.out.print("Connected");
// Result set get the result of the SQL query
preparedStatement = connect
.prepareStatement("insert IGNORE into fulltext_ltat.indextable values (default,?, ?) ");
preparedStatement.setString( 1, path);
preparedStatement.setString(2, word);
preparedStatement.addBatch();
i++;
// preparedStatement.executeUpdate();
if(i%500==0){
preparedStatement.executeBatch();
}
preparedStatement.close();
// writeResultSet(resultSet);
} catch (Exception e) {
throw e;
} finally {
close();
}
}
This is my loop to call that method (words is just an array that contains the words which are going to be inserted into the table):
for(int i = 1 ; i <= words.length - 1 ; i++ ) {
connection.readDataBase(path, words[i].toString());
}
My main method:
public static void main(String[] args) throws Exception {
StopWatch stopwatch = new StopWatch();
stopwatch.start();
File folder = new File("D:\\PDF1");
File[] listOfFiles = folder.listFiles();
for (File file : listOfFiles) {
if (file.isFile()) {
HashSet<String> uniqueWords = new HashSet<>();
String path = "D:\\PDF1\\" + file.getName();
try (PDDocument document = PDDocument.load(new File(path))) {
if (!document.isEncrypted()) {
PDFTextStripper tStripper = new PDFTextStripper();
String pdfFileInText = tStripper.getText(document);
String lines[] = pdfFileInText.split("\\r?\\n");
for (String line : lines) {
String[] words = line.split(" ");
for (String word : words) {
uniqueWords.add(word);
}
}
// System.out.println(uniqueWords);
}
} catch (IOException e) {
System.err.println("Exception while trying to read pdf document - " + e);
}
Object[] words = uniqueWords.toArray();
MysqlAccessIndex connection = new MysqlAccessIndex();
for(int i = 1 ; i <= words.length - 1 ; i++ ) {
connection.readDataBase(path, words[i].toString());
}
System.out.println("Completed");
}
}
}
Your pattern for doing batch updates is off. You should be opening the connection and preparing the statement only once. Then, iterate multiple times, binding parameters, and add that statement to the batch.
// define a collection of paths and words somewhere
List<String> paths = new ArrayList<>();
List<String> words = new ArrayList<>();
try {
// presumably you only want to insert so many records
int LIMIT = 10000;
Class.forName("com.mysql.jdbc.Driver");
connect = DriverManager
.getConnection("jdbc:mysql://126.32.3.20/fulltext_ltat?"
+ "user=root&password=root");
String sql = "INSERT IGNORE INTO fulltext_ltat.indextable VALUES (default, ?, ?);";
preparedStatement = connect.prepareStatement(sql);
for (int i=0; i < LIMIT; ++i) {
preparedStatement.setString(1, paths.get(i));
preparedStatement.setString(2, words.get(i));
preparedStatement.addBatch();
if (i % 500 == 0) {
preparedStatement.executeBatch();
}
}
// execute remaining batches
preparedStatement.executeBatch();
}
catch (SQLException e) {
e.printStackTrace();
}
finally {
try {
preparedStatement.close();
connect.close();
}
catch (SQLException e) {
e.printStackTrace();
}
}
One key change I made here is to add logic for when you should stop doing inserts. Currently, your code looks to have an infinite loop, which means it would run forever. This is probably not what you were intending to do.
Where is your loop? Try this:
connect = DriverManager
.getConnection("jdbc:mysql://126.32.3.20/fulltext_ltat?"
+ "user=root&password=root&rewriteBatchedStatements=true");

Java - Improving performance for building up index table

I am working on full-text indexing using the inverted file method, where it extracts all the words in a document and inserts each word one by one into my table in MySQL.
So far my program works perfectly fine, but I am stuck thinking about how it could be optimized further to improve the time it takes to insert into the DB. I am aware the inverted file method has the disadvantage of slow index build times.
Here is my code:
public class IndexTest {
public static void main(String[] args) throws Exception {
StopWatch stopwatch = new StopWatch();
stopwatch.start();
File folder = new File("D:\\PDF1");
File[] listOfFiles = folder.listFiles();
for (File file : listOfFiles) {
if (file.isFile()) {
HashSet<String> uniqueWords = new HashSet<>();
String path = "D:\\PDF1\\" + file.getName();
try (PDDocument document = PDDocument.load(new File(path))) {
if (!document.isEncrypted()) {
PDFTextStripper tStripper = new PDFTextStripper();
String pdfFileInText = tStripper.getText(document);
String lines[] = pdfFileInText.split("\\r?\\n");
for (String line : lines) {
String[] words = line.split(" ");
for (String word : words) {
uniqueWords.add(word);
}
}
// System.out.println(uniqueWords);
}
} catch (IOException e) {
System.err.println("Exception while trying to read pdf document - " + e);
}
Object[] words = uniqueWords.toArray();
String unique = uniqueWords.toString();
// System.out.println(words[1].toString());
for(int i = 1 ; i <= words.length - 1 ; i++ ) {
MysqlAccessIndex connection = new MysqlAccessIndex();
connection.readDataBase(path, words[i].toString());
}
System.out.println("Completed");
}
}
stopwatch.stop();
long timeTaken = stopwatch.getTime();
System.out.println(timeTaken);
}
}
MySQL connection:
public class MysqlAccessIndex {
public Connection connect = null;
public Statement statement = null;
public PreparedStatement preparedStatement = null;
public ResultSet resultSet = null;
public MysqlAccessIndex() throws Exception {
Class.forName("com.mysql.jdbc.Driver");
connect = DriverManager
.getConnection("jdbc:mysql://126.32.3.178/fulltext_ltat?"
+ "user=root&password=root123");
// statement = connect.createStatement();
System.out.print("Connected");
}
public void readDataBase(String path,String word) throws Exception {
try {
preparedStatement = connect
.prepareStatement("insert IGNORE into fulltext_ltat.test_text values (?, ?) ");
preparedStatement.setString(1, path);
preparedStatement.setString(2, word);
preparedStatement.executeUpdate();
} catch (Exception e) {
throw e;
} finally {
close();
}
}
}
Is it possible to use some sort of multithreading, say inserting three words into three rows at the same time, to speed up the insert process?
I would appreciate any suggestion.
I think the solution to your problem is to use bulk (batch) inserts.
You could try to do something like this:
public void readDataBase(String path, HashSet<String> uniqueWords) throws Exception {
PreparedStatement preparedStatement;
try {
String compiledQuery = "insert IGNORE into fulltext_ltat.test_text values (?, ?) ";
preparedStatement = connect.prepareStatement(compiledQuery);
for(String word : uniqueWords) {
preparedStatement.setString(1, path);
preparedStatement.setString(2, word);
preparedStatement.addBatch();
}
long start = System.currentTimeMillis();
int[] inserted = preparedStatement.executeBatch();
} catch (Exception e) {
throw e;
} finally {
close();
}
}
Modify your readDataBase method to take HashSet<String> uniqueWords as a parameter.
After that, call preparedStatement.addBatch() for each item to insert and execute preparedStatement.executeBatch() instead of preparedStatement.executeUpdate() at the end.
I hope it helps.
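For example, the calling loop from main would then pass the whole set in a single call instead of calling readDataBase once per word (a sketch, keeping the names used in the question):
// build uniqueWords for the current PDF as before, then:
MysqlAccessIndex connection = new MysqlAccessIndex();
connection.readDataBase(path, uniqueWords); // one batch per document instead of one INSERT per word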

Parsing empty cells in csv file and storing into mysql

I have a CSV file which has a few cells empty in some columns. Modifying the CSV file is not an option as it has around 50k total records.
Whenever I execute the code below it throws the error "Incorrect integer value: '' for column 'ParentId' at row 1". My database table allows NULL for the column 'ParentId', but it still throws the error. How can I edit this code so that it fills in the correct values without the error?
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.util.Date;
import org.apache.commons.lang.StringUtils;
import au.com.bytecode.opencsv.CSVReader;
public class CSVLoader {
static int count;
private static final
//String SQL_INSERT = "INSERT INTO ${table}(${keys}) VALUES(${values})";
String SQL_INSERT = "INSERT INTO ${table} VALUES(${values})";
private static final String TABLE_REGEX = "\\$\\{table\\}";
//private static final String KEYS_REGEX = "\\$\\{keys\\}";
private static final String VALUES_REGEX = "\\$\\{values\\}";
private Connection connection;
private char seprator;
/**
* Public constructor to build CSVLoader object with
* Connection details. The connection is closed on success
* or failure.
* @param connection
*/
public CSVLoader(Connection connection) {
this.connection = connection;
//Set default separator
this.seprator = ',';
}
/**
* Parse CSV file using OpenCSV library and load in
* given database table.
* @param csvFile Input CSV file
* @param tableName Database table name to import data
* @param truncateBeforeLoad Truncate the table before inserting
*        new records.
* @throws Exception
*/
public void loadCSV(String csvFile, String tableName,
boolean truncateBeforeLoad) throws Exception {
CSVReader csvReader = null;
if(null == this.connection) {
throw new Exception("Not a valid connection.");
}
try {
csvReader = new CSVReader(new FileReader(csvFile), this.seprator);
} catch (Exception e) {
e.printStackTrace();
throw new Exception("Error occured while executing file. "
+ e.getMessage());
}
//String[] headerRow = csvReader.readNext();
String[] headerRow = csvReader.readNext();
count++;
if (null == headerRow) {
throw new FileNotFoundException(
"No columns defined in given CSV file." +
"Please check the CSV file format.");
}
/*String questionmarks = StringUtils.repeat("?,", headerRow.length);
System.out.println(headerRow.length);
questionmarks = (String) questionmarks.subSequence(0, questionmarks
.length() - 1);
System.out.println(SQL_INSERT);
String query = SQL_INSERT.replaceFirst(TABLE_REGEX, tableName);
//query = query
// .replaceFirst(KEYS_REGEX, StringUtils.join(headerRow, ","));
query = query.replaceFirst(VALUES_REGEX, questionmarks);
System.out.println("Query: " + query);
*/
//Stirng str1 = "insert into posts values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
String[] nextLine;
Connection con = null;
PreparedStatement ps = null;
try {
con = this.connection;
con.setAutoCommit(false);
ps = con.prepareStatement("insert into posts (Id,PostTypeId,AcceptedAnswerId,ParentId,CreationDate,Score,ViewCount,Body,OwnerUserId,OwnerDisplayName,LastEditorUserId,LastEditorDisplayName,LastEditDate,LastActivityDate,Title,Tags,AnswerCount,CommentCount,FavoriteCount,ClosedDate,CommunityOwnedDate,RowNum) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)");
if(truncateBeforeLoad) {
//delete data from table before loading csv
con.createStatement().execute("DELETE FROM " + tableName);
}
final int batchSize = 1000;
int count = 0;
Date date = null;
while ((nextLine = csvReader.readNext()) != null) {
if (null != nextLine) {
int index = 1;
for (String string : nextLine) {
date = DateUtil.convertToDate(string);
if (null != date) {
ps.setDate(index++, new java.sql.Date(date
.getTime()));
} else {
ps.setString(index++, string);
}
}
System.out.println(count);
ps.addBatch();
System.out.println(count);
}
if (++count % batchSize == 0) {
System.out.println(count);
ps.executeBatch();
}
}
ps.executeBatch(); // insert remaining records
con.commit();
} catch (Exception e) {
con.rollback();
e.printStackTrace();
throw new Exception(
"Error occured while loading data from file to database."
+ e.getMessage());
} finally {
if (null != ps)
ps.close();
if (null != con)
con.close();
csvReader.close();
}
}
public char getSeprator() {
return seprator;
}
public void setSeprator(char seprator) {
this.seprator = seprator;
}
}
It might be the case that the int column is actually receiving an empty string or blank spaces; in that case you can place a manual check as below and still go ahead. Hope this answers your doubt.
// check if the value is null or blank spaces
if (certain_value == null || certain_value.trim().length() == 0) {
ps.setNull(4, java.sql.Types.INTEGER); // This will set a NULL value for the int column (ParentId is the 4th parameter here)
}
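Applied to the loop in loadCSV, that check could look roughly like this (a sketch; it assumes every blank cell should become SQL NULL, which may or may not be what you want for the text columns, and the java.sql.Types constant passed to setNull may need to match the column type depending on the driver):
int index = 1;
for (String string : nextLine) {
    date = DateUtil.convertToDate(string);
    if (date != null) {
        ps.setDate(index++, new java.sql.Date(date.getTime()));
    } else if (string == null || string.trim().isEmpty()) {
        // empty cell (e.g. ParentId) -> real SQL NULL instead of ''
        ps.setNull(index++, java.sql.Types.INTEGER);
    } else {
        ps.setString(index++, string);
    }
}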

How to improve the speed of this code?

I'm trying to import all googlebooks-1gram files into a postgresql database. I wrote the following Java code for that:
public class ToPostgres {
public static void main(String[] args) throws Exception {
String filePath = "./";
List<String> files = new ArrayList<String>();
for (int i =0; i < 10; i++) {
files.add(filePath+"googlebooks-eng-all-1gram-20090715-"+i+".csv");
}
Connection c = null;
try {
c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
"postgres", "xxxxxx");
} catch (SQLException e) {
e.printStackTrace();
}
if (c != null) {
try {
PreparedStatement wordInsert = c.prepareStatement(
"INSERT INTO words (word) VALUES (?)", Statement.RETURN_GENERATED_KEYS
);
PreparedStatement countInsert = c.prepareStatement(
"INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
"VALUES (?,?,?,?,?)"
);
String lastWord = "";
Long lastId = -1L;
for (String filename: files) {
BufferedReader input = new BufferedReader(new FileReader(new File(filename)));
String line = "";
while ((line = input.readLine()) != null) {
String[] data = line.split("\t");
Long id = -1L;
if (lastWord.equals(data[0])) {
id = lastId;
} else {
wordInsert.setString(1, data[0]);
wordInsert.executeUpdate();
ResultSet resultSet = wordInsert.getGeneratedKeys();
if (resultSet != null && resultSet.next())
{
id = resultSet.getLong(1);
}
}
countInsert.setLong(1, id);
countInsert.setInt(2, Integer.parseInt(data[1]));
countInsert.setInt(3, Integer.parseInt(data[2]));
countInsert.setInt(4, Integer.parseInt(data[3]));
countInsert.setInt(5, Integer.parseInt(data[4]));
countInsert.executeUpdate();
lastWord = data[0];
lastId = id;
}
}
} catch (SQLException e) {
e.printStackTrace();
}
}
}
}
However, when running this for ~3 hours it only placed 1.000.000 entries in the wordcounts table. When I check the number of lines in the entire 1gram dataset, it's 500.000.000 lines. So importing everything would take about 62.5 days. I could accept it importing in about a week, but 2 months? I think I'm doing something seriously wrong here (I do have a server that runs 24/7, so I can actually run it for this long, but faster would be nice XD)
EDIT: This code is how I solved it:
public class ToPostgres {
public static void main(String[] args) throws Exception {
String filePath = "./";
List<String> files = new ArrayList<String>();
for (int i =0; i < 10; i++) {
files.add(filePath+"googlebooks-eng-all-1gram-20090715-"+i+".csv");
}
Connection c = null;
try {
c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
"postgres", "xxxxxx");
} catch (SQLException e) {
e.printStackTrace();
}
if (c != null) {
c.setAutoCommit(false);
try {
PreparedStatement wordInsert = c.prepareStatement(
"INSERT INTO words (id, word) VALUES (?,?)"
);
PreparedStatement countInsert = c.prepareStatement(
"INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
"VALUES (?,?,?,?,?)"
);
String lastWord = "";
Long id = 0L;
for (String filename: files) {
BufferedReader input = new BufferedReader(new FileReader(new File(filename)));
String line = "";
int i = 0;
while ((line = input.readLine()) != null) {
String[] data = line.split("\t");
if (!lastWord.equals(data[0])) {
id++;
wordInsert.setLong(1, id);
wordInsert.setString(2, data[0]);
wordInsert.executeUpdate();
}
countInsert.setLong(1, id);
countInsert.setInt(2, Integer.parseInt(data[1]));
countInsert.setInt(3, Integer.parseInt(data[2]));
countInsert.setInt(4, Integer.parseInt(data[3]));
countInsert.setInt(5, Integer.parseInt(data[4]));
countInsert.executeUpdate();
lastWord = data[0];
if (i % 10000 == 0) {
c.commit();
}
if (i % 100000 == 0) {
System.out.println(i+" mark file "+filename);
}
i++;
}
c.commit();
}
} catch (SQLException e) {
e.printStackTrace();
}
}
}
}
I reached 1.5 million rows in about 15 minutes now. That's fast enough for me, thanks all!
JDBC connections have autocommit enabled by default, which carries a per-statement overhead. Try disabling it:
c.setAutoCommit(false);
then commit in batches, something along the lines of:
long ops = 0;
for(String filename : files) {
// ...
while ((line = input.readLine()) != null) {
// insert some stuff...
ops ++;
if(ops % 1000 == 0) {
c.commit();
}
}
}
c.commit();
If your table has indexes, it might be faster to delete them, insert the data, and recreate the indexes later.
Setting autocommit off, and doing a manual commit every 10 000 records or so (look into the documentation for a reasonable value - there is some limit), could speed things up as well.
Generating the index/foreign key yourself, and keeping track of it, should be faster than wordInsert.getGeneratedKeys(), but I'm not sure whether it is possible with your content.
There is an approach called 'bulk insert'. I don't remember the details, but it's a starting point for a search.
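For PostgreSQL specifically, the usual bulk path is the COPY command, which the JDBC driver exposes through org.postgresql.copy.CopyManager. A rough sketch, assuming the plain PostgreSQL JDBC driver and a pre-built tab-separated file (the file name and column layout are illustrative, not taken from the question):
import java.io.FileReader;
import java.sql.Connection;
import java.sql.DriverManager;
import org.postgresql.PGConnection;
import org.postgresql.copy.CopyManager;

public class CopyExample {
    public static void main(String[] args) throws Exception {
        try (Connection c = DriverManager.getConnection(
                "jdbc:postgresql://localhost/googlebooks", "postgres", "xxxxxx");
             FileReader reader = new FileReader("words.tsv")) {
            // words.tsv contains one "id<TAB>word" line per row
            CopyManager copyManager = c.unwrap(PGConnection.class).getCopyAPI();
            long rows = copyManager.copyIn("COPY words (id, word) FROM STDIN", reader);
            System.out.println("Copied " + rows + " rows");
        }
    }
}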
Write it to do threading, running 4 threads at the same time, or split it up into sections (read from a config file) and distribute it to X machines and have them get the data together.
Use batch statements to execute multiple inserts at the same time, rather than one INSERT at a time.
In addition, I would remove the part of your algorithm which updates the word count after each insert into the words table; instead, just calculate all of the word counts once inserting the words is complete.
Another approach would be to do bulk inserts rather than single inserts. See this question: What's the fastest way to do a bulk insert into Postgres? for more information.
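As a concrete sketch of the batching suggestion, keeping the variable names from the question and assuming auto-commit has been turned off as in the EDIT above, the per-row countInsert.executeUpdate() in the inner loop can be replaced with addBatch() plus a periodic executeBatch():
int pending = 0;
while ((line = input.readLine()) != null) {
    String[] data = line.split("\t");
    // ... resolve id for data[0] exactly as before ...
    countInsert.setLong(1, id);
    countInsert.setInt(2, Integer.parseInt(data[1]));
    countInsert.setInt(3, Integer.parseInt(data[2]));
    countInsert.setInt(4, Integer.parseInt(data[3]));
    countInsert.setInt(5, Integer.parseInt(data[4]));
    countInsert.addBatch();
    if (++pending % 1000 == 0) {
        countInsert.executeBatch(); // send 1000 inserts in one round trip
        c.commit();
    }
}
countInsert.executeBatch(); // flush the remainder
c.commit();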
Create threads
String lastWord = "";
Long lastId = -1L;
PreparedStatement wordInsert;
PreparedStatement countInsert ;
public class ToPostgres {
public void main(String[] args) throws Exception {
String filePath = "./";
List<String> files = new ArrayList<String>();
for (int i =0; i < 10; i++) {
files.add(filePath+"googlebooks-eng-all-1gram-20090715-"+i+".csv");
}
Connection c = null;
try {
c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
"postgres", "xxxxxx");
} catch (SQLException e) {
e.printStackTrace();
}
if (c != null) {
try {
wordInsert = c.prepareStatement(
"INSERT INTO words (word) VALUES (?)", Statement.RETURN_GENERATED_KEYS
);
countInsert = c.prepareStatement(
"INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
"VALUES (?,?,?,?,?)"
);
for (String filename: files) {
new MyThread(filename). start();
}
} catch (SQLException e) {
e.printStackTrace();
}
}
}
}
class MyThread extends Thread{
String file;
public MyThread(String file) {
this.file = file;
}
@Override
public void run() {
try {
super.run();
BufferedReader input = new BufferedReader(new FileReader(new File(file)));
String line = "";
while ((line = input.readLine()) != null) {
String[] data = line.split("\t");
Long id = -1L;
if (lastWord.equals(data[0])) {
id = lastId;
} else {
wordInsert.setString(1, data[0]);
wordInsert.executeUpdate();
ResultSet resultSet = wordInsert.getGeneratedKeys();
if (resultSet != null && resultSet.next())
{
id = resultSet.getLong(1);
}
}
countInsert.setLong(1, id);
countInsert.setInt(2, Integer.parseInt(data[1]));
countInsert.setInt(3, Integer.parseInt(data[2]));
countInsert.setInt(4, Integer.parseInt(data[3]));
countInsert.setInt(5, Integer.parseInt(data[4]));
countInsert.executeUpdate();
lastWord = data[0];
lastId = id;
}
} catch (NumberFormatException e) {
e.printStackTrace();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (SQLException e) {
e.printStackTrace();
}
}
}
