Unable to convert the CLOB data into String - java

I am trying to convert java.sql.Clob data into a String using the getSubString method (which gave the best performance of the approaches I compared). The CLOB data is around 32 MB or larger. From what I observe, getSubString can return at most 33,554,342 bytes.
If the CLOB data exceeds 33,554,342 bytes, it throws the SQL exception below:
ORA-24817: Unable to allocate the given chunk for current lob operation
EDIT
CODE:
public static void main(String[] args) throws SQLException {
    Main main = new Main();
    Connection con = main.getConnection();
    if (con == null) {
        return;
    }
    PreparedStatement pstmt = null;
    ResultSet rs = null;
    String sql = "SELECT Table_ID,CLOB_FILE FROM TableName WHERE SOMECONDITION ";
    String table_Id = null;
    String directClobInStr = null;
    CLOB clobObj = null;
    String clobStr = null;
    Object obj = null;
    try {
        pstmt = con.prepareStatement(sql);
        rs = pstmt.executeQuery();
        while (rs.next()) {
            table_Id = rs.getString("Table_ID");
            directClobInStr = rs.getString("clob_FILE");
            obj = rs.getObject("CLOB_FILE");
            clobObj = (CLOB) obj;
            System.out.println("Table id " + table_Id);
            System.out.println("directClobInStr " + directClobInStr);
            clobStr = clobObj.getSubString(1L, (int) clobObj.length()); // 33554342
            System.out.println("clobDataStr = " + clobStr);
        }
    }
    catch (SQLException e) {
        e.printStackTrace();
        return;
    }
    catch (Exception e) {
        e.printStackTrace();
        return;
    }
    finally {
        try {
            rs.close();
            pstmt.close();
            con.close();
        }
        catch (Exception e) {
            System.out.println(e.getMessage());
        }
    }
}
NOTE: here obj = rs.getObject("CLOB_FILE"); works, but it is not what I want, because I receive the ResultSet value from elsewhere as an Object. I have to cast that Object and read the data from the CLOB.
Any idea how to achieve this?

Instead:
clobStr = clobObj.getSubString(1L, (int)clobObj.length() );
Try something like:
int toread = (int) clobObj.length();
int read = 0;
final int block_size = 8 * 1024 * 1024;
StringBuilder str = new StringBuilder(toread);
while (toread > 0) {
    int current_block = Math.min(toread, block_size);
    str.append(clobObj.getSubString(read + 1, current_block));
    read += current_block;
    toread -= current_block;
}
clobStr = str.toString();
It extracts substrings in a loop, 8 MB per iteration.
But remember that, as far as I know, Java Strings are limited to about 2 GB (2^31 - 1 characters, which is why read is declared as int instead of long), while Oracle CLOBs are limited to 128 TB.
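If asking for one huge allocation is the problem, you can also read the CLOB through its character stream, which pulls the data in small chunks. A minimal sketch against the standard java.sql.Clob API (the 8 KB buffer size is an arbitrary choice):
import java.io.Reader;
import java.io.StringWriter;
import java.sql.Clob;

// Sketch: stream the CLOB contents instead of taking one big substring.
static String clobToString(Clob clob) throws Exception {
    StringWriter out = new StringWriter();
    char[] buffer = new char[8192];           // arbitrary chunk size
    try (Reader in = clob.getCharacterStream()) {
        int n;
        while ((n = in.read(buffer)) != -1) { // read until end of stream
            out.write(buffer, 0, n);
        }
    }
    return out.toString();
}
This also works when all you have is a java.sql.Clob reference, without casting to oracle.sql.CLOB.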

Related

How to deal with multiple queries in a java servlet

I'm trying to track a view count for a picture (incremented by one every time a new session hits the picture). I'm getting the following error message even though I check for an empty result set. My thought process is to select the pictureNo that was requested: if it can't be found, insert the first count into the table, and if it can, update it.
Error message:
"SQLException: Illegal operation on empty result set."
Code to increase the count for the session
HttpSession session = request.getSession();
Integer counter = (Integer) session.getAttribute("counter");
String accCount = (String) session.getAttribute("attributeKey");
String url = "http://localhost:8080/techfore";
String encodedURL = url + ";jsessionid=" + request.getSession().getId();
if (accCount == null || encodedURL == null) { // New session?
    response.sendRedirect("/techfore/WelcomePage");
}
else {
    if (counter == 0) {
        counter = new Integer(counter.intValue() + 1);
        session.setAttribute("counter", counter);
    }
}
Utilities.initalCount(out, pictureName, counter);
Code to run the queries
public static void initalCount(PrintWriter out, String pictureName, int accessCount) {
    Connection con = null;
    try { // Connect to the database
        con = openConnection(out);
    }
    catch (Exception e) { // Failed to open the connection
        out.println("<P>" + e.getMessage());
    }
    try {
        Statement stmt = con.createStatement();
        String query0;
        ResultSet rs1;
        query0 = "SELECT PictureNo FROM Statistics WHERE PictureNo = (SELECT PictureNo FROM Pictures WHERE ShortName = '" + pictureName + "')";
        rs1 = stmt.executeQuery(query0);
        if (rs1.next()) {
            // yes, it exists
            String description = rs1.getString("Description");
            int pictureNo = rs1.getInt("PictureNo");
            IncreaseCount(out, pictureNo, accessCount);
        }
        else {
            // if rs is empty, insert
            int pictureNo = rs1.getInt("PictureNo");
            AddCount(out, pictureNo, accessCount);
        }
        stmt.close();
    }
    catch (SQLException ex) {
        out.println("<P>SQLException: " + ex.getMessage());
    }
}
public static void AddCount(PrintWriter out, int pictureNo, int accessCount) {
    Connection con = null;
    try { // Connect to the database
        con = openConnection(out);
    }
    catch (Exception e) { // Failed to open the connection
        out.println("<P>" + e.getMessage());
        return;
    }
    try {
        Statement stmt = con.createStatement();
        String query;
        ResultSet rs1;
        query = "INSERT INTO Statistics VALUES " + pictureNo + "," + accessCount + " ";
        stmt.executeUpdate(query);
        stmt.close();
    }
    catch (SQLException ex) {
        out.println("<P>SQLException: " + ex.getMessage());
    }
}
public static void IncreaseCount(PrintWriter out, int pictureNo, int accessCount) {
    Connection con = null;
    try { // Connect to the database
        con = openConnection(out);
    }
    catch (Exception e) { // Failed to open the connection
        out.println("<P>" + e.getMessage());
        return;
    }
    try {
        Statement stmt = con.createStatement();
        String query;
        ResultSet rs1;
        query = "UPDATE Statistics SET AccessCount = " + accessCount + " + 1 WHERE PictureNo = " + pictureNo + "";
        stmt.executeUpdate(query);
        stmt.close();
    }
    catch (SQLException ex) {
        out.println("<P>SQLException: " + ex.getMessage());
    }
}
New insert
query="INSERT INTO Statistics VALUES (SELECT PictureNo FROM Pictures WHERE FileName = '"+pictureName+"'),"+accessCount+" ";

Iterate over result set keys - is it possible?

I have a method that takes a database connection, query, and parameters, and turns that query into a ResultSet object. This is great, but the problem is that to get each value out of the result set I have to write one line of code for every column I am pulling, just to save it in a JSON container. Is there a way to do this systematically, so I can automatically detect the data type and create the JSON object based upon the keys fetched from the result set without manually specifying the keys?
public static JSONArray q2rs2j(Connection connection, String query, List<String> params) throws Exception {
    JSONArray tContainer = new JSONArray();
    PreparedStatement pStatement = connection.prepareStatement(query);
    int pit = 1;
    if (params != null) {
        for (String param : params) {
            try {
                double paramAsDouble = Double.parseDouble(param);
                try {
                    int paramAsInt = Integer.parseInt(param);
                    pStatement.setInt(pit, paramAsInt);
                } catch (NumberFormatException e) {
                    pStatement.setDouble(pit, paramAsDouble);
                }
            } catch (NumberFormatException e) {
                pStatement.setString(pit, param);
            }
            pit++;
        }
    }
    ResultSet resultSet = pStatement.executeQuery();
    try {
        while (resultSet.next()) {
            // Iterate through KEYS in the resultSet.next() row
            while (hasKey) {
                // Store key name and key value in variables - todo: determine data type via try parsing as int, double, etc.
                String thisKeyName = (nextKeyName);
                String thisKeyValue = (nextKeyValue);
                JSONObject tObject = new JSONObject();
                tObject.put(nextKeyName, nextKeyValue);
            }
            tContainer.put(tObject);
        }
        resultSet.close();
    } catch (Exception e) { e.printStackTrace(); }
    return tContainer;
}
ResultSetMetaData provides SQL types and Java class names.
try (ResultSet resultSet = pStatement.executeQuery()) {
    ResultSetMetaData meta = resultSet.getMetaData();
    int ncols = meta.getColumnCount();
    while (resultSet.next()) {
        JSONObject tObject = new JSONObject();
        for (int colno = 1; colno <= ncols; ++colno) {
            String label = meta.getColumnLabel(colno);      // key (respects SQL column aliases)
            String sqlType = meta.getColumnTypeName(colno); // e.g. "VARCHAR", "INTEGER"
            // meta.getColumnClassName(colno) is also available, e.g. "java.lang.String"
            if (sqlType.contains("CHAR")) {
                tObject.put(label, resultSet.getString(colno));
            } else if (sqlType.contains("INT")) {
                tObject.put(label, resultSet.getInt(colno));
            } else {
                tObject.put(label, resultSet.getObject(colno));
            }
        }
        tContainer.put(tObject);
    }
}
Using try-with-resources gives you automatic closing (useful for Connection, Statement, and ResultSet), even on return, break, or a thrown exception.
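A hypothetical call site, just to show the resulting shape (table and column names are made up):
// Assumes a table people(name VARCHAR, age INT).
JSONArray rows = q2rs2j(connection,
        "SELECT name, age FROM people WHERE age > ?",
        java.util.Collections.singletonList("30"));
System.out.println(rows.toString(2)); // e.g. [{"name": "Ada", "age": 36}, ...]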

How to generalize resultset queries?

At the moment I'm working on a script that reads several values from different tables of one database. Every time I run a query, I have to open a statement and create a new result set, which leads to horrible, repetitive code. What would be a good way of generalizing this, and how could it be done?
Here are some elements from my code. At the moment there's just one statement, and the closing logic still has to be inserted, which is one of the primary reasons I'm asking this question.
public static void main(String[] args) throws Exception
{
    Connection c = null;
    Statement stmt = null;
    try
    {
        // set up database connection
        Class.forName("org.sqlite.JDBC");
        c = DriverManager.getConnection("jdbc:sqlite:/nfs/home/mals/p/pu2002/workspace/Database2");
        c.setAutoCommit(false);
        stmt = c.createStatement();
        // end
        // get task id to work with
        String Task_id = null;
        if (args.length != 0) // if an argument was passed, Task_id will be the first element of the array args (arguments)
        {
            Task_id = args[0];
        }
        else if (args.length == 0) // if no arguments were passed, the highest number in the column id from tasks_task will be selected and set as Task_id
        {
            ResultSet TTask_id = stmt.executeQuery("SELECT max(id) FROM tasks_task");
            int t_id = TTask_id.getInt(1);
            Task_id = String.valueOf(t_id);
            TTask_id.close();
        }
        // end
        // get solution IDs from task ids
        ArrayList<Integer> List_solIDs = new ArrayList<Integer>(); // create an empty array list
        ResultSet SSolution_task_id = stmt.executeQuery("SELECT id FROM solutions_solution WHERE task_id =" + Task_id + " AND final = 1;"); // SQLite3 SELECT expression; task IDs change per task - "SELECT * FROM solutions_solution where task_id ="+Task_id +";"
        while (SSolution_task_id.next()) // loops through all elements of SSolution_task_id
        {
            List_solIDs.add(SSolution_task_id.getInt("id")); // adds all elements of the result set SSolution_task_id to the list List_solIDs
        }
        SSolution_task_id.close();
        // end
        // get logs according to content type
        int count = List_solIDs.size();
        String log_javaBuilder = null;
        List<String> log_JunitChecker = new ArrayList<String>();
        for (int i = 0; i < count; i++)
        {
            boolean sol_id_valid = false;
            String solID = String.valueOf(List_solIDs.get(i));
            try
            {
                ResultSet AAttestation_sol_id = stmt.executeQuery("SELECT * FROM attestation_attestation WHERE solution_id =" + solID + ";");
                int Returned = AAttestation_sol_id.getInt("final_grade_id");
            }
            catch (Exception e)
            {
                sol_id_valid = true;
            }
            if (sol_id_valid == true)
            {
                try
                {
                    ResultSet CCresult_javaBuilder = stmt.executeQuery("SELECT log FROM checker_checkerresult WHERE solution_id = " + solID + " AND content_type_id = 22;"); // "SELECT id FROM checker_checkerresult where solution_id = " +List_solIDs.get(i)+ ";"
                    log_javaBuilder = CCresult_javaBuilder.getString("log");
                    CCresult_javaBuilder.close();
                    ResultSet CCresult_Junit_checker = stmt.executeQuery("SELECT log FROM checker_checkerresult WHERE solution_id = " + solID + " AND content_type_id = 24;");
                    while (CCresult_Junit_checker.next())
                    {
                        log_JunitChecker.add(CCresult_Junit_checker.getString("log"));
                    }
                    CCresult_Junit_checker.close();
                }
                catch (Exception e)
                {
                    log_JunitChecker.add(null);
                }
                // end
All types of potential improvements will be welcome.
P.S.: Tried googling.
It seems you want to look at using some ORM layer, e.g. http://hibernate.org/orm/
What you're looking for is probably a higher-level layer which abstracts you from the underlying lower-level JDBC style of coding.
Rather than writing a generic method yourself, it is usually better to use a framework. There are many JPA implementations out there which solve not only this issue but also take care of much of the persistence-layer boilerplate. Start with JPA from Here. You can also use the Spring JDBC template to solve the problem mentioned above: Spring JDBC Documentation.
Now, if you really don't want any framework dependency and want to finish this code quickly, you can define your own JdbcTemplate-style class which takes a query and a parameter map and returns a ResultSet. This class can handle opening the connection, executing the query, closing the connection, and so on.
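A minimal sketch of such a helper (hypothetical, not Spring's JdbcTemplate; assumes Java 8+ for the lambda-friendly row mapper):
import java.sql.*;
import java.util.*;

// The caller supplies SQL, parameters, and a row-mapping function; the helper
// owns statement creation, parameter binding, iteration, and closing.
public final class SimpleJdbcTemplate {
    @FunctionalInterface
    public interface RowMapper<T> {
        T map(ResultSet rs) throws SQLException;
    }

    private final Connection conn;

    public SimpleJdbcTemplate(Connection conn) {
        this.conn = conn;
    }

    public <T> List<T> query(String sql, List<Object> params, RowMapper<T> mapper) throws SQLException {
        try (PreparedStatement ps = conn.prepareStatement(sql)) {
            if (params != null) {
                for (int i = 0; i < params.size(); i++) {
                    ps.setObject(i + 1, params.get(i)); // JDBC parameters are 1-based
                }
            }
            try (ResultSet rs = ps.executeQuery()) {
                List<T> out = new ArrayList<T>();
                while (rs.next()) {
                    out.add(mapper.map(rs));
                }
                return out;
            }
        }
    }
}
With something like this, the solution-ID lookup in the question shrinks to one line: List<Integer> ids = tmpl.query("SELECT id FROM solutions_solution WHERE task_id = ? AND final = 1", Arrays.<Object>asList(Task_id), rs -> rs.getInt("id"));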
What if you try to use generics on methods? This is a quick example, just for illustration; you should improve on all of this :)
resource: official docs
public static <T> List<T> getSingleValueList(ResultSet rs, Class<T> clazz, String colName) throws Exception {
    ArrayList<T> list = new ArrayList<T>();
    while (rs.next()) { // loops through all rows of the result set
        list.add((T) rs.getObject(colName)); // adds every value of column colName to the list
    }
    rs.close();
    return list;
}

public static <T> T getSingleValue(ResultSet rs, Class<T> clazz, String colName) throws Exception {
    try {
        if (rs.next()) { // take the first row, if there is one
            return (T) rs.getObject(colName);
        } else {
            throw new Exception("no value found.");
        }
    } finally {
        rs.close();
    }
}
public static void main(String[] args) throws Exception {
    Connection c = null;
    Statement stmt = null;
    try {
        // set up database connection
        Class.forName("org.sqlite.JDBC");
        c = DriverManager.getConnection("jdbc:sqlite:/nfs/home/mals/p/pu2002/workspace/Database2");
        c.setAutoCommit(false);
        stmt = c.createStatement();
        // end
        // get task id to work with
        String Task_id = null;
        if (args.length != 0) // if an argument was passed, Task_id will be the first element of the array args (arguments)
        {
            Task_id = args[0];
        } else if (args.length == 0) // if no arguments were passed, the highest number in the column id from tasks_task will be selected and set as Task_id
        {
            ResultSet TTask_id = stmt.executeQuery("SELECT max(id) FROM tasks_task");
            int t_id = TTask_id.getInt(1);
            Task_id = String.valueOf(t_id);
            TTask_id.close();
        }
        // end
        // get solution IDs from task ids
        ResultSet SSolution_task_id = stmt.executeQuery("SELECT id FROM solutions_solution WHERE task_id =" + Task_id + " AND final = 1;"); // SQLite3 SELECT expression; task IDs change per task - "SELECT * FROM solutions_solution where task_id ="+Task_id +";"
        List<Integer> List_solIDs = getSingleValueList(SSolution_task_id, Integer.class, "id");
        // end
        // get logs according to content type
        int count = List_solIDs.size();
        String log_javaBuilder = null;
        List<String> log_JunitChecker = new ArrayList<String>();
        List<String> tmplog_JunitChecker;
        for (int i = 0; i < count; i++) {
            boolean sol_id_valid = false;
            String solID = String.valueOf(List_solIDs.get(i));
            try {
                ResultSet AAttestation_sol_id = stmt.executeQuery("SELECT * FROM attestation_attestation WHERE solution_id =" + solID + ";");
                Integer Returned = getSingleValue(AAttestation_sol_id, Integer.class, "final_grade_id");
            } catch (Exception e) {
                sol_id_valid = true;
            }
            if (sol_id_valid == true) {
                try {
                    ResultSet CCresult_javaBuilder = stmt.executeQuery("SELECT log FROM checker_checkerresult WHERE solution_id = " + solID + " AND content_type_id = 22;");
                    log_javaBuilder = getSingleValue(CCresult_javaBuilder, String.class, "log");
                    ResultSet CCresult_Junit_checker = stmt.executeQuery("SELECT log FROM checker_checkerresult WHERE solution_id = " + solID + " AND content_type_id = 24;");
                    tmplog_JunitChecker = getSingleValueList(CCresult_Junit_checker, String.class, "log");
                    log_JunitChecker.addAll(tmplog_JunitChecker);
                } catch (Exception e) {
                    log_JunitChecker.add(null);
                }
                // end
            }
        }
    } catch (Exception eeee) {
        // handle it
    }
}
I hope that sheds some light.
Anyway, in almost all cases frameworks help a lot.

Java and SQL query: slowness

I'm using a Java applet to populate a database.
I work with JDK 1.7 and XAMPP 3.2.1 (32-bit) on an HP i3 with 6 GB of RAM.
My database is big and has this structure:
[TABLE] attributi   rows: 245         InnoDB  16 KiB
[TABLE] dati        rows: 1,938,620   InnoDB  167.7 MiB
[TABLE] minilinks   rows: 12,670,740  InnoDB  625 MiB
[TABLE] personaggi  rows: 196,046     InnoDB  8.5 MiB
[TABLE] relazioni   rows: 0           InnoDB  16 KiB
I have to fill table 'relazioni'. This is the code:
public class PageLink {
    private static boolean cleanRelazioni = true;
    private static boolean relazioni = true;

    static public void main(String argv[]) throws InterruptedException, IOException {
        String intestazione = "################################# Progetto Parser/Scanner XML Wikipedia #############################################################\n\n";
        System.out.println(intestazione);
        PrintWriter outRel = null;
        // connect to the database
        Database db = new Database("wikipages", "", "root", "localhost");
        if (db.connect()) {
            System.out.println("Connessione al Database ESEGUITA.");
        }
        if (cleanRelazioni) {
            db.cleanRelazioni();
        }
        long startTime = System.currentTimeMillis();
        if (relazioni) {
            int cont = 0;
            System.out.println("Relazioni delle sole Biografie");
            outRel = new PrintWriter(new FileWriter("fileTest/outputRelazioniPageToPage.txt"));
            try {
                outRel.println("Relazioni delle sole Biografie");
            }
            catch (Exception e) {
                e.printStackTrace();
            }
            // pull out the current first name + last name
            List<Integer> listaPersonaggi = db.getIDPersonaggi();
            for (int id : listaPersonaggi) {
                System.out.println("\nIl Personaggio ");
                db.printPersonaggio(id);
                System.out.println("E' correlato con ");
                cont = db.checkPageToPage(id);
                System.out.println("=====>" + cont + "\n\n");
                try {
                    outRel.flush();
                    outRel.println(id + " è correlato con " + cont + " personaggi.");
                    outRel.println("");
                }
                catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        long endTime = System.currentTimeMillis();
        System.out.println("Tempo Esecuzione: " + (endTime - startTime) + " millisecondi => secondi " + (endTime - startTime) / 1000);
        System.out.println("FINE PROGRAMMA.");
        db.disconnect();
    }
}
These are the methods of the Database class:
public int checkPageToPage(int id) {
    List<Integer> listaPersonaggiCorrelati = new ArrayList<Integer>();
    listaPersonaggiCorrelati.add(id);
    Statement stmt;
    ResultSet rs;
    String query = null;
    int conteggio = 0;
    String personaggio = new String("");
    try {
        stmt = conn.createStatement();
        query = "SELECT pl_title FROM minilinks WHERE pl_from=" + id + "";
        rs = stmt.executeQuery(query);
        while (rs.next()) {
            personaggio = rs.getString("pl_title").replace("_", " ").replace("\"", "");
            if (checkExistPersonaggio(personaggio) != -1) {
                listaPersonaggiCorrelati.add(checkExistPersonaggio(personaggio));
                System.out.print("personaggio correlato => " + personaggio + "\n");
                conteggio++;
            }
        }
    } catch (SQLException e) {
        e.printStackTrace();
    }
    if (conteggio > 0)
        insertRelazioni(listaPersonaggiCorrelati);
    return conteggio;
}
public void insertRelazioni(List<Integer> listaPersonaggiCorrelati) {
    int id_personaggio1;
    int id_personaggio2;
    try {
        for (int i = 0; i <= listaPersonaggiCorrelati.size() - 1; i++) {
            for (int j = 0; j <= listaPersonaggiCorrelati.size() - 1; j++) {
                int tot_relazioni = 0;
                id_personaggio1 = listaPersonaggiCorrelati.get(i);
                id_personaggio2 = listaPersonaggiCorrelati.get(j);
                if (id_personaggio1 != id_personaggio2) {
                    //System.out.println(id_personaggio1+" e "+id_personaggio2);
                    tot_relazioni = checkExistRelazione(id_personaggio1, id_personaggio2);
                    if (tot_relazioni == -1) {
                        //System.out.println("nuova relazione tra "+id_personaggio1+" e "+id_personaggio2);
                        executeUpdate("INSERT into relazioni (id_personaggio, id_personaggio_correlato, tot) VALUES(" + id_personaggio1 + ", " + id_personaggio2 + ", '1')");
                    }
                    else {
                        //System.out.println("aggiorno la relazione tra "+id_personaggio1+" e "+id_personaggio2);
                        tot_relazioni++;
                        executeUpdate("UPDATE relazioni SET tot=" + tot_relazioni + " WHERE id_personaggio = " + id_personaggio1 + " and id_personaggio_correlato = " + id_personaggio2 + "");
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        e.getMessage();
    }
}
My problem: the insert queries are very, very slow; inserting 4,000 rows takes 30 minutes.
Is there anything I can do to fix this? I'm curious what causes it and why I'm having trouble.
Maybe I should use the MySQL 64-bit version?
Since minilinks is the biggest table and you select
"SELECT pl_title FROM minilinks WHERE pl_from="+id+"";
do you have an index on the pl_from column?
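If not, adding one is plain DDL; a sketch from Java (the index name is hypothetical):
// One-off DDL: index the column that checkPageToPage filters on.
try (Statement s = conn.createStatement()) {
    s.executeUpdate("CREATE INDEX idx_minilinks_pl_from ON minilinks (pl_from)");
}
Without an index, each of those SELECTs likely scans all 12.6 million minilinks rows. The same reasoning applies to whatever checkExistPersonaggio and checkExistRelazione query, and note that checkPageToPage calls checkExistPersonaggio twice per row; caching the first result in a local variable halves those lookups.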

How to improve the speed of this code?

I'm trying to import all googlebooks-1gram files into a postgresql database. I wrote the following Java code for that:
public class ToPostgres {
    public static void main(String[] args) throws Exception {
        String filePath = "./";
        List<String> files = new ArrayList<String>();
        for (int i = 0; i < 10; i++) {
            files.add(filePath + "googlebooks-eng-all-1gram-20090715-" + i + ".csv");
        }
        Connection c = null;
        try {
            c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
                    "postgres", "xxxxxx");
        } catch (SQLException e) {
            e.printStackTrace();
        }
        if (c != null) {
            try {
                PreparedStatement wordInsert = c.prepareStatement(
                    "INSERT INTO words (word) VALUES (?)", Statement.RETURN_GENERATED_KEYS
                );
                PreparedStatement countInsert = c.prepareStatement(
                    "INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
                    "VALUES (?,?,?,?,?)"
                );
                String lastWord = "";
                Long lastId = -1L;
                for (String filename : files) {
                    BufferedReader input = new BufferedReader(new FileReader(new File(filename)));
                    String line = "";
                    while ((line = input.readLine()) != null) {
                        String[] data = line.split("\t");
                        Long id = -1L;
                        if (lastWord.equals(data[0])) {
                            id = lastId;
                        } else {
                            wordInsert.setString(1, data[0]);
                            wordInsert.executeUpdate();
                            ResultSet resultSet = wordInsert.getGeneratedKeys();
                            if (resultSet != null && resultSet.next()) {
                                id = resultSet.getLong(1);
                            }
                        }
                        countInsert.setLong(1, id);
                        countInsert.setInt(2, Integer.parseInt(data[1]));
                        countInsert.setInt(3, Integer.parseInt(data[2]));
                        countInsert.setInt(4, Integer.parseInt(data[3]));
                        countInsert.setInt(5, Integer.parseInt(data[4]));
                        countInsert.executeUpdate();
                        lastWord = data[0];
                        lastId = id;
                    }
                }
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
}
However, after running this for ~3 hours it had only placed 1,000,000 entries in the wordcounts table. The entire 1-gram dataset is 500,000,000 lines, so importing everything would take about 62.5 days. I could accept about a week, but 2 months? I think I'm doing something seriously wrong here (I do have a server that runs 24/7, so I can actually run it that long, but faster would be nice XD).
EDIT: This code is how I solved it:
public class ToPostgres {
    public static void main(String[] args) throws Exception {
        String filePath = "./";
        List<String> files = new ArrayList<String>();
        for (int i = 0; i < 10; i++) {
            files.add(filePath + "googlebooks-eng-all-1gram-20090715-" + i + ".csv");
        }
        Connection c = null;
        try {
            c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
                    "postgres", "xxxxxx");
        } catch (SQLException e) {
            e.printStackTrace();
        }
        if (c != null) {
            c.setAutoCommit(false);
            try {
                PreparedStatement wordInsert = c.prepareStatement(
                    "INSERT INTO words (id, word) VALUES (?,?)"
                );
                PreparedStatement countInsert = c.prepareStatement(
                    "INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
                    "VALUES (?,?,?,?,?)"
                );
                String lastWord = "";
                Long id = 0L;
                for (String filename : files) {
                    BufferedReader input = new BufferedReader(new FileReader(new File(filename)));
                    String line = "";
                    int i = 0;
                    while ((line = input.readLine()) != null) {
                        String[] data = line.split("\t");
                        if (!lastWord.equals(data[0])) {
                            id++;
                            wordInsert.setLong(1, id);
                            wordInsert.setString(2, data[0]);
                            wordInsert.executeUpdate();
                        }
                        countInsert.setLong(1, id);
                        countInsert.setInt(2, Integer.parseInt(data[1]));
                        countInsert.setInt(3, Integer.parseInt(data[2]));
                        countInsert.setInt(4, Integer.parseInt(data[3]));
                        countInsert.setInt(5, Integer.parseInt(data[4]));
                        countInsert.executeUpdate();
                        lastWord = data[0];
                        if (i % 10000 == 0) {
                            c.commit();
                        }
                        if (i % 100000 == 0) {
                            System.out.println(i + " mark file " + filename);
                        }
                        i++;
                    }
                    c.commit();
                }
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
}
I reached 1.5 million rows in about 15 minutes now. That's fast enough for me, thanks all!
JDBC connections have autocommit enabled by default, which carries a per-statement overhead. Try disabling it:
c.setAutoCommit(false)
then commit in batches, something along the lines of:
long ops = 0;
for (String filename : files) {
    // ...
    while ((line = input.readLine()) != null) {
        // insert some stuff...
        ops++;
        if (ops % 1000 == 0) {
            c.commit();
        }
    }
}
c.commit();
If your table has indexes, it might be faster to delete them, insert the data, and recreate the indexes later.
Setting autocommit off and doing a manual commit every 10,000 records or so (look into the documentation for a reasonable value; there is some limit) could speed it up as well.
Generating the index/foreign key yourself and keeping track of it should be faster than wordInsert.getGeneratedKeys(), but I'm not sure whether that is possible with your content.
There is an approach called 'bulk insert'. I don't remember the details, but it's a starting point for a search.
Write it to use threading, running 4 threads at the same time, or split the work into sections (read from a config file) and distribute it to X machines and have them pull the data together.
Use batch statements to execute multiple inserts at the same time, rather than one INSERT at a time.
In addition, I would remove the part of your algorithm which updates the word count after each insert into the words table, and instead calculate all of the word counts once inserting the words is complete.
Another approach would be to do bulk inserts rather than single inserts. See this question: What's the fastest way to do a bulk insert into Postgres? for more information.
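A minimal sketch of the batch idea with plain JDBC addBatch/executeBatch, reusing the ops counter from the snippet above (the 1,000-row flush interval is an arbitrary choice, and autocommit is assumed off as in the accepted fix):
// Queue inserts client-side, then send them to the server in one round trip.
countInsert.setLong(1, id);
countInsert.setInt(2, Integer.parseInt(data[1]));
countInsert.setInt(3, Integer.parseInt(data[2]));
countInsert.setInt(4, Integer.parseInt(data[3]));
countInsert.setInt(5, Integer.parseInt(data[4]));
countInsert.addBatch();                // queue instead of executeUpdate()
if (++ops % 1000 == 0) {
    countInsert.executeBatch();        // flush the queued inserts
    c.commit();
}
// ... and after the loop:
countInsert.executeBatch();
c.commit();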
Create threads
String lastWord = "";
Long lastId = -1L;
PreparedStatement wordInsert;
PreparedStatement countInsert;

public class ToPostgres {
    public void main(String[] args) throws Exception {
        String filePath = "./";
        List<String> files = new ArrayList<String>();
        for (int i = 0; i < 10; i++) {
            files.add(filePath + "googlebooks-eng-all-1gram-20090715-" + i + ".csv");
        }
        Connection c = null;
        try {
            c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
                    "postgres", "xxxxxx");
        } catch (SQLException e) {
            e.printStackTrace();
        }
        if (c != null) {
            try {
                wordInsert = c.prepareStatement(
                    "INSERT INTO words (word) VALUES (?)", Statement.RETURN_GENERATED_KEYS
                );
                countInsert = c.prepareStatement(
                    "INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
                    "VALUES (?,?,?,?,?)"
                );
                for (String filename : files) {
                    new MyThread(filename).start();
                }
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
}
class MyThread extends Thread {
    String file;

    public MyThread(String file) {
        this.file = file;
    }

    @Override
    public void run() {
        try {
            super.run();
            BufferedReader input = new BufferedReader(new FileReader(new File(file)));
            String line = "";
            while ((line = input.readLine()) != null) {
                String[] data = line.split("\t");
                Long id = -1L;
                if (lastWord.equals(data[0])) {
                    id = lastId;
                } else {
                    wordInsert.setString(1, data[0]);
                    wordInsert.executeUpdate();
                    ResultSet resultSet = wordInsert.getGeneratedKeys();
                    if (resultSet != null && resultSet.next()) {
                        id = resultSet.getLong(1);
                    }
                }
                countInsert.setLong(1, id);
                countInsert.setInt(2, Integer.parseInt(data[1]));
                countInsert.setInt(3, Integer.parseInt(data[2]));
                countInsert.setInt(4, Integer.parseInt(data[3]));
                countInsert.setInt(5, Integer.parseInt(data[4]));
                countInsert.executeUpdate();
                lastWord = data[0];
                lastId = id;
            }
        } catch (NumberFormatException e) {
            e.printStackTrace();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
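One caveat with the threaded sketch above: java.sql.Connection and PreparedStatement instances are not guaranteed to be thread-safe, and the shared lastWord/lastId fields will race, so each thread should own its connection and statements. A hedged variant (connection URL reused from the question):
class ImportThread extends Thread {
    private final String file;

    ImportThread(String file) {
        this.file = file;
    }

    @Override
    public void run() {
        // Each thread owns its own Connection and PreparedStatements.
        try (Connection c = DriverManager.getConnection(
                "jdbc:postgresql://localhost/googlebooks", "postgres", "xxxxxx");
             PreparedStatement wordInsert = c.prepareStatement(
                "INSERT INTO words (word) VALUES (?)", Statement.RETURN_GENERATED_KEYS)) {
            c.setAutoCommit(false);
            // ... read `file` and insert as in the single-threaded version ...
            c.commit();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}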
