Creating large CSV files in Java getting really slow

I have a performance problem when trying to create a CSV file starting from another CSV file.
This is how the original file looks:
country,state,co,olt,olu,splitter,ont,cpe,cpe.latitude,cpe.longitude,cpe.customer_class,cpe.phone,cpe.ip,cpe.subscriber_id
COUNTRY-0001,STATE-0001,CO-0001,OLT-0001,OLU0001,SPLITTER-0001,ONT-0001,CPE-0001,28.21487,77.451775,ALL,SIP:+674100002743#IMS.COMCAST.NET,SIP:E28EDADA06B2#IMS.COMCAST.NET,CPE_SUBSCRIBER_ID-QHLHW4
COUNTRY-0001,STATE-0002,CO-0002,OLT-0002,OLU0002,SPLITTER-0002,ONT-0002,CPE-0002,28.294018,77.068924,ALL,SIP:+796107443092#IMS.COMCAST.NET,SIP:58DD999D6466#IMS.COMCAST.NET,CPE_SUBSCRIBER_ID-AH8NJQ
Potentially it could be millions of lines like this; I have detected the problem with 1,280,000 lines.
This is the algorithm:
File csvInputFile = new File(csv_path);
int blockSize = 409600;
FileReader frCsvInputFile = new FileReader(csvInputFile);
BufferedReader brCsvInputFile = new BufferedReader(frCsvInputFile, blockSize);
String line = null;
StringBuilder sbIntermediate = new StringBuilder();
skipFirstLine(brCsvInputFile);
while ((line = brCsvInputFile.readLine()) != null) {
createIntermediateStringBuffer(sbIntermediate, line.split(REGEX_COMMA));
}
private static void skipFirstLine(BufferedReader br) throws IOException {
String line = br.readLine();
String[] splitLine = line.split(REGEX_COMMA);
LOGGER.debug("First line detected! ");
createIndex(splitLine);
createIntermediateIndex(splitLine);
}
private static void createIndex(String[] splitLine) {
LOGGER.debug("START method createIndex.");
for (int i = 0; i < splitLine.length; i++)
headerIndex.put(splitLine[i], i);
printMap(headerIndex);
LOGGER.debug("COMPLETED method createIndex.");
}
private static void createIntermediateIndex(String[] splitLine) {
LOGGER.debug("START method createIntermediateIndex.");
com.tekcomms.c2d.xml.model.v2.Metadata_element[] metadata_element = null;
String[] servicePath = newTopology.getElement().getEntity().getService_path().getLevel();
if (newTopology.getElement().getMetadata() != null)
metadata_element = newTopology.getElement().getMetadata().getMetadata_element();
LOGGER.debug(servicePath.toString());
LOGGER.debug(metadata_element.toString());
headerIntermediateIndex.clear();
int indexIntermediateId = 0;
for (int i = 0; i < servicePath.length; i++) {
String level = servicePath[i];
LOGGER.debug("level is: " + level);
headerIntermediateIndex.put(level, indexIntermediateId);
indexIntermediateId++;
// its identificator is going to be located to the next one
headerIntermediateIndex.put(level + "ID", indexIntermediateId);
indexIntermediateId++;
}
// adding cpe.latitude,cpe.longitude,cpe.customer_class, it could be
// better if it would be metadata as well.
String labelLatitude = newTopology.getElement().getEntity().getLatitude();
// indexIntermediateId++;
headerIntermediateIndex.put(labelLatitude, indexIntermediateId);
String labelLongitude = newTopology.getElement().getEntity().getLongitude();
indexIntermediateId++;
headerIntermediateIndex.put(labelLongitude, indexIntermediateId);
String labelCustomerClass = newTopology.getElement().getCustomer_class();
indexIntermediateId++;
headerIntermediateIndex.put(labelCustomerClass, indexIntermediateId);
// adding metadata
// cpe.phone,cpe.ip,cpe.subscriber_id,cpe.vendor,cpe.model,cpe.customer_status,cpe.contact_telephone,cpe.address,
// cpe.city,cpe.state,cpe.zip,cpe.bootfile,cpe.software_version,cpe.hardware_version
// now i need to iterate over each Metadata_element belonging to
// topology.element.metadata
// are there any metadata?
if (metadata_element != null && metadata_element.length != 0)
for (int j = 0; j < metadata_element.length; j++) {
String label = metadata_element[j].getLabel();
label = label.toLowerCase();
LOGGER.debug(" ==label: " + label + " index_pos: " + j);
indexIntermediateId++;
headerIntermediateIndex.put(label, indexIntermediateId);
}
printMap(headerIntermediateIndex);
LOGGER.debug("COMPLETED method createIntermediateIndex.");
}
Reading the entire dataset of 1,280,000 lines takes 800 ms, so the problem is in this method:
private static void createIntermediateStringBuffer(StringBuilder sbIntermediate, String[] splitLine) throws ClassCastException,
NullPointerException {
LOGGER.debug("START method createIntermediateStringBuffer.");
long start, end;
start = System.currentTimeMillis();
ArrayList<String> hashes = new ArrayList<String>();
com.tekcomms.c2d.xml.model.v2.Metadata_element[] metadata_element = null;
String[] servicePath = newTopology.getElement().getEntity().getService_path().getLevel();
LOGGER.debug(servicePath.toString());
if (newTopology.getElement().getMetadata() != null) {
metadata_element = newTopology.getElement().getMetadata().getMetadata_element();
LOGGER.debug(metadata_element.toString());
}
for (int i = 0; i < servicePath.length; i++) {
String level = servicePath[i];
LOGGER.debug("level is: " + level);
if (splitLine.length > getPositionFromIndex(level)) {
String name = splitLine[getPositionFromIndex(level)];
sbIntermediate.append(name);
hashes.add(name);
sbIntermediate.append(REGEX_COMMA).append(HashUtils.calculateHash(hashes)).append(REGEX_COMMA);
LOGGER.debug(" ==sbIntermediate: " + sbIntermediate.toString());
}
}
// end=System.currentTimeMillis();
// LOGGER.info("COMPLETED adding name hash. " + (end - start) + " ms. " + (end - start) / 1000 + " seg.");
// adding cpe.latitude,cpe.longitude,cpe.customer_class, it should be
// better if it would be metadata as well.
String labelLatitude = newTopology.getElement().getEntity().getLatitude();
if (splitLine.length > getPositionFromIndex(labelLatitude)) {
String lat = splitLine[getPositionFromIndex(labelLatitude)];
sbIntermediate.append(lat).append(REGEX_COMMA);
}
String labelLongitude = newTopology.getElement().getEntity().getLongitude();
if (splitLine.length > getPositionFromIndex(labelLongitude)) {
String lon = splitLine[getPositionFromIndex(labelLongitude)];
sbIntermediate.append(lon).append(REGEX_COMMA);
}
String labelCustomerClass = newTopology.getElement().getCustomer_class();
if (splitLine.length > getPositionFromIndex(labelCustomerClass)) {
String customerClass = splitLine[getPositionFromIndex(labelCustomerClass)];
sbIntermediate.append(customerClass).append(REGEX_COMMA);
}
// end=System.currentTimeMillis();
// LOGGER.info("COMPLETED adding lat,lon,customer. " + (end - start) + " ms. " + (end - start) / 1000 + " seg.");
// watch out metadata are optional, it can appear as a void chain!
if (metadata_element != null && metadata_element.length != 0)
for (int j = 0; j < metadata_element.length; j++) {
String label = metadata_element[j].getLabel();
LOGGER.debug(" ==label: " + label + " index_pos: " + j);
if (splitLine.length > getPositionFromIndex(label)) {
String actualValue = splitLine[getPositionFromIndex(label)];
if (!"".equals(actualValue))
sbIntermediate.append(actualValue).append(REGEX_COMMA);
else
sbIntermediate.append("").append(REGEX_COMMA);
} else
sbIntermediate.append("").append(REGEX_COMMA);
LOGGER.debug(" ==sbIntermediate: " + sbIntermediate.toString());
}//for
sbIntermediate.append("\n");
end = System.currentTimeMillis();
LOGGER.info("COMPLETED method createIntermediateStringBuffer. " + (end - start) + " ms. ");
}
As you can see, this method builds up a precalculated line in the StringBuilder: it reads every line from the input CSV file, calculates new data from that line and finally appends the generated line to the StringBuilder, so that at the end I can create the output file from that buffer.
I have run JConsole and I can see that there are no memory leaks; I can see the sawtooth pattern representing the creation of objects and the GC collecting garbage. It never crosses the memory heap threshold.
One thing I have noticed is that the time needed to add a new line to the StringBuilder starts in a very small range (5, 6, 10 ms), but rises over time to 100-200 ms, and I suspect it will keep growing, so this is probably the bottleneck.
I have tried to analyze the code. I know that there are 3 for loops, but they are very short; the first loop iterates over only 8 elements:
for (int i = 0; i < servicePath.length; i++) {
String level = servicePath[i];
LOGGER.debug("level is: " + level);
if (splitLine.length > getPositionFromIndex(level)) {
String name = splitLine[getPositionFromIndex(level)];
sbIntermediate.append(name);
hashes.add(name);
sbIntermediate.append(REGEX_COMMA).append(HashUtils.calculateHash(hashes)).append(REGEX_COMMA);
LOGGER.debug(" ==sbIntermediate: " + sbIntermediate.toString());
}
}
I have measured the time needed to get the name from the split line and it is negligible, 0 ms; the same for the calculateHash method, 0 ms.
The other loops are practically the same: they iterate from 0 to n, where n is a very small int, 3 to 10 for example, so I do not understand why it takes more and more time to finish the method. The only thing I can find is that appending a new line to the buffer is slowing the process down.
I am thinking about a producer/consumer multi-threaded strategy: a reader thread that reads every line and puts it into a circular buffer, other threads that take the lines one by one, process them and add a precalculated line to the StringBuffer (which is thread safe); when the file is fully read, the reader thread sends a message to the other threads telling them to stop. Finally I have to save this buffer to a file. What do you think? Is this a good idea?

I am thinking about a producer/consumer multi-threaded strategy: a reader thread that reads every line and puts it into a circular buffer, other threads that take the lines one by one, process them and add a precalculated line to the StringBuffer (which is thread safe); when the file is fully read, the reader thread sends a message to the other threads telling them to stop. Finally I have to save this buffer to a file. What do you think? Is this a good idea?
Maybe, but it's quite a lot of work; I'd try something simpler first.
line.split(REGEX_COMMA)
Your REGEX_COMMA is a string which gets compiled into a regex a million times. It's trivial, but I'd try to use a precompiled Pattern instead.
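For instance, a minimal sketch, assuming REGEX_COMMA is just a plain comma:
private static final Pattern COMMA = Pattern.compile(","); // java.util.regex.Pattern, compiled once
// then, inside the read loop:
String[] splitLine = COMMA.split(line);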
You're producing a lot of garbage with your split. Maybe you should avoid it by manually splitting the input into a reused ArrayList<String> (it's just a few lines).
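A sketch of what that manual split could look like, reusing one list instead of allocating a new String[] per line (the helper name and the static field are made up for illustration):
import java.util.ArrayList;
import java.util.List;
private static final List<String> fields = new ArrayList<>();
private static List<String> splitIntoReusedList(String line) {
    fields.clear();
    int start = 0;
    int comma;
    while ((comma = line.indexOf(',', start)) >= 0) {
        fields.add(line.substring(start, comma));
        start = comma + 1;
    }
    fields.add(line.substring(start)); // the last field
    return fields;
}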
If all you need is writing the result into a file, it might be better to avoid building one huge String. Maybe a List<String> or even a List<StringBuilder> would be better, maybe writing directly to a buffered stream would do.
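A sketch of the streaming variant, assuming the only consumer of sbIntermediate is the final file write; out_path is a hypothetical output path and COMMA is the precompiled Pattern from the sketch above:
try (BufferedReader br = new BufferedReader(new FileReader(csv_path), blockSize);
     BufferedWriter bw = new BufferedWriter(new FileWriter(out_path))) {
    skipFirstLine(br);
    String line;
    while ((line = br.readLine()) != null) {
        StringBuilder sb = new StringBuilder(256); // small, per-line buffer
        createIntermediateStringBuffer(sb, COMMA.split(line));
        bw.write(sb.toString()); // written out immediately, nothing large accumulates
    }
}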
You seem to be working with ASCII only. Your encoding is platform dependent which may mean you're using UTF-8, which is possibly slow. Switching to a simpler encoding could help.
Working with byte[] instead of String would most probably help. Bytes are half as big as chars and there's no conversion needed when reading a file. All the operations you do can be done with bytes equally easy.
One thing I have noticed is that the time needed to add a new line to the StringBuilder starts in a very small range (5, 6, 10 ms), but rises over time to 100-200 ms, and I suspect it will keep growing, so this is probably the bottleneck.
That's resizing, which could be sped up by using the suggested ArrayList<String>, as the amount of data to be copied is much lower. Writing the data out when the buffer gets big would do as well.
I have measured the time needed to get the name from the split line and it is negligible, 0 ms; the same for the calculateHash method, 0 ms.
Never use currentTimeMillis for this as nanoTime is strictly better. Use a profiler. The problem with a profiler is that it changes what it should measure. As a poor man's profiler, you can compute the sum of all the times spent inside the suspect method and compare it with the total time.
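A minimal sketch of that poor man's profiler (the accumulator name is made up):
private static long totalNanosInSuspect = 0;
// around each call to the suspect method:
long t0 = System.nanoTime();
createIntermediateStringBuffer(sbIntermediate, splitLine);
totalNanosInSuspect += System.nanoTime() - t0;
// after the run, compare totalNanosInSuspect with the total elapsed time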
What's the CPU load and what does GC do when running the program?

I used the Super CSV library in my project to handle large sets of lines. It is relatively fast compared to manually reading the lines.
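A minimal sketch of reading a CSV row by row with Super CSV; the file name is just an example:
try (CsvListReader reader = new CsvListReader(new FileReader("input.csv"),
        CsvPreference.STANDARD_PREFERENCE)) { // org.supercsv.io / org.supercsv.prefs
    String[] header = reader.getHeader(true); // first line as column names
    List<String> row;
    while ((row = reader.read()) != null) {
        // process one parsed row here
    }
}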

Related

Read and compare two large Files

I would like to read and compare all the lines of both files: for each hash in my test.txt file, I would like to find the matching hashes in the password.txt file. The problem is that it should be fast enough (I would say max 45 min for 10M lines in password.txt and 1M lines in test.txt).
I have for the moment this code
private static void bufferedReaderFilePasswordFirst() {
Path path = Paths.get("C:\\Users\\basil\\OneDrive - Haute Ecole Bruxelles Brabant (HE2B)\\Documents\\NetBeansProjects\\sha256\\passwords.txt");
Path pathUser = Paths.get("C:\\Users\\basil\\OneDrive - Haute Ecole Bruxelles Brabant (HE2B)\\Documents\\NetBeansProjects\\sha256\\test.txt");
int nbOfLine = 0;
StringBuffer oui = new StringBuffer();
try (BufferedReader readerPasswordGenerate = Files.newBufferedReader(path, Charset.forName("UTF-8"));) {
String currentLineUser = null;
String currentLinePassword = null;
long start = System.nanoTime();
while (((currentLinePassword = readerPasswordGenerate.readLine()) != null)) {
BufferedReader readerPasswordUser = Files.newBufferedReader(pathUser, Charset.forName("UTF-8"));
while ((currentLineUser = readerPasswordUser.readLine()) != null) {
String firstWord = currentLinePassword.substring(0, currentLinePassword.indexOf(":"));
if ((firstWord.charAt(0) == currentLineUser.charAt(0))
&& (firstWord.charAt(14) == currentLineUser.charAt(14))
&& (firstWord.charAt(31) == currentLineUser.charAt(31))
&& (firstWord.charAt(63) == currentLineUser.charAt(63))
) {
if (firstWord.equals(currentLineUser)) {
String secondWord = currentLinePassword.substring(currentLinePassword.lastIndexOf(":") + 1);
oui.append(secondWord).append(System.lineSeparator());
}
}
}
if (nbOfLine % 300 == 0) {
System.out.println("We are at the " + nbOfLine);
final long consumed = System.nanoTime() - start;
final long totConsumed = TimeUnit.NANOSECONDS.toMillis(consumed);
final double tot = (double) totConsumed;
System.out.printf("Not done. Took %s seconds", (tot / 1000));
System.out.println(oui + " oui");
}
nbOfLine++;
}
System.out.println(oui);
final long consumed = System.nanoTime() - start;
final long totConsumed = TimeUnit.NANOSECONDS.toMillis(consumed);
final double tot = (double) totConsumed;
System.out.printf("Done. Took %s seconds", (tot / 1000));
} catch (IOException ex) {
ex.printStackTrace(); //handle an exception here
}
}
In this code, I just compare, for each element in my test.txt, whether the corresponding hash in password.txt is the same.
The password.txt contains for all elements: hash:password
and test.txt contains only: hash
Thanks
In this code, I just compare, for each element in my test.txt, whether the corresponding hash in password.txt is the same.
If you are familiar with Big-O notation, you might recognize that this means your algorithm runs in O(n^2) time. In your specific case, for each of the 1,000,000 lines in test.txt you are doing 10,000,000 comparisons for a total of 10,000,000,000,000 total comparisons. To achieve your goal of running it within 45 minutes you would need to do 3.7 billion comparisons per second. For comparison, the i7 in my laptop runs at a max of 3.9GHz (billion cycles per second) and it will take much more than a single cpu cycle to execute one of these comparisons.
You can reduce the time complexity down to O(n) by first reading the password.txt into a HashMap (10,000,000 operations). From there, any individual check from test.txt only takes a single operation (1,000,000 total), resulting in 11,000,000 operations total. That means you only have to do ~4,000 operations a second (a 99.99989% reduction) to finish in 45 minutes which is much more doable.
Here's some pseudo-code to illustrate what that could look like:
// I like Scanner over BufferedReader for reading files. Use whatever you like.
Scanner readPassword = new Scanner(new File("password.txt"));
// Load all password/hash pairings from password.txt into a HashMap for quick lookups
HashMap<String, List<String>> passwords = new HashMap<>();
while (readPassword.hasNextLine()) {
String line = readPassword.nextLine();
String[] lineParts = line.split(":");
String hash = lineParts[0];
String password = lineParts[1];
// If we haven't seen the hash before, create a new list to store its associated passwords
if (passwords.get(hash) == null) {
passwords.put(hash, new LinkedList<>());
}
// Add the password to the list of all passwords that have this hash
passwords.get(hash).add(password);
}
// Perform all the lookups from test.txt
Scanner readTest = new Scanner(new File("test.txt"));
while (readTest.hasNextLine()) {
String testHash = readTest.nextLine();
List<String> matchingPasswords = passwords.get(testHash);
// Now do whatever you want with the list of associated passwords...
}
Side Notes:
Looking at your code, it looks like you have a few extra requirements (e.g. timing) that I didn't consider in this code snippet. I trust you can figure out how to integrate those additional requirements.
Some of the more academic people on here might take issue with a few parts of my Big-O description/analysis. I'm sure their comments on this post will expound that topic in greater detail if that interests you.

Streaming a file from HDFS vs copying it to local disk

In my Java application I am using a text file (size ~300 MB) which is kept in HDFS. Each line of the file contains a string and an integer ID separated by a comma. I am reading the file line by line and creating HashMaps (String, ID) from it.
The file looks like this:
String1,Integer1
String2,Integer2
...
Now, I am currently reading the file directly from HDFS using the Apache Hadoop Configuration and FileSystem objects:
Configuration conf = new Configuration();
conf.addResource("core-site.xml");
conf.addResource("hdfs-site.xml");
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
String path = "<some location in HDFS>";
FileSystem fs = FileSystem.get(URI.create(path), conf);
FSDataInputStream in = fs.open(new Path(path));
The input stream "in" is passed to another function that reads the file, shown below as init(InputStream is):
public void init(InputStream is) throws Exception {
ConcurrentMap<String, String> pageToId = new ConcurrentHashMap<>();
ConcurrentMap<String, String> idToPage = new ConcurrentHashMap<>();
logger.info("Free memory: " + Runtime.getRuntime().freeMemory());
InputStreamReader stream = new InputStreamReader(is, StandardCharsets.UTF_8);
BufferedReader reader = new BufferedReader(stream);
List<String> pageIdMappingColumns = ServerProperties.getInstance().getIdMappingColumns();
String line;
int line_no=0;
while (true) {
try {
line = reader.readLine();
if (line == null) {
break;
}
line_no++;
//System.out.println("Free memory: " + Runtime.getRuntime().freeMemory());
String[] values = line.split(COMMA);
//System.out.println("Free memory: " + Runtime.getRuntime().freeMemory());
if (values.length < pageIdMappingColumns.size()) {
throw new RuntimeException(PAGEMAPPER_INVALID_MAPPING_FILE_FORMAT);
}
String id = EMPTY_STR;
String page = EMPTY_STR;
for (int i = 0; i < values.length; i++) {
String s = values[i].trim();
if (PAGEID.equals(pageIdMappingColumns.get(i))) {
id = s;
continue;
}
if (PAGENAME.equals(pageIdMappingColumns.get(i))) {
page = s;
}
}
pageToId.put(page, id);
idToPage.put(id, page);
} catch (Exception e) {
logger.error(PAGEMAPPER_INIT + e.toString() + " on line " + line_no);
}
}
logger.info("Free memory: " + Runtime.getRuntime().freeMemory());
logger.info("Total number of lines: " + line_no);
reader.close();
ConcurrentMap<String, String> oldPageToId = pageToIdRef.get();
ConcurrentMap<String, String> oldIdToPage = idToPageRef.get();
idToPage.put(MINUS_1, START);
idToPage.put(MINUS_2, EXIT);
pageToId.put(START, MINUS_1);
pageToId.put(EXIT, MINUS_2);
/* Update the Atomic reference hashmaps in memory in two conditions
1. If there was no map in memory(first iteration)
2. If the number of page-names and page-id pairs in the mappings.txt file are more than the previous iteration
*/
if (oldPageToId == null || oldIdToPage != null && oldIdToPage.size() <= idToPage.size() && oldPageToId.size() <= pageToId.size()) {
idToPageRef.set(idToPage);
pageToIdRef.set(pageToId);
logger.info(PAGEMAPPER_INIT + " " + PAGEMAPPER_UPDATE_MAPPING);
} else {
logger.info(PAGEMAPPER_INIT + " " + PAGEMAPPER_LOG_MSZ);
}
}
I am closing the stream when the work is done like this:
IOUtils.closeQuietly(is);
I am executing the above code every hour, since the file is changed in HDFS during that interval. Now I am getting java.lang.OutOfMemoryError: Java heap space.
My question is: is it better to copy the file to local disk and then use it, rather than directly accessing it from HDFS, as far as memory requirements are concerned?
Note: The file has > 3200000 lines.
Streaming is always the way to go.
You're getting OutOfMemoryError because you never close your stream, hence the memory leak.
Either manually close your stream or use try-with-resources.
Edit
pageToId.put(page, id);
idToPage.put(id, page);
You're storing at least 2x your file size in memory, which is roughly 600 MB.
After that, you assign that value to some ref variable:
idToPageRef.set(idToPage);
pageToIdRef.set(pageToId);
I guess that you're still holding a reference to the old ref data somewhere, so the internal map data is not released.
You also have a resource leak at
throw new RuntimeException(PAGEMAPPER_INVALID_MAPPING_FILE_FORMAT);
You should use try-with-resources or manually close your stream in a finally block.
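A minimal sketch of that change in init (only the reader handling is shown; the rest of the method stays as it is):
public void init(InputStream is) throws Exception {
    ConcurrentMap<String, String> pageToId = new ConcurrentHashMap<>();
    ConcurrentMap<String, String> idToPage = new ConcurrentHashMap<>();
    // try-with-resources closes the reader (and the wrapped stream) even when a
    // RuntimeException is thrown for an invalid line
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(is, StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            // parse the line and fill pageToId / idToPage as before
        }
    }
    // ... update the atomic references as before
}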

Java looping through array - Optimization

I've got some Java code that runs the expected way, but it's taking some time (a few seconds) even though the job is just looping through an array.
The input file is a FASTA file. The file I'm using is 2.9 MB, and there are other FASTA files that can be up to 20 MB.
In the code I'm trying to loop through it in groups of three, e.g. AGC TTT TCA ... etc. The code makes no functional sense for now, but what I want is to match each amino acid to its equivalent group of bases. Example:
AGC - Ser / CUG Leu / ... etc
So what's wrong with the code? Is there any way to do it better? Any optimization? Looping through the whole String takes some time, maybe just seconds, but I need to find a better way to do it.
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
public class fasta {
public static void main(String[] args) throws IOException {
File fastaFile;
FileReader fastaReader;
BufferedReader fastaBuffer = null;
StringBuilder fastaString = new StringBuilder();
try {
fastaFile = new File("res/NC_017108.fna");
fastaReader = new FileReader(fastaFile);
fastaBuffer = new BufferedReader(fastaReader);
String fastaDescription = fastaBuffer.readLine();
String line = fastaBuffer.readLine();
while (line != null) {
fastaString.append(line);
line = fastaBuffer.readLine();
}
System.out.println(fastaDescription);
System.out.println();
String currentFastaAcid;
for (int i = 0; i < fastaString.length(); i+=3) {
currentFastaAcid = fastaString.toString().substring(i, i + 3);
System.out.println(currentFastaAcid);
}
} catch (NullPointerException e) {
System.out.println(e.getMessage());
} catch (FileNotFoundException e) {
System.out.println(e.getMessage());
} catch (IOException e) {
System.out.println(e.getMessage());
} finally {
fastaBuffer.close();
}
}
}
currentFastaAcid = fastaString.toString().substring(i, i + 3);
Please replace with
currentFastaAcid = fastaString.substring(i, i + 3);
The toString method of StringBuilder creates a new String instance every time you call it, and that String contains a copy of your whole large string. If you call substring directly on the StringBuilder, it returns only the small substring.
Also remove System.out.println if you don't really need it.
The big factor here is that you are calling substring on a new String each time.
Instead, use substring directly on the StringBuilder:
for (int i = 0; i < fastaString.length(); i+=3){
currentFastaAcid = fastaString.substring(i, i + 3);
System.out.println(currentFastaAcid);
}
Also, instead of printing currentFastaAcid each time, save it into a list and print the list at the end:
List<String> acids = new LinkedList<String>();
for (int i = 0; i < fastaString.length(); i+=3){
currentFastaAcid = fastaString.substring(i, i + 3);
acids.add(currentFastaAcid);
}
System.out.println(acids.toString());
Your main problem, besides the debug output, is surely that you are creating a new String containing the complete file contents in each iteration of your loop:
currentFastaAcid = fastaString.toString().substring(i, i + 3);
fastaString.toString() gives the same result in each iteration and is therefore redundant. Move it outside the loop and you will surely save some seconds of runtime.
Apart from the suggested optimizations in the serial code, I would go for parallel processing to reduce the time further. If you have a really big file, you can divide the work of reading the file and processing the read lines between separate threads: while one thread is busy reading the next line from the large file, the other thread can process the lines already read and print them to the console, as in the sketch below.
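A minimal sketch of that split using a BlockingQueue; the class name, queue size and poison-pill marker are arbitrary choices for illustration, not taken from the question:
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
public class FastaPipeline {
    public static void main(String[] args) throws Exception {
        BlockingQueue<String> queue = new ArrayBlockingQueue<>(1024);
        final String POISON = "<<EOF>>"; // end-of-input marker
        Thread reader = new Thread(() -> {
            try (BufferedReader br = new BufferedReader(new FileReader("res/NC_017108.fna"))) {
                br.readLine(); // skip the FASTA description line
                String line;
                while ((line = br.readLine()) != null) {
                    queue.put(line); // blocks if the worker falls behind
                }
                queue.put(POISON);
            } catch (Exception e) {
                e.printStackTrace();
            }
        });
        Thread worker = new Thread(() -> {
            try {
                String line;
                while (!(line = queue.take()).equals(POISON)) {
                    // process the line here, e.g. split it into codons of three bases
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        });
        reader.start();
        worker.start();
        reader.join();
        worker.join();
    }
}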
If you remove the
System.out.println(currentFastaAcid);
line in the for loop, you will save quite a decent amount of time.

Java match/exceed performance of readline

For my application, I had to write a custom "readline" method since I wanted to detect and preserve the newline endings in an ASCII text file. The Java readLine() method does not tell which newline sequence (\r, \n, \r\n) or EOF was encountered, so I cannot write the exact same newline sequence back when writing the modified file.
Here is the SSCE of my test example.
public class TestLineIO {
public static java.util.ArrayList<String> readLineArrayFromFile1(java.io.File file) {
java.util.ArrayList<String> lineArray = new java.util.ArrayList<String>();
try {
java.io.BufferedReader br = new java.io.BufferedReader(new java.io.FileReader(file));
String strLine;
while ((strLine = br.readLine()) != null) {
lineArray.add(strLine);
}
br.close();
} catch (java.io.IOException e) {
System.err.println("Could not read file");
System.err.println(e);
}
lineArray.trimToSize();
return lineArray;
}
public static boolean writeLineArrayToFile1(java.util.ArrayList<String> lineArray, java.io.File file) {
try {
java.io.BufferedWriter out = new java.io.BufferedWriter(new java.io.FileWriter(file));
int size = lineArray.size();
for (int i = 0; i < size; i++) {
out.write(lineArray.get(i));
out.newLine();
}
out.close();
} catch (java.io.IOException e) {
System.err.println("Could not write file");
System.err.println(e);
return false;
}
return true;
}
public static java.util.ArrayList<String> readLineArrayFromFile2(java.io.File file) {
java.util.ArrayList<String> lineArray = new java.util.ArrayList<String>();
try {
java.io.FileInputStream stream = new java.io.FileInputStream(file);
try {
java.nio.channels.FileChannel fc = stream.getChannel();
java.nio.MappedByteBuffer bb = fc.map(java.nio.channels.FileChannel.MapMode.READ_ONLY, 0, fc.size());
char[] fileArray = java.nio.charset.Charset.defaultCharset().decode(bb).array();
if (fileArray == null || fileArray.length == 0) {
return lineArray;
}
int length = fileArray.length;
int start = 0;
int index = 0;
while (index < length) {
if (fileArray[index] == '\n') {
lineArray.add(new String(fileArray, start, index - start + 1));
start = index + 1;
} else if (fileArray[index] == '\r') {
if (index == length - 1) { //last character in the file
lineArray.add(new String(fileArray, start, length - start));
start = length;
break;
} else {
if (fileArray[index + 1] == '\n') {
lineArray.add(new String(fileArray, start, index - start + 2));
start = index + 2;
index++;
} else {
lineArray.add(new String(fileArray, start, index - start + 1));
start = index + 1;
}
}
}
index++;
}
if (start < length) {
lineArray.add(new String(fileArray, start, length - start));
}
} finally {
stream.close();
}
} catch (java.io.IOException e) {
System.err.println("Could not read file");
System.err.println(e);
e.printStackTrace();
return lineArray;
}
lineArray.trimToSize();
return lineArray;
}
public static boolean writeLineArrayToFile2(java.util.ArrayList<String> lineArray, java.io.File file) {
try {
java.io.BufferedWriter out = new java.io.BufferedWriter(new java.io.FileWriter(file));
int size = lineArray.size();
for (int i = 0; i < size; i++) {
out.write(lineArray.get(i));
}
out.close();
} catch (java.io.IOException e) {
System.err.println("Could not write file");
System.err.println(e);
return false;
}
return true;
}
public static void main(String[] args) {
System.out.println("Begin");
String fileName = "test.txt";
long start = 0;
long stop = 0;
start = java.util.Calendar.getInstance().getTimeInMillis();
java.io.File f = new java.io.File(fileName);
java.util.ArrayList<String> javaLineArray = readLineArrayFromFile1(f);
stop = java.util.Calendar.getInstance().getTimeInMillis();
System.out.println("Total time = " + (stop - start) + " ms");
java.io.File oj = new java.io.File(fileName + "_readline.txt");
writeLineArrayToFile1(javaLineArray, oj);
start = java.util.Calendar.getInstance().getTimeInMillis();
java.util.ArrayList<String> myLineArray = readLineArrayFromFile2(f);
stop = java.util.Calendar.getInstance().getTimeInMillis();
System.out.println("Total time = " + (stop - start) + " ms");
java.io.File om = new java.io.File(fileName + "_custom.txt");
writeLineArrayToFile2(myLineArray, om);
System.out.println("End");
}
}
Version 1 uses readLine(), whereas version 2 is my version, which preserves newline characters.
On a text file with about 500K lines, version1 takes about 380 ms, whereas version2 takes 1074 ms.
How can I speed-up the performance of version2?
I checked Google guava and apache-commons libraries but cannot find a suitable replacement for "readLine()" that will tell which newline character was encountered when reading a text file.
Whenever the issue regards a program's speed, the main thing you should keep in mind is that, for any continuous process within that program, the speed is nearly always limited by one of two things: CPU (processing power) or IO (memory allocation and transfer speed).
Usually either your CPU is faster than your IO, or the contrary. Because of this, your program's speed-limit is almost always dictated by one of them, and it's usually easy to know which:
A program that does a lot of calculations but makes only a few, small operations with files, is almost certainly CPU-bound.
A program that reads a lot of data from files, or writes a lot of data to them, but is not very demanding towards processing, is almost certainly IO-bound.
Things are kinda straightforward when trying to improve a CPU-bound program's speed. It mostly comes down to achieving the same goal or effect while making fewer operations.
That, however, does not make the process easy. In fact, it's usually much harder to optimize CPU-bound programs than IO-bound ones, because each CPU-related operation is usually unique and has to be revised individually.
IO-bound programs, although generally easier to optimize once you have the experience, are not so straightforward: there is a lot more to consider when dealing with IO-bound processes.
I'll be using Hard-Disk Drives (HDDs) as the basis, since the characteristics I'll mention affect HDDs the strongest (because they are mechanical), but you should keep in mind that many of the same concepts apply, to some extent, to almost every memory-storage hardware, including Solid-State Drives (SSDs) and even RAM!
These are the main performance characteristics of most memory-storage hardware:
Access time: Also known as response time, it is the time it takes before the hardware can actually transfer data.
For mechanical hardware such as HDDs, this is mostly related to the mechanical nature of the drive, in other words, its rotating disks and moving "heads". As such, access times of mechanical drives can vary significantly from one drive to another.
For circuital hardware such as SSDs and RAM, this time is not dependent on moving parts, but rather electrical connections, so the access time is very quick and consistent, and you shouldn't worry about it.
Seek time: The time it takes for the hardware to seek (reach) the correct position within its internal subdivisions, in order to read from or write to addresses in that section.
For mechanical drives, mainly rotary ones, the seek time measures the time it takes the head assembly on the actuator arm to travel to the track of the disk where the data will be read from or written to.
Average seek time ranges from 3 ms (~) for high-end server drives, to 15 ms (~) for mobile drives, with the most common desktop drives typically having a seek time around 9 ms (~).
With RAM and SSDs, there are no moving parts, so a measurement of the seek time is only testing the electronic circuits, and preparing a particular location on the memory in the device for the operation.
Typical SSDs will have a seek time between 0.08 to 0.16 ms (~), with RAM being even faster.
Command-Processing time: Also known as command overhead, it is the time it takes for the drive's electronics to set up the necessary communication between the various internal components, so it can read or write the data.
This is in the range of 0.003 ms (~) for both, mechanical and circuital devices, and is usually ignored in benchmarks.
Settle time: It is the time it takes for the heads to settle on the target track and stop vibrating, so that they do not read or write off-track.
This amount is usually very small (typically less than 0.1 ms), and typically included in benchmarks as part of the seek time.
Data-Transfer rate: Also called throughput, it covers both: The internal rate, which is the time it takes to move data between the disk surface and the controller on the drive. And the external rate, which is the time to move data between the controller on the drive and an external component in the host system. It has a few sub-factors within:
Media rate: Speed at which the drive can read bits from the media. In other words, the actual read/write speed.
Sector overhead: Additional time (bytes) needed for control structures and other information necessary to manage the drive, locate and validate data and perform other support functions.
Allocation speed: Similar to sector overhead, it's the time taken for the drive to determine the slots that will be written to, and to register them in its address dictionary. Only needed for write operations.
Head-Switch time: Time required to electrically switch from one head to another; Only applies to multi-head drives and is about 1 to 2 ms.
Cylinder-switch time: Time required to move to an adjacent track; the name cylinder is used because typically all the tracks of a drive with more than one head or data surface are read before moving the actuator, implying the image of a circle or cylinder rather than a track. This time is exclusive to rotary mechanical drives, and is typically about 2 to 3 ms.
This means that the main performance issues regarding IO are caused by going back and forth between IO and processing, an issue that can be enormously diminished by using buffers, and by reading/writing and processing bigger chunks of data rather than one byte at a time.
As you can also see, although many of the speed characteristics are still present, RAM and SSDs do not have the same internal limits of HDDs, so their internal and external transfer rates often reach the maximum capabilities of the drive-to-host interface.
Chunk approach example:
This example will create a Test folder on the desktop, and generate a Test.txt file within.
The file is generated with a specified number of lines, each line containing the word "Test" repeated a specific number of times (for file-size purposes). Each line is ended by "\r", "\n" or "\r\n", in rotation.
It's pointless to save the results of each chunk in memory cumulatively, as doing so would lead to the whole file ending up in memory eventually, which is nearly the same problem as not using chunks to begin with.
As such, an output file is created in the same Test folder, to which the result of every chunk is stored at, once that chunk is finished.
The base file is read using buffers, and those buffers are additionally used as the chunks.
The process here is simply printing a textual version of the line-separator ("\\r", "\\n" or "\\r\\n"), followed by ": ", followed by the line contents; But for the last line, "EOF" is used instead.
To actually operate with chunks, it's probably easier to manage with a class-based approach, rather than a purely function-based one.
Anyways, here goes the code:
public static void main(String[] args) throws FileNotFoundException, IOException {
File file = new File(TEST_FOLDER, "Test.txt");
//These settings create a 122 MB file.
generateTestFile(file, 500000, 50);
long clock = System.nanoTime();
processChunks(file, 8 * (int) Math.pow(1024, 2));
clock = System.nanoTime() - clock;
float millis = clock / 1000000f;
float seconds = millis / 1000f;
System.out.printf(""
+ "%12d nanos\n"
+ "%12.3f millis\n"
+ "%12.3f seconds\n",
clock, millis, seconds);
}
public static File prepareResultFile(File source) {
String ofn = source.getName(); //Original File Name.
int extPos = ofn.lastIndexOf('.'); //Extension index.
String ext = ofn.substring(extPos); //Get extension.
ofn = ofn.substring(0, extPos); //Get name without extension reusing 'ofn'.
return new File(source.getParentFile(), ofn + "_Result" + ext);
}
public static void processChunks(File file, int buffSize)
throws FileNotFoundException, IOException {
//No need for buffers bigger than the file itself.
if (file.length() < buffSize) {
buffSize = (int)file.length();
}
byte[] buffer = new byte[buffSize];
BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file), buffSize);
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(
prepareResultFile(file)), buffSize);
StringBuilder sb = new StringBuilder();
int read;
boolean pendingCR = false; //A '\r' ended the previous chunk.
while ((read = bis.read(buffer)) > (-1)) {
int start = 0;
//Check if a "\r\n" was split between chunks.
if (pendingCR) {
if (buffer[0] == '\n') {
bos.write(("\\r\\n: " + sb.toString() + System.lineSeparator()).getBytes());
start = 1; //Skip '\n'.
} else {
bos.write(("\\r: " + sb.toString() + System.lineSeparator()).getBytes());
}
sb = new StringBuilder(); //Reset accumulator.
pendingCR = false;
}
for (int i = start; i < read; i++) { //Only iterate over the bytes actually read.
if (buffer[i] == '\r') {
if (i + 1 < read) {
if (buffer[i + 1] == '\n') {
bos.write(("\\r\\n: " + sb.toString() + System.lineSeparator()).getBytes());
i++; //Skip '\n'.
} else {
bos.write(("\\r: " + sb.toString() + System.lineSeparator()).getBytes());
}
sb = new StringBuilder(); //Reset accumulator.
} else {
//A "\r\n" might be split between two chunks; remember the pending '\r'.
pendingCR = true;
}
} else if (buffer[i] == '\n') {
bos.write(("\\n: " + sb.toString() + System.lineSeparator()).getBytes());
sb = new StringBuilder(); //Reset accumulator.
} else {
sb.append((char) buffer[i]);
}
}
}
if (pendingCR) {
//The file ended with a bare '\r'.
bos.write(("\\r: " + sb.toString() + System.lineSeparator()).getBytes());
sb = new StringBuilder();
}
bos.write(("EOF: " + sb.toString()).getBytes());
bos.flush();
bos.close();
bis.close();
System.out.println("Finished!");
}
public static boolean generateTestFile(File file, int lines, int elements)
throws IOException {
String[] lineBreakers = {"\r", "\n", "\r\n"};
BufferedOutputStream bos = null;
try {
bos = new BufferedOutputStream(new FileOutputStream(file));
for (int i = 0; i < lines; i++) {
for (int ii = 1; ii < elements; ii++) {
bos.write("test ".getBytes());
}
bos.write("test".getBytes());
bos.write(lineBreakers[i % 3].getBytes());
}
bos.flush();
System.out.printf("LOG: Test file \"%s\" created.\n", file.getName());
return true;
} catch (IOException ex) {
System.err.println("ERR: Could not write file.");
throw ex;
} finally {
try {
bos.close();
} catch (IOException ex) {
System.err.println("WRN: Could not close stream.");
Logger.getLogger(Q_13458142_v2.class.getName()).log(Level.SEVERE, null, ex);
}
}
}
I don't know what IDE you are using, but if it's NetBeans, make a memory-profile of your code and compare to a profile of this one. You should notice a big difference in the amount of memory needed during processing.
Here, the chunk approach's memory usage, which includes not only the chunk itself but also the program's own variables and structures, does not go over 40 MB even though we are dealing with a file bigger than 100 MB.
It also spends very little time in GC, mostly less than 5% at any given point.
The second version doesn't seem to use BufferedReader or another form of buffer. That might be the cause of the slowdown.
Since you seem to read the whole file into memory anyway, you can perhaps read it as one big string (through a buffer) and then parse it in memory to analyze the line endings.
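A minimal sketch of that idea; US-ASCII is assumed because the question mentions an ASCII file, and each terminator is kept attached to its line:
byte[] bytes = Files.readAllBytes(Paths.get("test.txt")); // java.nio.file
String content = new String(bytes, StandardCharsets.US_ASCII);
List<String> lines = new ArrayList<>();
int start = 0;
for (int i = 0; i < content.length(); i++) {
    char c = content.charAt(i);
    if (c == '\n') {
        lines.add(content.substring(start, i + 1)); // covers both "\n" and "\r\n"
        start = i + 1;
    } else if (c == '\r' && (i + 1 == content.length() || content.charAt(i + 1) != '\n')) {
        lines.add(content.substring(start, i + 1)); // lone '\r'
        start = i + 1;
    }
}
if (start < content.length()) {
    lines.add(content.substring(start)); // last line without a terminator
}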
You are doubling the out statements (one for the line and one for the newline).
Can you try the following (use lineSeparator() to get the line separator and append it before writing):
out.write(lineArray.get(i)+System.lineSeparator());
Don't reinvent the wheel.
Check the BufferedReader#readLine() code.
Copy, paste, and make the changes you need to keep the line separator inside the line; a rough sketch of that is shown below.
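A rough sketch of such a method, written against PushbackReader rather than copied from the JDK; it returns each line with its terminator attached, or null at EOF:
// Wrap your input as new PushbackReader(new BufferedReader(new FileReader(file))) so reads stay buffered.
public static String readLineKeepingTerminator(PushbackReader in) throws IOException {
    StringBuilder sb = new StringBuilder();
    int c;
    while ((c = in.read()) != -1) {
        sb.append((char) c);
        if (c == '\n') {
            return sb.toString();
        }
        if (c == '\r') {
            int next = in.read();
            if (next == '\n') {
                sb.append('\n');
            } else if (next != -1) {
                in.unread(next); // not part of this line's terminator
            }
            return sb.toString();
        }
    }
    return sb.length() > 0 ? sb.toString() : null;
}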

Process Builder Incrementing Error

I have written a program to monitor the status of some hard drives attached to a RAID on Linux. Through this program I execute several command line commands. An interesting error occurs though: the program runs for a good three minutes before it seems that it can no longer correctly execute the command it had been previously executing (for many iterations).
It spits out an array index error (my variable driveLetters[d]) because it appears to miss the drive somehow (even though it found it hundreds of times before).
Other things to note: if I tell it to reset int "d" to "0" when it exceeds the number of drives, the program won't crash and will instead just become stuck in an infinite loop.
Also, the time at which the program crashes varies. It doesn't appear to crash after a set number of intervals. Finally, I don't get any kind of memory leak errors.
Here is some of the code that should reveal the error:
public static void scsi_generic() throws IOException, InterruptedException
{
int i =0;
int d =0;
int numberOfDrives = 8;
char driveLetters[] = {'b','c','d','e','f','g','h','i','j','k','l','m'};
String drive = "";
while (i <= numberOfDrives)
{
System.out.println("position 1");
List<String> commands = new ArrayList<String>();
commands.add("cat");
commands.add("/sys/class/scsi_generic/sg"+i+"/device/sas_address");
SystemCommandExecutor commandExecutor = new SystemCommandExecutor(commands);
int driveFound = commandExecutor.executeCommand();
if (driveFound == 0)
{
System.out.println("Folder: sg" + i + " was found." );
StringBuilder stdout = commandExecutor.getStandardOutputFromCommand();
String data = stdout.toString();
String sas = data.substring(11,12);
int sasA = Integer.parseInt(sas,16);
boolean matchedSG = false;
while (matchedSG == false)
{
System.out.println("position2");
List<String> lookSD = new ArrayList<String>();
lookSD.add("test");
lookSD.add("-d");
lookSD.add("/sys/class/scsi_generic/sg"+i+"/device/block:sd" + driveLetters[d]);
SystemCommandExecutor commandSearch = new SystemCommandExecutor(lookSD);
int sdFound = commandSearch.executeCommand();
StringBuilder stdout3 = commandSearch.getStandardOutputFromCommand();
StringBuilder stderr = commandSearch.getStandardErrorFromCommand();
String sdFound2 = stdout3.toString();
if (sdFound == 0)
{
matchedSG = true;
System.out.println("Found the SD drive.");
drive = "sd"+driveLetters[d];
System.out.println(sasA);
hdsas.set(sasA , sas);
d = 0;
i++;
loadDrives(drive , sasA);
}
/* else if (sdFound != )
{
System.out.println("Error:" + sdFound);
System.out.println(d+ " "+ i);
}
*/
else if ( d >= 8)
{
System.out.println("Drive letter: " + driveLetters[d]);
System.out.println("Int: " + i);
// System.out.println(sdFound2);
System.out.println("sd error: "+ sdFound);
// System.out.println(stderr);
//System.out.println(sdFound2 + " m");
}
else
{
d++;
}
}
}
else
{
System.out.println("Folder: sg" + i + " could not be found.");
i++;
}
d =0;
}
}
Any help or suggestions would be awesome! Thanks.
EDIT:
The solution I found was to use the Java library call for testing whether a directory exists, rather than doing it through the Linux command line.
Ex:
File location = new File("directory");
if (location.exists())
{
}
No idea why it works and doesn't crash, whereas the Linux command line approach did after a short period of time, but it does.
This is no direct answer to your question, but it still might help you:
I often have to find bugs in code like yours (very long methods with "global" variables, that is, variables declared at the beginning of a method and used all over the place). Just by refactoring the code properly (short methods with a single purpose each), the cause of the bug becomes immediately visible to me and is fixed within a second (while the refactoring itself takes much longer).
I guess that's what everyone trying to offer you help is doing anyway: refactoring your code (probably only in their head) so that it is (much) easier to understand what's going on.
