Java read into byte array - java

I have a .txt file consisting of 1's and 0's like so;
11111100000001010000000101110010
11111100000001100000000101110010
00000000101001100010000000100000
I would like to be able to read 8 (1's and 0's) and put each 'byte' into a byte array. So a line would be 4 bytes;
11111100 00000101 00000001 01110010 --> 4 bytes, line 1
11111100 00000110 00000001 01110010 --> 8 bytes, line 2
00000000 10100110 00100000 00100000 --> total 12 bytes, line 3
...
and so on.
I believe I need to store the data in a binary file but I'm not sure how to do this. Any help is greatly appreciated.
Edit 1:
I would like to put 8 1's and 0's (11111100, 00000101) into a byte and store in a byte array so 11111100 would be the first byte in the array, 00000101 the second and so on. I hope this is clearer.
Edit 2:
fileopen = new JFileChooser(System.getProperty("user.dir") + "/Example programs"); // open file from current directory
filter = new FileNameExtensionFilter(".txt", "txt");
fileopen.addChoosableFileFilter(filter);
if (fileopen.showOpenDialog(null)== JFileChooser.APPROVE_OPTION)
{
try
{
file = fileopen.getSelectedFile();
//create FileInputStream object
FileInputStream fin = new FileInputStream(file);
byte[] fileContent = new byte[(int)file.length()];
fin.read(fileContent);
for(int i = 0; i < fileContent.length; i++)
{
System.out.println("bit " + i + "= " + fileContent[i]);
}
//create string from byte array
String strFileContent = new String(fileContent);
System.out.println("File content : ");
System.out.println(strFileContent);
}
catch(FileNotFoundException e){}
catch(IOException e){}
}

Here's one way, with comments in the code:
import java.lang.*;
import java.io.*;
import java.util.*;
public class Mkt {
public static void main(String[] args) throws Exception {
BufferedReader br = new BufferedReader(new FileReader("in.txt"));
List<Byte> bytesList = new ArrayList<Byte>();
// Read line by line
for(String line = br.readLine(); line != null; line = br.readLine()) {
// 4 byte representations per line
for(int i = 0; i < 4; i++) {
// Get each of the 4 bytes (i.e. 8 characters representing the byte)
String part = line.substring(i * 8, (i + 1) * 8);
// Parse that into the binary representation
// Integer.parseInt is used as byte in Java is signed (-128 to 127)
byte currByte = (byte)Integer.parseInt(part, 2);
bytesList.add(currByte);
}
}
Byte[] byteArray = bytesList.toArray(new Byte[]{});
// Just print for test
for(byte currByte: byteArray) {
System.out.println(currByte);
}
}
}
Input is read from file named in.txt. Here's a sample run:
$ javac Mkt.java && java Mkt
-4
5
1
114
-4
6
1
114
0
-90
32
32
Hope this helps to get you started, you can tweak to your needs.

Use BufferedReader to read in the txt file.
BufferedReader in = new BufferedReader(...);
ArrayList<byte> bytes = new ArrayList<byte>();
ArrayList<char> buffer = new ArrayList<char>();
int c = 0;
while((c = in.read()) >= 0) {
if(c == '1' || c == '0') buffer.add((char)c);
if(buffer.size() == 8) {
bytes.add(convertToByte(buffer));
buffer.clear();
}
}

Related

Error in reading GZip in java but not in python

I write some data to GZIP binary file using the below java code
public static void WriteDictAndIndex(HashMap<String, Term> terms, int index){
try{
GZIPOutputStream postingListOutput = new GZIPOutputStream(new FileOutputStream(String.format("./generated/posting_list_%d", index)));
GZIPOutputStream dictionaryOutput = new GZIPOutputStream(new FileOutputStream(String.format("./generated/dictionary_%d", index)));
Integer START=0, SIZE=0, VOCAB=0;
for(String s : terms.keySet()){
ArrayList<Pair<Integer, Byte>> postingList = terms.get(s).postingList;
SIZE = postingList.size()*5;
// Write one posting list to the file system
ByteBuffer list_buffer = ByteBuffer.allocate(SIZE);
int totalCount = 0;
for(Pair<Integer, Byte> p : postingList) {
// Write the docID (4 bytes)
list_buffer.putInt(p.getValue0());
// Write the term frequency (1 byte)
byte termFrequency = p.getValue1();
list_buffer.put(termFrequency);
// Counter for the total occurrences of words
totalCount += (int)termFrequency;
}
if(index == 0 && totalCount == 1)
continue;
postingListOutput.write(list_buffer.array());
// Write one dictionary entry to the file system
byte[] token = s.getBytes();
ByteBuffer dict_buffer = ByteBuffer.allocate(16+token.length);
dict_buffer.putInt(token.length);
dict_buffer.put(token);
dict_buffer.putInt(terms.get(s).documentFrequency);
dict_buffer.putInt(START);
dict_buffer.putInt(SIZE);
dictionaryOutput.write(dict_buffer.array());
START += SIZE;
VOCAB += 1;
}
//INFO
System.out.println(String.format("Vocabulary Size: %d", VOCAB));
postingListOutput.close();
dictionaryOutput.close();
}catch(IOException e){
System.err.println(e);
}
}
Now when I read first 695 bytes of this file using python, it reads as expected. But when I read the file using java GZIP, there are some discrepancies (the last 10 bytes of the first 695 bytes that I read are different)
I am trying to read using the following code:
try{
GZIPInputStream postingList = new GZIPInputStream(new FileInputStream(new File(args[1])));
GZIPInputStream dictionary = new GZIPInputStream(new FileInputStream(new File(args[2])));
byte[] buf = new byte[4];
while(true){
// Get the size of the token from the dictionary
dictionary.read(buf);
int tokenSize = ByteBuffer.wrap(buf).getInt();
// Read the token
byte[] tokenBuffer = new byte[tokenSize];
dictionary.read(tokenBuffer);
String token = new String(tokenBuffer, StandardCharsets.UTF_8);
// Read the document frequency
dictionary.read(buf);
int documentFrequency = ByteBuffer.wrap(buf).getInt();
// Read the starting index of the posting list
dictionary.read(buf);
int START = ByteBuffer.wrap(buf).getInt();
// Read the size of the posting list
dictionary.read(buf);
int SIZE = ByteBuffer.wrap(buf).getInt();
// Read the posting list
for(int i=0; i<documentFrequency; i++){
byte[] ID = new byte[4];
postingList.read(ID);
int docID = ByteBuffer.wrap(ID).getInt();
byte[] frequency = new byte[1];
postingList.read(frequency);
System.out.println(String.format("%d: %d: %d",i, docID, frequency[0]));
}
break;
}
postingList.close();
dictionary.close();
}
catch(IOException e){
System.err.println(e);
}
The print statement above will print multiple lines with after reading an integer(4 byte) and a byte in each line.
Last 2 print statments should be of the form(which python reads fine)
137: 81257: 1
138: 81737: 1
But I am getting(using the below java code)
137: 65536: 61
138: 1761673217: 63
Any pointers on what could be the mistake?

Extra bytes are added when I run my compressed file in my decompression method

I'm working with text files here. My LZW decompression doesn't seem to work correctly. Something is wrong when I try to decompress my compressed file back: I get extra bytes added to the text file after decompression. As of now, I found the sequence of values that's causing the issue, it's the * (star character)
For example, When I have a text file containing this following sentence:
*** Hello ***
Here's how my decompression works (I'm showing only 2 iterations):
#1 iteration
priorword = 42
currword = 256
Added in dict: 256: **
Wrote in file: **
#2 iteration
priorword = 256
currword = 32
Added in dict: 257: **_ (_ means space)
Write in file: **
We know have in file: ****_
What could be wrong here, an extra star gets added because as you can see the original file has three stars.
After decompression, I get: **** Hello **
A star * from the right is added to the left, how could this be?
Decompression method:
public void LZW_Decompress(String input) throws IOException {
// DictSize builds up to 4k, Array_Char holds these values
Array_char = new String[4096];
for (int i = 0; i < 256; i++) {
dictionary.put(i, Character.toString((char) i));
Array_char[i] = Character.toString((char) i);
}
// Read input as uncompressed file & Write out compressed file
RandomAccessFile in = new RandomAccessFile(input, "r");
RandomAccessFile out = new RandomAccessFile(input.replace(
".lzw", ""), "rw");
try {
// Gets the first word in code and outputs its corresponding char
buffer[0] = in.readByte();
buffer[1] = in.readByte();
priorword = getvalue(buffer[0], buffer[1], onleft);
onleft = !onleft;
out.writeBytes(Array_char[priorword]);
// Reads every 3 bytes and generates corresponding characters
while (true) {
if (onleft) {
buffer[0] = in.readByte();
buffer[1] = in.readByte();
currword = getvalue(buffer[0], buffer[1], onleft);
} else {
buffer[2] = in.readByte();
currword = getvalue(buffer[1], buffer[2], onleft);
}
onleft = !onleft;
if (currword >= dictSize) {
if (dictSize < 4096) {
Array_char[dictSize] = Array_char[priorword]
+ Array_char[priorword].charAt(0);
}
dictSize++;
out.writeBytes(Array_char[priorword]
+ Array_char[priorword].charAt(0));
} else {
if (dictSize < 4096) {
Array_char[dictSize] = Array_char[priorword]
+ Array_char[currword].charAt(0);
}
dictSize++;
out.writeBytes(Array_char[currword]);
}
priorword = currword;
}
} catch (EOFException e) {
in.close();
out.close();
}
}

How to decode bytes from file that contains strings in java?

I have a file that contains a string followed by bytes that contain binary numbers encoded in them.
Thisisastring. �J
In my code I try to ignore the string and focus on decoding the bytes that are separated by a space. When I run the code the outcome seems to be correct except the first binary number is off by a lot.
StringBuffer buffer = new StringBuffer();
File file = new File(arg);
FileInputStream in = new FileInputStream(file);
InputStreamReader isr = new InputStreamReader(in, "UTF8");
Reader inn = new BufferedReader(isr);
int ch;
while ((ch = inn.read()) > -1){
buffer.append((char)ch);
}
inn.close();
String content = buffer.toString();
String temp = new String();
for(int i=0; i<content.length(); i++){
temp += content.charAt(i);
if(content.charAt(i) == ' '){
while(i != content.length()-1){
i++;
byte b = (byte) content.charAt(i);
String x = Integer.toString(b & 0xFF, 2);
System.out.println(x);
}
}
}
Results:
11111101 <- Why is only this one incorrect?
11000
1001010
1011
What is expected:
10010101
00011000
01001010
1011
You should not use Readers or Strings for binary data.
StringBuffer buffer = new StringBuffer();
File file = new File(arg);
FileInputStream in = new FileInputStream(file);
DataInputStream dis = new DataInputStream(new BufferedInputStream(in));
int ch;
while ((ch = din.read()) > -1){
buffer.append((char)ch);
if (ch == ' ')
{
// next byte is a binary value
byte b = din.readByte();
String x = Integer.toString(b & 0xFF, 2);
System.out.println(x);
}
}

Reading binary file byte by byte

I've been doing research on a java problem I have with no success. I've read a whole bunch of similar questions here on StackOverflow but the solutions just doesn't seem to work as expected.
I'm trying to read a binary file byte by byte.
I've used:
while ((data = inputStream.read()) != -1)
loops...
for (int i = 0; i < bFile.length; i++) {
loops...
But I only get empty or blank output. The actual content of the file I'm trying to read is as follows:
¬í sr assignment6.PetI¿Z8kyQŸ I ageD weightL namet Ljava/lang/String;xp > #4 t andysq ~ #bÀ t simbasq ~ #I t wolletjiesq ~
#$ t rakker
I'm merely trying to read it byte for byte and feed it to a character array with the following line:
char[] charArray = Character.toChars(byteValue);
Bytevalue here represents an int of the byte it's reading.
What is going wrong where?
Since java 7 it is not needed to read byte by byte, there are two utility function in Files:
Path path = Paths.get("C:/temp/test.txt");
// Load as binary:
byte[] bytes = Files.readAllBytes(path);
String asText = new String(bytes, StandardCharset.ISO_8859_1);
// Load as text, with some Charset:
List<String> lines = Files.readAllLines(path, StandardCharsets.ISO_8859_1);
As you want to read binary data, one would use readAllBytes.
String and char is for text. As opposed to many other programming languages, this means Unicode, so all scripts of the world may be combined. char is 16 bit as opposed to the 8 bit byte.
For pure ASCII, the 7 bit subset of Unicode / UTF-8, byte and char values are identical.
Then you might have done the following (low-quality code):
int fileLength = (int) path.size();
char[] chars = new char[fileLength];
int i = 0;
int data;
while ((data = inputStream.read()) != -1) {
chars[i] = (char) data; // data actually being a byte
++i;
}
inputStream.close();
String text = new String(chars);
System.out.println(Arrays.toString(chars));
The problem you had, probably concerned the unwieldy fixed size array in java, and that a char[] still is not a String.
For binary usage, as you seem to be reading serialized data, you might like to dump the file:
int i = 0;
int data;
while ((data = inputStream.read()) != -1) {
char ch = 32 <= data && data < 127 ? (char) data : ' ';
System.out.println("[%06d] %02x %c%n", i, data, ch);
++i;
}
Dumping file position, hex value and char value.
it is simple example:
public class CopyBytes {
public static void main(String[] args) throws IOException {
FileInputStream in = null;
FileOutputStream out = null;
try {
in = new FileInputStream("xanadu.txt");
out = new FileOutputStream("outagain.txt");
int c;
while ((c = in.read()) != -1) {
out.write(c);
}
} finally {
if (in != null) {
in.close();
}
if (out != null) {
out.close();
}
}
}
}
If you want to read text(characters) - use Readers, if you want to read bytes - use Streams
Why not using Apache Commons:
byte[] bytes = IOUtils.toByteArray(inputStream);
Then you can convert it to char:
String str = new String(bytes);
Char[] chars = str.toCharArray();
Or like you did:
char[] charArray = Character.toChars(bytes);
To deserialize objects:
List<Object> results = new ArrayList<Object>();
FileInputStream fis = new FileInputStream("your_file.dat");
ObjectInputStream ois = new ObjectInputStream(fis);
try {
while (true) {
results.add(ois.readObject());
}
} catch (OptionalDataException e) {
if (!e.eof) throw e;
} finally {
ois.close();
}
Edit:
Use file.length() for they array size, and make a byte array. Then inputstream.read(b).
Edit again: if you want characters, use inputstreamreader(fileinputstream(file),charset), it even comes with charset.

How can I write a sequence of strings and then a byte array to a file?

I want to write first a sequence of strings and then a sequence of bytes into a file, using Java. I started by using FileOutputStream because of the array of bytes. After searching the API, I realised that FileOutputStream cannot write Strings, only ints and bytes, so I switched to DataOutputStream. When I run the program, I get an exception. Why?
Here's a portion of my code:
try {
// Create the file
FileOutputStream fos;
DataOutputStream dos; // = new DataOutputStream("compressedfile.ecs_h");
File file= new File("C:\\MyFile.txt");
fos = new FileOutputStream(file);
dos=new DataOutputStream(fos);
/* saves the characters as a dictionary into the file before the binary seq*/
for (int i = 0; i < al.size(); i++) {
String name= al.get(i).name; //gets the string from a global arraylist, don't pay attention to this!
dos.writeChars(name); //saving the name in the file
}
System.out.println("\nIS SUCCESFULLY WRITTEN INTO FILE! ");
dos.writeChars("><");
String strseq;
/*write all elements from the arraylist into a string variable*/
strseq= seq.toString();
System.out.println("sTringSeq: " + strseq);
/*transpose the sequence string into a byte array*/
byte[] data = new byte[strseq.length() / 8];
for (int i = 0; i < data.length; i++) {
data[i] = (byte) Integer.parseInt(strseq.substring(i * 8, (i + 1) * 8), 2);
dos.write(data[i]);
}
dos.flush();
//Close the output stream
dos.close();
} catch(Exception e){}
The problem with your code is that the last for loop was counting over the wrong number of bytes. The code below fixes your problem writing your test data to a file. This works on my machine.
public static void main(String[] args) {
ArrayList<String> al = new ArrayList<String>();
al.add("String1");
al.add("String2");
try {
// Create the file
FileOutputStream fos = new FileOutputStream("MyFile.txt");
DataOutputStream dos = new DataOutputStream(fos);
/* saves the characters as a dictionary into the file before the binary seq */
for (String str : al) {
dos.writeChars(str);
}
System.out.println("\nIS SUCCESFULLY WRITTEN INTO FILE! ");
dos.writeChars("><");
String strseq = "001100111100101000101010111010100100111000000000";
// Ensure that you have a string of the correct size
if (strseq.length() % 8 != 0) {
throw new IllegalStateException(
"Input String is cannot be converted to bytes - wrong size: "
+ strseq.length());
}
int numBytes = strseq.length() / 8;
for (int i = 0; i < numBytes; i++) {
int start = i * 8;
int end = (i + 1) * 8;
byte output = (byte) Integer.parseInt(strseq.substring(start, end), 2);
dos.write(output);
}
dos.writeChars("> Enf of File");
dos.flush();
// Close the output stream
dos.close();
} catch (Exception e) {
e.printStackTrace();
}
}
The approach of writing bytes directly to a test file does have a few problems (I assume that it's a text file in that your test file name ends with .txt), the most obvious one being that some text editors don't handle/display null characters very well (your last test byte was: 00000000 or null). If you want to see the bytes as readable bytes then you could investigate encoding them using Base64 encoding.
Line:
data[i] = (byte) Integer.parseInt(strseq.substring(i * 8, (i + 1) * 8), 2);
looks very suspiciously...
can you provide move details about strseq and its value?
What about this code ?
this code :
byte[] data = new byte[strseq.length() / 8];
for (int i = 0; i < data.length; i++) {
data[i] = (byte) Integer.parseInt(strseq.substring(i * 8, (i + 1) * 8), 2);
dos.write(data[i]);
}
becomes
byte[] data = strseq.getBytes();
With the FileWriter class you have a nice abstraction of a file writing operation.
May this class can help you to write your file...
You can substitute the other OutputStreams by only this class. It have all the methods of you want for write a string and a byte array in a file.

Categories

Resources