How to getProgress of large files using XMLStreamReader - java

I am using the code below to read a large XML file (in GBs) in a Hadoop RecordReader using XMLStreamReader:
public class RecordReader {
private XMLStreamReader reader;
int progressCount = 0;
public RecordReader() {
XMLInputFactory factory = XMLInputFactory.newInstance();
FSDataInputStream fdDataInputStream = fs.open(file); //hdfs file
try {
reader = factory.createXMLStreamReader(fdDataInputStream);
} catch (XMLStreamException exception) {
throw new RuntimeException("XMLStreamException exception : ", exception);
}
}
@Override
public float getProgress() throws IOException, InterruptedException {
return progressCount;
}
}
My question is how to get the reading progress of the file with XMLStreamReader, as it does not provide any start or end position to calculate the progress percentage.
I have referred to How do I keep track of parsing progress of large files in StAX?, but cannot use a FilterReader.
Please help me here.

You could wrap the InputStream by extending FilterInputStream.
public interface InputStreamListener {
void onBytesRead(long totalBytes);
}
public class PublishingInputStream extends FilterInputStream {
private final InputStreamListener listener;
private long totalBytes = 0;
public PublishingInputStream(InputStream in, InputStreamListener listener) {
super(in);
this.listener = listener;
}
@Override
public int read(byte[] b) throws IOException {
int count = super.read(b);
if (count > 0) {
this.totalBytes += count;
this.listener.onBytesRead(totalBytes);
}
return count;
}
// TODO: override the other read() methods
}
Usage
XMLInputFactory factory = XMLInputFactory.newInstance();
InputStream in = fs.open(file);
final long fileSize = someHadoopService.getFileLength(file);
InputStreamListener listener = new InputStreamListener() {
public void onBytesRead(long totalBytes) {
System.out.println(String.format("Read %s of %s bytes", totalBytes, fileSize));
}
};
InputStream publishingIn = new PublishingInputStream(in, listener);
try {
reader = factory.createXMLStreamReader(publishingIn);
// etc
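Inside the Hadoop RecordReader you can then keep the running byte count the listener reports and divide it by the split length in getProgress(). Here is a rough sketch of how the pieces could fit together; the class name, fields and the stubbed-out nextKeyValue()/getCurrentKey()/getCurrentValue() bodies are illustrative, not part of the original code:
import java.io.IOException;
import java.io.InputStream;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class XmlProgressRecordReader extends RecordReader<LongWritable, Text> {
    private volatile long bytesRead = 0L; // updated by the InputStreamListener
    private long totalLength = 1L;        // split length, kept >= 1 to avoid division by zero
    private XMLStreamReader reader;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        FileSplit fileSplit = (FileSplit) split;
        totalLength = Math.max(1L, fileSplit.getLength());
        FileSystem fs = fileSplit.getPath().getFileSystem(context.getConfiguration());
        // Wrap the HDFS stream so that every read updates bytesRead
        InputStream in = new PublishingInputStream(fs.open(fileSplit.getPath()),
                total -> bytesRead = total);
        try {
            reader = XMLInputFactory.newInstance().createXMLStreamReader(in);
        } catch (XMLStreamException e) {
            throw new IOException("Could not create XMLStreamReader", e);
        }
    }

    @Override
    public float getProgress() {
        // Fraction of the split consumed so far, clamped to [0, 1]
        return Math.min(1.0f, (float) bytesRead / (float) totalLength);
    }

    // The XML parsing itself is unchanged and not shown here
    @Override public boolean nextKeyValue() { return false; }
    @Override public LongWritable getCurrentKey() { return null; }
    @Override public Text getCurrentValue() { return null; }
    @Override public void close() throws IOException { /* close reader and stream */ }
}
Hadoop polls getProgress() periodically, so no extra bookkeeping is needed beyond the listener updating the counter.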

Related

Get progress information during JAXB de-/serialization

Is there a way to register some progress monitor on JAXB Marshaller and Unmarshaller?
I would like to show some progress information in my GUI while data is de-/serialized.
I see that you can set an Unmarshaller.Listener and a Marshaller.Listener, which have a "before" and "after" method. Nevertheless, I do not see any straightforward way to get the total number of elements to serialize.
I would need that obviously to calculate some "percentage done" info.
Is it ok to parse before unmarshalling?
If so, assuming you have a list of objects, you could do something like...
final String tagName = *** name of tag you are counting ***;
InputStream in = *** stream of your xml ***;
SAXParserFactory spf = SAXParserFactory.newInstance();
SAXParser saxParser = spf.newSAXParser();
final AtomicInteger counter = new AtomicInteger();
saxParser.parse(in, new DefaultHandler() {
@Override
public void startElement (String uri, String localName, String qName, Attributes attributes) {
if (tagName.equals(localName) || tagName.equals(qName))
counter.incrementAndGet();
}
});
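If pre-parsing like that is acceptable, you could then feed the count into an Unmarshaller.Listener and report a percentage as elements are unmarshalled. A rough sketch, where MyRoot and MyItem are placeholder names for your JAXB root class and the class bound to tagName, and freshInputStream is a newly opened stream (the one used for counting has been consumed):
final int total = counter.get();
final AtomicInteger done = new AtomicInteger();
Unmarshaller unmarshaller = JAXBContext.newInstance(MyRoot.class).createUnmarshaller();
unmarshaller.setListener(new Unmarshaller.Listener() {
    @Override
    public void afterUnmarshal(Object target, Object parent) {
        if (target instanceof MyItem) { // the type that corresponds to tagName
            int percent = done.incrementAndGet() * 100 / Math.max(1, total);
            System.out.println("Unmarshalled " + percent + "%");
        }
    }
});
MyRoot root = (MyRoot) unmarshaller.unmarshal(freshInputStream);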
Would a more low-level approach that leverages the InputStream be an acceptable solution?
E.g.
import java.io.IOException;
import java.io.InputStream;
import java.util.function.DoubleConsumer;
public class InputStreamWithProgressDecorator extends InputStream {
/** Input stream to be decorated */ private final InputStream inputStream;
/** Amount of byte read */ private long position = 0L;
/** File size */ private final long length;
/** Mark (position at which mark() was called) */ private long mark = 0L;
/** Consumer of the progress */ private final DoubleConsumer callBack;
public InputStreamWithProgressDecorator(final InputStream is, final long l, final DoubleConsumer cb) {
inputStream = is;
length = l;
callBack = cb;
}
private void setPosition(final long fp) {
position = fp;
callBack.accept(getProgress());
}
public double getProgress() {
return length == 0L ? 100d : ((double) position) * 100d / ((double) length);
}
public long getPosition() {
return position;
}
@Override
public int read(byte[] b) throws IOException {
final int rc = inputStream.read(b);
if (rc > 0) setPosition(position + rc); // -1 means end of stream, do not move backwards
return rc;
}
@Override
public int read(byte[] b, int off, int len) throws IOException {
final int rc = inputStream.read(b, off, len);
if (rc > 0) setPosition(position + rc);
return rc;
}
@Override
public byte[] readAllBytes() throws IOException {
final byte[] result = inputStream.readAllBytes();
setPosition(position + result.length);
return result;
}
@Override
public byte[] readNBytes(int len) throws IOException {
final byte[] result = inputStream.readNBytes(len);
setPosition(position + result.length);
return result;
}
@Override
public int readNBytes(byte[] b, int off, int len) throws IOException {
final int rc = inputStream.readNBytes(b, off, len);
setPosition(position + rc);
return rc;
}
@Override
public long skip(long n) throws IOException {
final long rc = inputStream.skip(n);
setPosition(position + rc);
return rc;
}
@Override
public int available() throws IOException {
return inputStream.available();
}
@Override
public void close() throws IOException {
inputStream.close();
}
@Override
public synchronized void mark(int readlimit) {
inputStream.mark(readlimit);
mark = position; // remember where the mark was set, not the read limit
}
@Override
public synchronized void reset() throws IOException {
inputStream.reset();
setPosition(mark);
}
@Override
public boolean markSupported() {
return inputStream.markSupported();
}
@Override
public int read() throws IOException {
final int c = inputStream.read();
if (c != -1) setPosition(position + 1);
return c;
}
}
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.function.DoubleConsumer;
public class Demo1 {
public static void main(String[] args) throws IOException {
final File file = new File(args[0]);
final DoubleConsumer callBack = p -> System.out.printf("%.0f%%\n", p);
try (final FileInputStream fis = new FileInputStream(file); final InputStreamWithProgressDecorator is = new InputStreamWithProgressDecorator(fis, file.length(), callBack)) {
// Simulating JAXB unmarshaller reads
byte[] buffer = is.readNBytes(1024);
while (buffer.length != 0) buffer = is.readNBytes(1024);
}
}
}
Or, if you have a FileInputStream, here is a separate-thread approach:
public class FileInputStreamReadProgressThread extends Thread implements UncaughtExceptionHandler {
/** Input stream */ private final FileInputStream fileInputStream;
/** File size */ private final long length;
/** Read progress in percents */ private double progress = 0d;
/** Exception from thread */ private Throwable exception = null;
/** Consumer of the progress */ private final DoubleConsumer callBack;
public FileInputStreamReadProgressThread(final FileInputStream fis, final long l, final DoubleConsumer cb) {
fileInputStream = fis;
length = l;
callBack = cb;
setUncaughtExceptionHandler(this);
setName(getClass().getSimpleName());
}
public double getProgress() { return progress; }
public Throwable getException() { return exception; }
@Override public void uncaughtException(final Thread t, final Throwable e) { exception = e; }
@Override
public void run() {
try {
long position = -1L;
final FileChannel channel = fileInputStream.getChannel();
while (!isInterrupted() && channel.isOpen() && position < length) {
position = channel.position();
progress = length == 0L ? 100d : ((double)position) * 100d / ((double)length);
callBack.accept(progress);
sleep(100L);
}
} catch (final IOException e) {
exception = e;
} catch (final InterruptedException e) {
// Do nothing
}
}
}
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.channels.Channels;
import java.util.function.DoubleConsumer;
public class Demo2 {
public static void main(String[] args) throws IOException {
final File file = new File(args[0]);
final DoubleConsumer callBack = p -> System.out.printf("%.0f%%\n", p);
try (final FileInputStream fis = new FileInputStream(file); final InputStream is = Channels.newInputStream(fis.getChannel())) {
final FileInputStreamReadProgressThread readProgressThread = new FileInputStreamReadProgressThread(fis, file.length(), callBack);
readProgressThread.start();
// Simulating JAXB unmarshaller reads
is.readAllBytes();
}
}
}
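To tie this back to JAXB, the decorated stream can be handed straight to the unmarshaller. A brief sketch along the lines of the demos above, where MyRoot is a placeholder for a JAXB-annotated root class:
import java.io.File;
import java.io.FileInputStream;
import javax.xml.bind.JAXBContext;

public class Demo3 {
    public static void main(String[] args) throws Exception {
        final File file = new File(args[0]);
        final JAXBContext context = JAXBContext.newInstance(MyRoot.class);
        try (FileInputStream fis = new FileInputStream(file);
             InputStreamWithProgressDecorator is = new InputStreamWithProgressDecorator(
                     fis, file.length(), p -> System.out.printf("%.0f%%\n", p))) {
            MyRoot root = (MyRoot) context.createUnmarshaller().unmarshal(is);
            System.out.println("Done: " + root);
        }
    }
}
How smooth the reported progress is depends on how the underlying parser buffers its reads.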

Override java.io.FileOutputStream method

I'm not sure if that's the right way to ask this, but I'm gonna try to explain my case and what I need.
I have a big Java project that uploads files in many different Java classes, far too many, and I have around 7 different main folders where the files are uploaded. The files are currently saved inside the webapp context, and I need to save them outside of the context.
If there were only a few classes uploading these files I could spend a few days changing every class and directing it to a path outside of the context, but there are way too many classes, so I have to figure out a way to do it without changing every class, or ideally any class at all.
Every upload is done in the following way:
I get the real path of one of my main folders:
String realpath = httpServletRequest.getSession()
.getServletContext()
.getRealPath("/mainfolder1/mainsubfolder1/");
Then I get the file and set a custom file name:
FormFile file = myForm.getFile();
String contentType = file.getContentType();
String fileName = file.getFileName();
int fileSize = file.getFileSize();
customFileName = "anyName" + fileName.substring(fileName.lastIndexOf("."));
Then I validate and save the file:
if (fileSize > 0 && contentType != null && fileName.length() > 0){
InputStream in = file.getInputStream();
OutputStream bos = new FileOutputStream(realpath + "/" + customFileName);
int byteRead = 0;
byte[] buffer = new byte[8192];
while ((byteRead = in.read(buffer, 0, 8192)) != -1){
bos.write(buffer, 0, byteRead);
}
bos.close();
in.close();
}
A very simple way to save my files, and as you can see, they are saved inside the context.
So if I could somehow override java.io.FileOutputStream, to not only save it inside context, but to make a copy outside of context too, that would be great, like save it in the specified path and also on some other path outside of context.
But I don't know if this is possible, or how to reproduce this behaviour.
What I need is to keep the class code exactly as it is but write the file 2 times:
First here: "/insideContext/mainfolder1/mainsubfolder1/"
Then here: "/outsideContext/mainfolder1/mainsubfolder1/"
Is this possible? If not, what would be the best way to accomplish this?
I'd refactor and use the Decorator or Wrapper pattern. More about it here.
Below is a simple idea you could use:
public class ContextAwareDuplicatorOutputStream extends OutputStream {
FileOutputStream insideContext;
FileOutputStream outsideContext;
public ContextAwareDuplicatorOutputStream(String insideContextPath,
String outsideContextPath, String fileName)
throws FileNotFoundException {
insideContext = new FileOutputStream(insideContextPath
+ File.separator + fileName);
outsideContext = new FileOutputStream(outsideContextPath
+ File.separator + fileName);
}
@Override
public void close() throws IOException {
insideContext.close();
outsideContext.close();
}
@Override
public void flush() throws IOException {
insideContext.flush();
outsideContext.flush();
}
@Override
public void write(byte[] b) throws IOException {
insideContext.write(b);
outsideContext.write(b);
}
@Override
public void write(byte[] b, int off, int len) throws IOException {
insideContext.write(b, off, len);
outsideContext.write(b, off, len);
}
@Override
public void write(int b) throws IOException {
insideContext.write(b);
outsideContext.write(b);
}
}
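Usage would then mirror the original saving code, with the duplicating stream taking the place of the plain FileOutputStream; outsidePath below is a placeholder for wherever the copy should go:
// realpath points inside the context, outsidePath is the external target folder
String outsidePath = "/outsideContext/mainfolder1/mainsubfolder1";
InputStream in = file.getInputStream();
OutputStream bos = new ContextAwareDuplicatorOutputStream(realpath, outsidePath, customFileName);
int byteRead;
byte[] buffer = new byte[8192];
while ((byteRead = in.read(buffer, 0, 8192)) != -1) {
    bos.write(buffer, 0, byteRead); // written to both destinations at once
}
bos.close();
in.close();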
Since you don't want to edit anything in your code, another option is to create a ServletContextListener that monitors the folders where you upload and, on each new-file event, copies the file to the proper directory. How to monitor a directory is answered here: Directory listener in Java.
Below is some small code, not really perfect, but the idea is there:
public class FileMonitorServletContextListener implements
ServletContextListener {
public interface FileMonitor {
void start(String fromFolder, String toFolder);
void stop();
}
public class SimpleThreadedWatcher implements FileMonitor {
private class SimpleThread extends Thread {
private boolean running = true;
private String fromFolder;
private String toFolder;
public SimpleThread(String fromFolder, String toFolder) {
this.fromFolder = fromFolder;
this.toFolder = toFolder;
}
private void copy(Path child, String toFolder) {
// Copy the file to the folder
}
@Override
public void run() {
try {
WatchService watcher = FileSystems.getDefault().newWatchService();
Path fromPath = Paths.get(fromFolder);
WatchKey key = fromPath.register(watcher,
StandardWatchEventKinds.ENTRY_CREATE);
while (running) {
for (WatchEvent<?> event : key.pollEvents()) {
// Context for a directory entry event is the file name of the entry
@SuppressWarnings("unchecked")
WatchEvent<Path> ev = (WatchEvent<Path>) event;
Path name = ev.context();
Path child = fromPath.resolve(name);
// print out event
System.out.format("%s: %s\n", event.kind().name(), child);
copy(child, toFolder);
}
if (!key.reset()) {
break;
}
Thread.sleep(1000);
}
} catch (Exception e) {
throw new RuntimeException("Error: ", e);
}
}
public void stopWorking() {
running = false;
}
}
private SimpleThread worker;
@Override
public void start(String fromFolder, String toFolder) {
worker = new SimpleThread(fromFolder, toFolder);
worker.start();
}
@Override
public void stop() {
worker.stopWorking();
}
}
FileMonitor fileMonitor = new SimpleThreadedWatcher();
@Override
public void contextDestroyed(ServletContextEvent arg0) {
fileMonitor.stop();
}
@Override
public void contextInitialized(ServletContextEvent arg0) {
fileMonitor.start("FROM", "TO");
}
}
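The copy(child, toFolder) helper is left empty in the sketch above; inside SimpleThread it could be something like this (it needs the java.nio.file imports Files, Path, Paths and StandardCopyOption; REPLACE_EXISTING is just one possible choice):
private void copy(Path child, String toFolder) {
    try {
        Path target = Paths.get(toFolder).resolve(child.getFileName());
        Files.createDirectories(target.getParent()); // make sure the destination folder exists
        Files.copy(child, target, StandardCopyOption.REPLACE_EXISTING);
    } catch (IOException e) {
        throw new RuntimeException("Could not copy " + child + " to " + toFolder, e);
    }
}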

PIG Custom loader's getNext() is being called again and again

I have started working with Apache Pig for one of our projects. I have to create a custom input format to load our data files. For this, I followed this example: Hadoop: Custom Input format. I also created my custom RecordReader implementation to read the data (we get our data in binary format from some other application) and parse it into proper JSON format.
The problem occurs when I use my custom loader in a Pig script. As soon as my loader's getNext() method is invoked, it calls my custom RecordReader's nextKeyValue() method, which works fine. It reads the data properly and passes it back to my loader, which parses the data and returns a Tuple. So far so good.
The problem arises when my loader's getNext() method is called again and again. It gets called, works fine, and returns the proper output (I debugged it up to the return statement). But then, instead of letting the execution go further, my loader gets called again. I tried to see the number of times my loader is called, and I could see the number go up to 20K!
Can somebody please help me understand the problem in my code?
Loader
public class SimpleTextLoaderCustomFormat extends LoadFunc {
protected RecordReader in = null;
private byte fieldDel = '\t';
private ArrayList<Object> mProtoTuple = null;
private TupleFactory mTupleFactory = TupleFactory.getInstance();
@Override
public Tuple getNext() throws IOException {
Tuple t = null;
try {
boolean notDone = in.nextKeyValue();
if (!notDone) {
return null;
}
String value = (String) in.getCurrentValue();
byte[] buf = value.getBytes();
int len = value.length();
int start = 0;
for (int i = 0; i < len; i++) {
if (buf[i] == fieldDel) {
readField(buf, start, i);
start = i + 1;
}
}
// pick up the last field
readField(buf, start, len);
t = mTupleFactory.newTupleNoCopy(mProtoTuple);
mProtoTuple = null;
} catch (InterruptedException e) {
int errCode = 6018;
String errMsg = "Error while reading input";
e.printStackTrace();
throw new ExecException(errMsg, errCode,
PigException.REMOTE_ENVIRONMENT, e);
}
return t;
}
private void readField(byte[] buf, int start, int end) {
if (mProtoTuple == null) {
mProtoTuple = new ArrayList<Object>();
}
if (start == end) {
// NULL value
mProtoTuple.add(null);
} else {
mProtoTuple.add(new DataByteArray(buf, start, end));
}
}
@Override
public InputFormat getInputFormat() throws IOException {
//return new TextInputFormat();
return new CustomStringInputFormat();
}
@Override
public void setLocation(String location, Job job) throws IOException {
FileInputFormat.setInputPaths(job, location);
}
@Override
public void prepareToRead(RecordReader reader, PigSplit split)
throws IOException {
in = reader;
}
}
Custom InputFormat
public class CustomStringInputFormat extends FileInputFormat<String, String> {
@Override
public RecordReader<String, String> createRecordReader(InputSplit arg0,
TaskAttemptContext arg1) throws IOException, InterruptedException {
return new CustomStringInputRecordReader();
}
}
Custom RecordReader
public class CustomStringInputRecordReader extends RecordReader<String, String> {
private String fileName = null;
private String data = null;
private Path file = null;
private Configuration jc = null;
private static int count = 0;
@Override
public void close() throws IOException {
// jc = null;
// file = null;
}
@Override
public String getCurrentKey() throws IOException, InterruptedException {
return fileName;
}
@Override
public String getCurrentValue() throws IOException, InterruptedException {
return data;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return 0;
}
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
throws IOException, InterruptedException {
FileSplit split = (FileSplit) genericSplit;
file = split.getPath();
jc = context.getConfiguration();
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
InputStream is = FileSystem.get(jc).open(file);
StringWriter writer = new StringWriter();
IOUtils.copy(is, writer, "UTF-8");
data = writer.toString();
fileName = file.getName();
writer.close();
is.close();
System.out.println("Count : " + ++count);
return true;
}
}
Try this in the Loader:
//....
boolean notDone = ((CustomStringInputRecordReader) in).nextKeyValue();
//...
Text value = new Text(((CustomStringInputRecordReader) in).getCurrentValue().toString());
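That cast alone will not stop the repeated calls, though. The loop most likely comes from nextKeyValue() in the record reader: it reads the whole file and unconditionally returns true, so Pig keeps asking for the next record forever. The usual pattern for whole-file record readers is to remember that the single record has already been delivered; here is a sketch of the record reader with that one change:
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class CustomStringInputRecordReader extends RecordReader<String, String> {
    private String fileName = null;
    private String data = null;
    private Path file = null;
    private Configuration jc = null;
    private boolean processed = false; // true once the single record has been returned

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        FileSplit split = (FileSplit) genericSplit;
        file = split.getPath();
        jc = context.getConfiguration();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (processed) {
            return false; // no more records in this split, so getNext() can return null
        }
        InputStream is = FileSystem.get(jc).open(file);
        StringWriter writer = new StringWriter();
        IOUtils.copy(is, writer, "UTF-8");
        data = writer.toString();
        fileName = file.getName();
        writer.close();
        is.close();
        processed = true;
        return true;
    }

    @Override public String getCurrentKey() { return fileName; }
    @Override public String getCurrentValue() { return data; }
    @Override public float getProgress() { return processed ? 1.0f : 0.0f; }
    @Override public void close() throws IOException { }
}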

Logging InputStream

I created an InputStream class that extends CipherInputStream. I want to log all the data from my InputStream (which I later use as input to a parser), so I did the following:
public class MyInputStream extends CipherInputStream {
private OutputStream logStream = new ByteArrayOutputStream();
.....
@Override
public int read() throws IOException {
int read = super.read();
logStream.write(read);
return read;
}
@Override
public int read(byte[] b, int off, int len) throws IOException {
int read = super.read(b, off, len);
if (read > 0) {
logStream.write(b, off, read);
}
return read;
}
@Override
public int read(byte[] buffer) throws IOException {
int read = super.read(buffer);
if (read()>0) {
logStream.write(buffer);
}
return read;
}
@Override
public void close() throws IOException {
log();
super.close();
}
public void log() {
String logStr = new String(((ByteArrayOutputStream) logStream).toByteArray(), Charset.defaultCharset());
Log.d(getClass(), logStr);
try {
logStream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
Actually, my stream contains something like this:
<response>
<result>0</result>
</response>
but the log shows a mutation like this:
<<response>
<resultt >0</resullt>
</respoonse>
[and (?) symbol at the end]
Thanks for any help!
You can combine TeeInputStream and Logger.stream():
new TeeInputStream(
yourStream,
Logger.stream(Level.INFO, this)
);
If you want to see the log in logcat, try Log.i(String tag, String message) or System.out.println(). Both of them work. You can also use Log.d, Log.w and Log.e.
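For what it is worth, the mutation in the log is consistent with the overridden read methods themselves: read() also logs the -1 returned at end of stream (the trailing (?) symbol), and read(byte[]) calls read() again inside its if check and then writes the whole buffer instead of only the bytes that were actually read (the doubled characters). A corrected version of the three overrides might look like this:
@Override
public int read() throws IOException {
    int read = super.read();
    if (read != -1) { // do not log the end-of-stream marker
        logStream.write(read);
    }
    return read;
}
@Override
public int read(byte[] b, int off, int len) throws IOException {
    int read = super.read(b, off, len);
    if (read > 0) {
        logStream.write(b, off, read);
    }
    return read;
}
@Override
public int read(byte[] buffer) throws IOException {
    int read = super.read(buffer);
    if (read > 0) { // check the count, not another read() call
        logStream.write(buffer, 0, read); // only the bytes that were actually read
    }
    return read;
}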

Record size of objects as they're being serialized?

What's the best way to record the size of certain objects as they are being serialized? For example, once objects of type A, B, C are serialized, record the size of their serialized bytes. We can get the size of the entire object graph via getBytes, but we'd like to break it down to see which objects are the largest contributors to the overall serialized size.
ObjectOutputStream offers writeObjectOverride, but we don't want to rewrite the serialization process. In simplified terms, we need to be aware of when we encounter a certain object prior to serialization, record the total current byte count, and then after it's serialized, take the difference of byte counts. It seems like encompassing writeSerialData would work, but the method is private.
Ideas?
Thanks.
--- UPDATE ---
The answers/suggestions below are insightful. Below is what I have so far. Let me know your thoughts. Thanks.
// extend to get a handle on the underlying OutputStream
public class MyObjectOutputStream extends ObjectOutputStream {
private OutputStream out;
public MyObjectOutputStream(OutputStream out) throws IOException {
super(out);
this.out = out;
}
public OutputStream getOut() {
return this.out;
}
}
// counter
public static class CounterOutputStream extends FilterOutputStream {
private int bytesWritten = 0;
...
public int getBytesWritten() {
return this.bytesWritten;
}
public void resetCounter() {
bytesWritten = 0;
}
private void update(int len) {
bytesWritten += len;
}
}
// go serialize
ByteArrayOutputStream out = new ByteArrayOutputStream();
ObjectOutputStream oos = new MyObjectOutputStream(new CounterOutputStream(out, 1024));
// record serialized size of this class; do this for every interested class
public class MyInterestingObject {
...
private void writeObject(ObjectOutputStream out) throws IOException {
CounterOutputStream counter = null;
if (out instanceof MyObjectOutputStream) {
counter = (CounterOutputStream)((MyObjectOutputStream)out).getOut();
counter.resetCounter();
}
// continue w/ standard serialization of this object
out.defaultWriteObject();
if (counter != null) {
logger.info(this.getClass() + " bytes written: " + counter.getBytesWritten());
// TODO: store in context or somewhere to be aggregated post-serialization
}
}
}
The simplest solution would be to wrap the OutputStream you're using with an implementation that will count bytes written.
import java.io.IOException;
import java.io.OutputStream;
public class CountingOutputStream extends OutputStream {
private int count;
private OutputStream out;
public CountingOutputStream(OutputStream out) {
this.out = out;
}
public void write(byte[] b) throws IOException {
out.write(b);
count += b.length;
}
public void write(byte[] b, int off, int len) throws IOException {
out.write(b, off, len);
count += len;
}
public void flush() throws IOException {
out.flush();
}
public void close() throws IOException {
out.close();
}
public void write(int b) throws IOException {
out.write(b);
count++;
}
public int getBytesWritten() {
return count;
}
}
Then you would just use that
CountingOutputStream s = new CountingOutputStream(out);
ObjectOutputStream o = new ObjectOutputStream(s);
o.writeObject("some serializable object");
o.close();
// s.getBytesWritten()
You could implement Externalizable rather than Serializable on any objects you need to capture such data from. You could then implement field-by-field byte counting in the writeExternal method, maybe by handing off to a utility class. Something like
public void writeExternal(ObjectOutput out) throws IOException
{
super.writeExternal(out);
out.writeUTF(this.myString == null ? "" : this.myString);
ByteCounter.getInstance().log("MyClass", "myString", this.myString);
}
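ByteCounter is not an existing library class; a minimal sketch of such a (hypothetical) utility that just accumulates an approximate byte count per class/field key could look like this:
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public final class ByteCounter {
    private static final ByteCounter INSTANCE = new ByteCounter();
    private final Map<String, Long> counts = new ConcurrentHashMap<>();

    private ByteCounter() { }

    public static ByteCounter getInstance() {
        return INSTANCE;
    }

    public void log(String className, String fieldName, String value) {
        // Approximation: writeUTF actually adds a 2-byte length prefix and uses modified UTF-8
        long size = value == null ? 0L : value.getBytes(StandardCharsets.UTF_8).length;
        counts.merge(className + "." + fieldName, size, Long::sum);
    }

    public Map<String, Long> snapshot() {
        return new HashMap<>(counts);
    }
}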
Another hackish way would be to stick with Serializable, but to use the readResolve or writeReplace hooks to capture whatever data you need, e.g.
public class Test implements Serializable
{
private String s;
public Test(String s)
{
this.s = s;
}
private Object readResolve()
{
System.err.format("%s,%s,%s,%d\n", "readResolve", "Test", "s", s.length());
return this;
}
private Object writeReplace()
{
System.err.format("%s,%s,%s,%d\n", "writeReplace", "Test", "s", s.length());
return this;
}
public static void main(String[] args) throws Exception
{
File tmp = File.createTempFile("foo", "tmp");
ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(tmp));
Test test = new Test("hello world");
out.writeObject(test);
out.close();
ObjectInputStream in = new ObjectInputStream(new FileInputStream(tmp));
test = (Test)in.readObject();
in.close();
}
}
