I'm trying to use the DynamoDB Parallel Scan Example:
http://docs.aws.amazon.com/amazondynamodb/latest/developerguide/LowLevelJavaScanning.html
I have 200,000 items, and I've taken the sequential scan code and modified it slightly for my usage:
Map<String, AttributeValue> lastKeyEvaluated = null;
do
{
ScanRequest scanRequest = new ScanRequest()
.withTableName(tableName)
.withExclusiveStartKey(lastKeyEvaluated);
ScanResult result = client.scan(scanRequest);
double counter = 0;
for(Map<String, AttributeValue> item : result.getItems())
{
itemSerialize.add("Set:"+counter);
for (Map.Entry<String, AttributeValue> getItem : item.entrySet())
{
String attributeName = getItem.getKey();
AttributeValue value = getItem.getValue();
itemSerialize.add(attributeName
+ (value.getS() == null ? "" : ":" + value.getS())
+ (value.getN() == null ? "" : ":" + value.getN())
+ (value.getB() == null ? "" : ":" + value.getB())
+ (value.getSS() == null ? "" : ":" + value.getSS())
+ (value.getNS() == null ? "" : ":" + value.getNS())
+ (value.getBS() == null ? "" : ":" + value.getBS()));
}
counter += 1;
}
lastKeyEvaluated = result.getLastEvaluatedKey();
}
while(lastKeyEvaluated != null);
The counter gives exactly 200,000 when this code has finished. However, I also wanted to try the parallel scan.
Function Call:
ScanSegmentTask task = null;
ArrayList<String> list = new ArrayList<String>();
try
{
ExecutorService executor = Executors.newFixedThreadPool(numberOfThreads);
int totalSegments = numberOfThreads;
for (int segment = 0; segment < totalSegments; segment++)
{
// Runnable task that will only scan one segment
task = new ScanSegmentTask(tableName, itemLimit, totalSegments, segment, list);
// Execute the task
executor.execute(task);
}
shutDownExecutorService(executor);
}
....... catch block that handles any errors
return list;
Class:
I have a static list through which the data is shared with all the threads. I was able to retrieve the list and output the amount of data.
// Runnable task for scanning a single segment of a DynamoDB table
private static class ScanSegmentTask implements Runnable
{
// DynamoDB table to scan
private String tableName;
// number of items each scan request should return
private int itemLimit;
// Total number of segments
// Equals to total number of threads scanning the table in parallel
private int totalSegments;
// Segment that will be scanned by this task
private int segment;
static ArrayList<String> list_2;
Object lock = new Object();
public ScanSegmentTask(String tableName, int itemLimit, int totalSegments, int segment, ArrayList<String> list)
{
this.tableName = tableName;
this.itemLimit = itemLimit;
this.totalSegments = totalSegments;
this.segment = segment;
list_2 = list;
}
public void run()
{
System.out.println("Scanning " + tableName + " segment " + segment + " out of " + totalSegments + " segments " + itemLimit + " items at a time...");
Map<String, AttributeValue> exclusiveStartKey = null;
int totalScannedItemCount = 0;
int totalScanRequestCount = 0;
int counter = 0;
try
{
while(true)
{
ScanRequest scanRequest = new ScanRequest()
.withTableName(tableName)
.withLimit(itemLimit)
.withExclusiveStartKey(exclusiveStartKey)
.withTotalSegments(totalSegments)
.withSegment(segment);
ScanResult result = client.scan(scanRequest);
totalScanRequestCount++;
totalScannedItemCount += result.getScannedCount();
synchronized(lock)
{
for(Map<String, AttributeValue> item : result.getItems())
{
list_2.add("Set:"+counter);
for (Map.Entry<String, AttributeValue> getItem : item.entrySet())
{
String attributeName = getItem.getKey();
AttributeValue value = getItem.getValue();
list_2.add(attributeName
+ (value.getS() == null ? "" : ":" + value.getS())
+ (value.getN() == null ? "" : ":" + value.getN())
+ (value.getB() == null ? "" : ":" + value.getB())
+ (value.getSS() == null ? "" : ":" + value.getSS())
+ (value.getNS() == null ? "" : ":" + value.getNS())
+ (value.getBS() == null ? "" : ":" + value.getBS()));
}
counter += 1;
}
}
exclusiveStartKey = result.getLastEvaluatedKey();
if (exclusiveStartKey == null)
{
break;
}
}
}
catch (AmazonServiceException ase)
{
System.err.println(ase.getMessage());
}
finally
{
System.out.println("Scanned " + totalScannedItemCount + " items from segment " + segment + " out of " + totalSegments + " of " + tableName + " with " + totalScanRequestCount + " scan requests");
}
}
}
Executor Service Shut Down:
public static void shutDownExecutorService(ExecutorService executor)
{
executor.shutdown();
try
{
if (!executor.awaitTermination(10, TimeUnit.SECONDS))
{
executor.shutdownNow();
}
}
catch (InterruptedException e)
{
executor.shutdownNow();
Thread.currentThread().interrupt();
}
}
However, the number of items changes every time I run this piece of code (it varies around 60,000 in total, about 6,000 per thread, with 10 threads created). Removing the synchronization does not change the result either.
Is there a bug with the synchronization or with the Amazon AWS API?
Thanks All
EDIT:
The new function call:
ScanSegmentTask task = null;
ArrayList<String> list = new ArrayList<String>();
try
{
ExecutorService executor = Executors.newFixedThreadPool(numberOfThreads);
int totalSegments = numberOfThreads;
for (int segment = 0; segment < totalSegments; segment++)
{
// Callable task that will only scan one segment
task = new ScanSegmentTask(tableName, itemLimit, totalSegments, segment);
// Execute the task
Future<ArrayList<String>> future = executor.submit(task);
list.addAll(future.get());
}
shutDownExecutorService(executor);
}
The new class:
// Callable task for scanning a single segment of a DynamoDB table
private static class ScanSegmentTask implements Callable<ArrayList<String>>
{
// DynamoDB table to scan
private String tableName;
// number of items each scan request should return
private int itemLimit;
// Total number of segments
// Equals to total number of threads scanning the table in parallel
private int totalSegments;
// Segment that will be scanned by this task
private int segment;
ArrayList<String> list_2 = new ArrayList<String>();
static int counter = 0;
public ScanSegmentTask(String tableName, int itemLimit, int totalSegments, int segment)
{
this.tableName = tableName;
this.itemLimit = itemLimit;
this.totalSegments = totalSegments;
this.segment = segment;
}
@SuppressWarnings("finally")
public ArrayList<String> call()
{
System.out.println("Scanning " + tableName + " segment " + segment + " out of " + totalSegments + " segments " + itemLimit + " items at a time...");
Map<String, AttributeValue> exclusiveStartKey = null;
try
{
while(true)
{
ScanRequest scanRequest = new ScanRequest()
.withTableName(tableName)
.withLimit(itemLimit)
.withExclusiveStartKey(exclusiveStartKey)
.withTotalSegments(totalSegments)
.withSegment(segment);
ScanResult result = client.scan(scanRequest);
for(Map<String, AttributeValue> item : result.getItems())
{
list_2.add("Set:"+counter);
for (Map.Entry<String, AttributeValue> getItem : item.entrySet())
{
String attributeName = getItem.getKey();
AttributeValue value = getItem.getValue();
list_2.add(attributeName
+ (value.getS() == null ? "" : ":" + value.getS())
+ (value.getN() == null ? "" : ":" + value.getN())
+ (value.getB() == null ? "" : ":" + value.getB())
+ (value.getSS() == null ? "" : ":" + value.getSS())
+ (value.getNS() == null ? "" : ":" + value.getNS())
+ (value.getBS() == null ? "" : ":" + value.getBS()));
}
counter += 1;
}
exclusiveStartKey = result.getLastEvaluatedKey();
if (exclusiveStartKey == null)
{
break;
}
}
}
catch (AmazonServiceException ase)
{
System.err.println(ase.getMessage());
}
finally
{
return list_2;
}
}
}
Final EDIT:
Function Call:
ScanSegmentTask task = null;
ArrayList<String> list = new ArrayList<String>();
ArrayList<Future<ArrayList<String>>> holdFuture = new ArrayList<Future<ArrayList<String>>>();
try
{
ExecutorService executor = Executors.newFixedThreadPool(numberOfThreads);
int totalSegments = numberOfThreads;
for (int segment = 0; segment < totalSegments; segment++)
{
// Callable task that will only scan one segment
task = new ScanSegmentTask(tableName, itemLimit, totalSegments, segment);
// Execute the task
Future<ArrayList<String>> future = executor.submit(task);
holdFuture.add(future);
}
for (int i = 0 ; i < holdFuture.size(); i++)
{
boolean flag = false;
while(flag == false)
{
Thread.sleep(1000);
if(holdFuture.get(i).isDone())
{
list.addAll(holdFuture.get(i).get());
flag = true;
}
}
}
shutDownExecutorService(executor);
}
Class:
private static class ScanSegmentTask implements Callable<ArrayList<String>>
{
// DynamoDB table to scan
private String tableName;
// number of items each scan request should return
private int itemLimit;
// Total number of segments
// Equals to total number of threads scanning the table in parallel
private int totalSegments;
// Segment that will be scanned by this task
private int segment;
ArrayList<String> list_2 = new ArrayList<String>();
static AtomicInteger counter = new AtomicInteger(0);
public ScanSegmentTask(String tableName, int itemLimit, int totalSegments, int segment)
{
this.tableName = tableName;
this.itemLimit = itemLimit;
this.totalSegments = totalSegments;
this.segment = segment;
}
@SuppressWarnings("finally")
public ArrayList<String> call()
{
System.out.println("Scanning " + tableName + " segment " + segment + " out of " + totalSegments + " segments " + itemLimit + " items at a time...");
Map<String, AttributeValue> exclusiveStartKey = null;
try
{
while(true)
{
ScanRequest scanRequest = new ScanRequest()
.withTableName(tableName)
.withLimit(itemLimit)
.withExclusiveStartKey(exclusiveStartKey)
.withTotalSegments(totalSegments)
.withSegment(segment);
ScanResult result = client.scan(scanRequest);
for(Map<String, AttributeValue> item : result.getItems())
{
list_2.add("Set:"+counter);
for (Map.Entry<String, AttributeValue> getItem : item.entrySet())
{
String attributeName = getItem.getKey();
AttributeValue value = getItem.getValue();
list_2.add(attributeName
+ (value.getS() == null ? "" : ":" + value.getS())
+ (value.getN() == null ? "" : ":" + value.getN())
+ (value.getB() == null ? "" : ":" + value.getB())
+ (value.getSS() == null ? "" : ":" + value.getSS())
+ (value.getNS() == null ? "" : ":" + value.getNS())
+ (value.getBS() == null ? "" : ":" + value.getBS()));
}
counter.addAndGet(1);
}
exclusiveStartKey = result.getLastEvaluatedKey();
if (exclusiveStartKey == null)
{
break;
}
}
}
catch (AmazonServiceException ase)
{
System.err.println(ase.getMessage());
}
finally
{
return list_2;
}
}
}
OK, I believe the issue is in the way you synchronized.
In your case, your lock is pretty much pointless, as each thread has its own lock, and so synchronizing never actually blocks one thread from running the same piece of code. I believe that this is the reason that removing synchronization does not change the result -- because it never would have had an effect in the first place.
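To make that concrete, here is a minimal sketch of the difference (illustrative names, not your exact class):
// Instance lock: every task constructs its OWN monitor, so the blocks never contend.
class PerInstanceLockTask implements Runnable {
    private final Object lock = new Object(); // one lock per task
    public void run() {
        synchronized (lock) { /* other tasks are NOT excluded here */ }
    }
}
// Shared lock: a single monitor for all tasks actually provides mutual exclusion.
class SharedLockTask implements Runnable {
    private static final Object LOCK = new Object(); // one lock for all tasks
    public void run() {
        synchronized (LOCK) { /* tasks take turns here */ }
    }
}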
I believe your issue is in fact due to the static ArrayList<String> that's shared by your threads. ArrayList is not thread-safe, so operations on it are not guaranteed to succeed, and you have to synchronize access to it. Without proper synchronization, two threads can each add an element to an empty ArrayList and still leave it with a size of 1, because both threads can race on the list's internal size field.
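If you want to see this for yourself, here is a small throwaway demo (mine, not your code); the final size is nondeterministic and usually comes up short:
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class RacyListDemo {
    public static void main(String[] args) throws InterruptedException {
        final List<String> unsafe = new ArrayList<String>(); // NOT thread-safe
        ExecutorService pool = Executors.newFixedThreadPool(4);
        for (int t = 0; t < 4; t++) {
            pool.submit(new Runnable() {
                public void run() {
                    for (int i = 0; i < 10000; i++) unsafe.add("x");
                }
            });
        }
        pool.shutdown();
        pool.awaitTermination(10, TimeUnit.SECONDS);
        // Expected 40000; typically prints less (or throws ArrayIndexOutOfBoundsException).
        System.out.println(unsafe.size());
    }
}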
As I said before, while you do have a synchronized block, it really isn't doing anything. You could synchronize on list_2, but all that would do is effectively make all your threads run in sequence, as the lock on the ArrayList wouldn't be released until one of your threads was done.
There are a few solutions to this. You can use Collections.synchronizedList(list_2) to create a synchronized wrapper around your ArrayList. This way, adding to the list is guaranteed to succeed. However, this induces a synchronization cost per operation, and so isn't ideal.
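For example (a minimal sketch):
List<String> safeList = Collections.synchronizedList(new ArrayList<String>());
safeList.add("Set:0"); // each call now acquires the wrapper's internal lock
// Note: iterating over safeList still requires manual synchronization on it.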
What I would do is actually have ScanSegmentTask implement Callable (technically Callable<ArrayList<String>>). The Callable interface is almost exactly like the Runnable interface, except its method is call(), which returns a value.
Why is this important? I think that what would produce the best results for you is this:
Make list_2 an instance variable, initialized to a blank list
Have each thread add to this list exactly as you have done
Return list_2 when you are done
Concatenate each resulting ArrayList<String> to the original ArrayList using addAll()
This way, you have no synchronization overhead to deal with!
This will require a few changes to your executor code. Instead of calling execute(), you'll need to call submit(). This returns a Future object (Future<ArrayList<String>> in your case) that holds the results of the call() method. You'll need to store this into some collection -- an array, ArrayList, doesn't matter.
To retrieve the results, simply loop through the collection of Future objects and call get() (I think). This call will block until the thread that the Future object corresponds to is complete.
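Putting those steps together, a minimal sketch of the whole pattern might look like this (it reuses your names, so treat it as an outline rather than drop-in code):
ExecutorService executor = Executors.newFixedThreadPool(totalSegments);
List<Future<ArrayList<String>>> futures = new ArrayList<Future<ArrayList<String>>>();
for (int segment = 0; segment < totalSegments; segment++) {
    // submit() (unlike execute()) accepts a Callable and returns a Future
    futures.add(executor.submit(new ScanSegmentTask(tableName, itemLimit, totalSegments, segment)));
}
ArrayList<String> merged = new ArrayList<String>();
for (Future<ArrayList<String>> future : futures) {
    merged.addAll(future.get()); // blocks until that segment's task completes
}
executor.shutdown();
// (future.get() throws InterruptedException and ExecutionException; handle or declare them.)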
I think that's it. While this is more complicated, I think this is the best performance you're going to get, as with enough threads either CPU contention or your network link will become the bottleneck. Please ask if you have any questions, and I'll update as needed.
Related
My problem is in the part where I'm doing the if/else conditions: when I call the function that performs the comparisons, decides whether the test passed or not, and sends some information, I'm receiving null.
The problem areas are between the asterisks. I'd appreciate any help.
This is my code:
public static void fxSpot_GBP_JPY(TradeData data, TradeData output) throws Exception {
if (data == null) {
fail("The input data object was not correctly filled");
}
if (output == null) {
fail("The output data object was not correctly filled");
}
//Used to set the comment, the status and update to JIRA
FieldsJSON fields = new FieldsJSON();
String assertionError = "";
List<String> inputs = new ArrayList<String>();
List<String> outputs = new ArrayList<String>();
String newDate = Utils.formatTimeZoneMinute(data.getTradeDate());
String asOfTrade = Utils.formatTimeZoneMinute(data.getAsOfTradeDate());
String executionDate = Utils.formatTimeZoneMinute(output.getExecutionDateTime());
try {
//Add the data in the list
inputs.add(data.getTransactionNumber()); outputs.add(output.getBloombergId());
inputs.add(newDate); outputs.add(output.getTradeDate());
inputs.add(asOfTrade); outputs.add(executionDate);
inputs.add(data.getSettlementDate()); outputs.add(output.getValueDate());
inputs.add(data.getTradeAmount()); outputs.add(output.getAmount2());
inputs.add(data.getCustomerAccountCounterparty()); outputs.add(output.getMiPartyId());
inputs.add(data.getPrincipalLoanAmount()); outputs.add(output.getAmount());
inputs.add(data.getSecurityPrice()); outputs.add(output.getRate());
inputs.add(data.getISOCodeOf1stCurrency()); outputs.add("BRL");//output.getCurrency2()
inputs.add(data.getISOCodeOf2ndCurrency()); outputs.add(output.getCurrency1());
//Compare values
System.out.println("-------------------");
int y = 0;
int x = 0;
for(String input : inputs) {
for(String out : outputs) {
if(y == x) {
if(input.equals(out)) {
WriterCSV.setOk("Ok");
**String comment = input + " = " + out;
fields.setComment(comment);
fields.setStatus("PASS");**
System.out.println("ok - " + input + " = " + out);
}else {
WriterCSV.setOk("not Ok");
**String comment = input + " = " + out;
fields.setComment(comment);
fields.setStatus("FAIL");**
System.out.println("not Ok - " + input + " = " + out);
}
}
x = x+1; // count of the list of output
}
y = y+1; // count of the list of inputs
x = 0; // reset to 0 the count of outputs
}
// evidence with the name and value of fields compared
WriterCSV.reportSpot_CSV(data,output);
}
Here is my test:
@Test
@Tag("compare")
public void CompareSpot() throws Exception {
//Create a list to read the CSVfile
List<DTOTradeData> dto;
//Used to get the TradeData of list dto.
DTOTradeData dtd = new DTOTradeData();
// Read a csvFile and return a list with the values to new xml
dto = CSVReader.readCSV("spot.csv");
//The xpath of xml
FileDriverSpot spot = new FileDriverSpot();
FileDriver output = new FileDriverSpotOutput();
FieldsJSON fields = new FieldsJSON();
//new xml = dataInput and the outputFile = dataOutput
TradeData dataInput = new TradeData();
TradeData dataOutput = new TradeData();
for (int i = 0; i < dto.size(); i++) {
dtd = dto.get(i); // get TradeData
dtd.getTradeData().setDriver(spot); // set the driver
if (fileExist(Setup.xmlPath + dtd.getInputFile() + ".xml")) {
dataInput = Reader.read(spot, Setup.xmlPath + dtd.getInputFile() + ".xml");
dataOutput = Reader.read(output, Setup.spotPath + dtd.getOutputFile());
try {
// make the comparison
**FunctionalTest.fxSpot_GBP_JPY(dataInput, dataOutput);**
}
catch(AssertionError e) {
String comment = e.toString();
fields.setComment(comment);
}
} else {
fail("The file: " + dtd.getTemplateFile()
+ " needs to go through the writing process before being compared.");
}
//Convert the file to base64
String inputData = UpdateTestStatus.convertToBase64(Setup.xmlPath + dtd.getInputFile() + ".xml");
String outputData = UpdateTestStatus.convertToBase64(Setup.spotPath + dtd.getOutputFile());
String evidenceCompared = UpdateTestStatus.convertToBase64(Setup.reportPath+"ReportSpot.csv");
System.out.println(UpdateTestStatus.updateTestRun(**fields.getStatus(),fields.getComment()**,
inputData,dtd.getInputFile()+ ".xml", //data of the XML and the name of the file
outputData,dtd.getOutputFile(),
evidenceCompared,"ReportSpot.csv",
Setup.testExec, dtd.getJiraId()).asString()); // test execution ID and JIRA ID
}
}
The test and the code under test each create a separate instance of FieldsJSON. Data set in one instance will not be visible in the other (unless the data is declared static, in which case there's no need to create instances).
You can fix this by using a single instance, either passed to the fxSpot_GBP_JPY method from the test, or returned from that method to the test.
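For example, the second option might look like this (a sketch against your own classes, not a complete rewrite):
public static FieldsJSON fxSpot_GBP_JPY(TradeData data, TradeData output) throws Exception {
    FieldsJSON fields = new FieldsJSON();
    // ... perform the comparisons and call fields.setComment()/fields.setStatus() as before ...
    return fields; // hand the populated instance back to the caller
}

// In the test, use the returned instance instead of creating a fresh one:
FieldsJSON fields = FunctionalTest.fxSpot_GBP_JPY(dataInput, dataOutput);
String status = fields.getStatus();   // now reflects what the comparison actually set
String comment = fields.getComment();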
I have a bug in a production application, but I can't find its cause. I'm trying to add some logging to find the method that calls my method(). But because I use a thread pool, I can't just call Thread.currentThread().getStackTrace() and iterate through the StackTraceElements; it shows only a few frames before the thread pool.
If I use the following code, I get every method I need, but it is very expensive. A single call of method() costs 400+ KB in a text file in my test environment. In production it would be about 1 MB per second, I think.
private final ExecutorService completableFutureExecutor =
new ThreadPoolExecutor(10, 2000, 60L, TimeUnit.SECONDS, new SynchronousQueue<>());
public void firstMethod(){
secondMethod();
}
private CompletableFuture<Void> secondMethod(){
return CompletableFuture.runAsync(() -> method(), completableFutureExecutor); // method() is void, so runAsync (not supplyAsync) yields CompletableFuture<Void>
}
void method(){
Map<Thread, StackTraceElement[]> map = Thread.getAllStackTraces();
for (Thread thread : map.keySet()) {
printLog(thread);
}
}
private void printLog(Thread thread) {
StringBuilder builder = new StringBuilder();
for (StackTraceElement s : thread.getStackTrace()) {
builder.append("\n getClass = " + s.getClass());
builder.append("\n getClassName = " + s.getClassName());
builder.append("\n getFileName = " + s.getFileName());
builder.append("\n getLineNumber = " + s.getLineNumber());
builder.append("\n getMethodName = " + s.getMethodName());
builder.append("\n ---------------------------- \n ");
}
ownLogger.info("SomeThread = {} ", builder);
}
How can I find the firstMethod() that calls secondMethod()?
As I haven't found any better solution, my own approach is to log before and after the CompletableFuture call.
It looks like this:
Logger beforeAsync= LoggerFactory.getLogger("beforeAsync");
Logger afterAsync= LoggerFactory.getLogger("afterAsync");
private CompletableFuture<Void> secondMethod(){
printLongerTrace(Thread.currentThread(),beforeAsync);
return CompletableFuture.runAsync(() -> method(), completableFutureExecutor);
}
private void methodWithException(){
try{
//do something
}
catch(Exception e){
printLongerTrace(e,"methodWithException", afterAsync);
}
}
public void printLongerTrace(Throwable t, String methodName, Logger ownlogger) {
if (t.getCause() != null) {
printLongerTrace(t.getCause(), methodName, ownlogger); // recurse into the cause chain first
}
StringBuilder builder = new StringBuilder();
builder.append("\n Thread = " + Thread.currentThread().getName());
builder.append("ERROR CAUSE = " + t.getCause() + "\n");
builder.append("ERROR MESSAGE = " + t.getMessage() + "\n");
printLog(t.getStackTrace(), builder);
ownlogger.info(methodName + "Trace ----- {}", builder);
}
public void printLongerTrace(Thread t, Logger ownlogger) {
StringBuilder builder = new StringBuilder();
builder.append("\n Thread = " + Thread.currentThread().getName());
printLog(t.getStackTrace(), builder);
ownlogger.info("Trace ----- {}", builder);
}
private StringBuilder printLog(StackTraceElement[] elements, StringBuilder builder) {
int size = elements.length > 15 ? 15 : elements.length;
for (int i = 0; i < size; i++) {
builder.append("Line " + i + " = " + elements[i] + " with method = " + elements[i].getMethodName() + "\n");
}
return builder;
}
printLongerTrace(Throwable t, String methodName, Logger ownlogger) prints the exception with every cause, recursively.
printLongerTrace(Thread t, Logger ownlogger) prints which method was called before the CompletableFuture.
Just dump the stack by calling Thread.dumpStack(), but this is only for debugging and has a big overhead, since dumping the stack is CPU-intensive.
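For example (a quick diagnostic sketch reusing the question's method names):
private CompletableFuture<Void> secondMethod() {
    Thread.dumpStack(); // prints the current stack (still including firstMethod) to System.err
    return CompletableFuture.runAsync(() -> method(), completableFutureExecutor);
}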
I want to convert a dataframe to an array of JSON using Java and Spark version 1.6, for which I am converting the data from
Dataframe -> Json -> RDD -> Array
where the data looks like this:
[
{
"prtdy_pgm_x":"P818_C",
"prtdy_pgm_x":"P818",
"prtdy_attr_c":"Cost",
"prtdy_integer_r":0,
"prtdy_cds_d":"prxm",
"prtdy_created_s":"2018-05-12 04:12:19.0",
"prtdy_created_by_c":"brq",
"prtdy_create_proc_x":"w_pprtdy_security_t",
"snapshot_d":"2018-05-12-000018"
},
{
"prtdy_pgm_x":"P818_I",
"prtdy_pgm_x":"P818",
"prtdy_attr_c":"Tooling",
"prtdy_integer_r":0,
"prtdy_cds_d":"prxm",
"prtdy_created_s":"2018-05-12 04:12:20.0",
"prtdy_created_by_c":"brq",
"prtdy_create_proc_x":"w_pprtdy_security_t",
"snapshot_d":"2018-05-12-000018"
},
{
"prtdy_pgm_x":"P818_W",
"prtdy_pgm_x":"P818",
"prtdy_attr_c":"Weight",
"prtdy_integer_r":0,
"prtdy_cds_d":"prxm",
"prtdy_created_s":"2018-05-12 04:12:20.0",
"prtdy_created_by_c":"brq",
"prtdy_create_proc_x":"w_pprtdy_security_t",
"snapshot_d":"2018-05-12-000018"
},
......
]
So I wrote my code something like this:
if(cmnTableNames != null && cmnTableNames.length > 0)
{
for(int i=0; i < cmnTableNames.length; i++)
{
String cmnTableName = cmnTableNames[i];
DataFrame cmnTableContent = null;
if(cmnTableName.contains("PTR_security_t"))
{
cmnTableContent = hiveContext.sql("SELECT * FROM " + cmnTableName + " where fbrn04_snapshot_d = '" + snapshotId + "'");
}
else
{
cmnTableContent = hiveContext.sql("SELECT * FROM " + cmnTableName);
}
String cmnTable = cmnTableName.substring(cmnTableName.lastIndexOf(".") + 1);
if (cmnTableContent.count() > 0)
{
String cmnStgTblDir = hdfsPath + "/staging/" + rptName + "/common/" + cmnTable;
JavaRDD<String> cmnTblCntJson = cmnTableContent.toJSON().toJavaRDD();
String result = cmnTblCntJson.reduce((ob1, ob2) -> (String)ob1+","+(String)ob2); // This part takes more time than usual when there is a large set of data.
String output = "["+result+"]";
ArrayList<String> outputList = new ArrayList<String>();
outputList.add(output);
JavaRDD<String> finalOutputRDD = sc.parallelize(outputList);
String cmnStgMrgdDir = cmnStgTblDir + "/mergedfile";
if(dfs.exists(new Path(cmnStgTblDir + "/mergedfile"))) dfs.delete(new Path(cmnStgTblDir + "/mergedfile"), true);
finalOutputRDD.coalesce(1).saveAsTextFile(cmnStgMrgdDir, GzipCodec.class);
fileStatus = dfs.getFileStatus(new Path(cmnStgMrgdDir + "/part-00000.gz"));
dfs.setPermission(fileStatus.getPath(),FsPermission.createImmutable((short) 0770));
dfs.rename(new Path(cmnStgMrgdDir + "/part-00000.gz"), new Path(CommonPath + "/" + cmnTable + ".json.gz"));
}
else
{
System.out.println("There are no records in " + cmnTableName);
}
}
}
else
{
System.out.println("The common table lists are null.");
}
sc.stop();
But when the reduce function is applied, it takes a long time:
JavaRDD<String> cmnTblCntJson = cmnTableContent.toJSON().toJavaRDD();
String result = cmnTblCntJson.reduce((ob1, ob2) -> (String)ob1+","+(String)ob2); // This part takes more time than usual when there is a large set of data.
The table with the partition, "PTR_security_t", is huge and takes a lot of time compared to the other tables, which don't have partitions (an odd 40-50 minutes for 588 KB).
I tried applying a lambda, but I ended up with a "Task not serializable" error. Check the code below.
if(cmnTableNames != null && cmnTableNames.length > 0)
{
List<String> commonTableList = Arrays.asList(cmnTableNames);
DataFrame commonTableDF = sqc.createDataset(commonTableList,Encoders.STRING()).toDF();
commonTableDF.toJavaRDD().foreach(cmnTableNameRDD -> {
DataFrame cmnTableContent = null;
String cmnTableName = cmnTableNameRDD.mkString();
if(cmnTableName.contains("PTR_security_t"))
{
cmnTableContent = hiveContext.sql("SELECT * FROM " + cmnTableName + " where fbrn04_snapshot_d = '" + snapshotId + "'");
}
else
{
cmnTableContent = hiveContext.sql("SELECT * FROM " + cmnTableName);
}
String cmnTable = cmnTableName.substring(cmnTableName.lastIndexOf(".") + 1);
if (cmnTableContent.count() > 0)
{
String cmnStgTblDir = hdfsPath + "/staging/" + rptName + "/common/" + cmnTable;
JavaRDD<String> cmnTblCntJson = cmnTableContent.toJSON().toJavaRDD();
String result = cmnTblCntJson.reduce((ob1, ob2) -> (String)ob1+","+(String)ob2);
String output = "["+result+"]";
ArrayList<String> outputList = new ArrayList<String>();
outputList.add(output);
JavaRDD<String> finalOutputRDD = sc.parallelize(outputList);
String cmnStgMrgdDir = cmnStgTblDir + "/mergedfile";
if(dfs.exists(new Path(cmnStgTblDir + "/mergedfile"))) dfs.delete(new Path(cmnStgTblDir + "/mergedfile"), true);
finalOutputRDD.coalesce(1).saveAsTextFile(cmnStgMrgdDir, GzipCodec.class);
fileStatus = dfs.getFileStatus(new Path(cmnStgMrgdDir + "/part-00000.gz"));
dfs.setPermission(fileStatus.getPath(),FsPermission.createImmutable((short) 0770));
dfs.rename(new Path(cmnStgMrgdDir + "/part-00000.gz"), new Path(CommonPath + "/" + cmnTable + ".json.gz"));
}
else
{
System.out.println("There are no records in " + cmnTableName);
}
});
}
else
{
System.out.println("The common table lists are null.");
}
sc.stop();
Is there any efficient way I can improve the performance?
I have to pass records to a UDF which calls an API, but since we want to do it in parallel, we are using Spark, and that's why the UDF is being developed. The problem is that the UDF can only take 100 records at a time, no more than that; it can't handle more than 100 records in parallel. So how do I ensure that only 100 records pass to it in one go? Please note we don't want to use the count() function on the whole record set.
I am attaching the UDF code here; it's a generic UDF which returns an array of structs. Moreover, if we pass 100 records in the batchsize variable each time, and there are, say, 198 records, then since we don't want to use count() we won't know that the last batch size is going to be 98. How do we handle that?
I have a generic UDF in which a call is made to an API, but it first builds a batch of 100 and only then calls the REST API. The arguments the UDF takes are x1:string, x2:string, batchsize:integer (currently the batch size is 100), so in the UDF, until the batch size reaches 100 the call will not happen, and for each record it will return null.
So up to the 99th record it will return null, but at the 100th record the call will happen.
So, now the problem part: as we are taking a batch size of 100 and the call takes place only at the 100th record, in a situation where we have, say, 198 records in the file, 100 records will get output, but the other 98 will only return null, as they will never get processed.
So please help me find a way around this, where the UDF takes one record at a time but keeps collecting until the 100th record. I hope this clears it up.
public class Standardize_Address extends GenericUDF {
private static final Logger logger = LoggerFactory.getLogger(Standardize_Address.class);
private int counter = 0;
Client client = null;
private Batch batch = new Batch();
public Standardize_Address() {
client = new ClientBuilder().withUrl("https://ss-staging-public.beringmedia.com/street-address").build();
}
// StringObjectInspector streeti;
PrimitiveObjectInspector streeti;
PrimitiveObjectInspector cityi;
PrimitiveObjectInspector zipi;
PrimitiveObjectInspector statei;
PrimitiveObjectInspector batchsizei;
private ArrayList ret;
@Override
public String getDisplayString(String[] argument) {
return "My display string";
}
@Override
public ObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
System.out.println("under initialize");
if (args[0] == null) {
throw new UDFArgumentTypeException(0, "NO Street is mentioned");
}
if (args[1] == null) {
throw new UDFArgumentTypeException(0, "No Zip is mentioned");
}
if (args[2] == null) {
throw new UDFArgumentTypeException(0, "No city is mentioned");
}
if (args[3] == null) {
throw new UDFArgumentTypeException(0, "No State is mentioned");
}
if (args[4] == null) {
throw new UDFArgumentTypeException(0, "No batch size is mentioned");
}
/// streeti =args[0];
streeti = (PrimitiveObjectInspector)args[0];
// this.streetvalue = (StringObjectInspector) streeti;
cityi = (PrimitiveObjectInspector)args[1];
zipi = (PrimitiveObjectInspector)args[2];
statei = (PrimitiveObjectInspector)args[3];
batchsizei = (PrimitiveObjectInspector)args[4];
ret = new ArrayList();
ArrayList structFieldNames = new ArrayList();
ArrayList structFieldObjectInspectors = new ArrayList();
structFieldNames.add("Street");
structFieldNames.add("city");
structFieldNames.add("zip");
structFieldNames.add("state");
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
StructObjectInspector si2 = ObjectInspectorFactory.getStandardStructObjectInspector(structFieldNames,
structFieldObjectInspectors);
ListObjectInspector li2;
li2 = ObjectInspectorFactory.getStandardListObjectInspector(si2);
return li2;
}
@Override
public Object evaluate(DeferredObject[] args) throws HiveException {
ret.clear();
System.out.println("under evaluate");
// String street1 = streetvalue.getPrimitiveJavaObject(args[0].get());
Object oin = args[4].get();
System.out.println("under typecasting");
int batchsize = (Integer) batchsizei.getPrimitiveJavaObject(oin);
System.out.println("batchsize");
Object oin1 = args[0].get();
String street1 = (String) streeti.getPrimitiveJavaObject(oin1);
Object oin2 = args[1].get();
String zip1 = (String) zipi.getPrimitiveJavaObject(oin2);
Object oin3 = args[2].get();
String city1 = (String) cityi.getPrimitiveJavaObject(oin3);
Object oin4 = args[3].get();
String state1 = (String) statei.getPrimitiveJavaObject(oin4);
logger.info("address passed, street=" + street1 + ",zip=" + zip1 + ",city=" + city1 + ",state=" + state1);
counter++;
try {
System.out.println("under try");
Lookup lookup = new Lookup();
lookup.setStreet(street1);
lookup.setCity(city1);
lookup.setState(state1);
lookup.setZipCode(zip1);
lookup.setMaxCandidates(1);
batch.add(lookup);
} catch (BatchFullException ex) {
logger.error(ex.getMessage(), ex);
} catch (Exception e) {
logger.error(e.getMessage(), e);
}
/* batch.add(lookup); */
if (counter == batchsize) {
System.out.println("under if");
try {
logger.info("batch input street " + batch.get(0).getStreet());
try {
client.send(batch);
} catch (Exception e) {
logger.error(e.getMessage(), e);
logger.warn("skipping current batch, continuing with the next batch");
batch.clear();
counter = 0;
return null;
}
Vector<Lookup> lookups = batch.getAllLookups();
for (int i = 0; i < batch.size(); i++) {
// ListObjectInspector candidates;
ArrayList<Candidate> candidates = lookups.get(i).getResult();
if (candidates.isEmpty()) {
logger.warn("Address " + i + " is invalid.\n");
continue;
}
logger.info("Address " + i + " is valid. (There is at least one candidate)");
for (Candidate candidate : candidates) {
final Components components = candidate.getComponents();
final Metadata metadata = candidate.getMetadata();
logger.info("\nCandidate " + candidate.getCandidateIndex() + ":");
logger.info("Delivery line 1: " + candidate.getDeliveryLine1());
logger.info("Last line: " + candidate.getLastLine());
logger.info("ZIP Code: " + components.getZipCode() + "-" + components.getPlus4Code());
logger.info("County: " + metadata.getCountyName());
logger.info("Latitude: " + metadata.getLatitude());
logger.info("Longitude: " + metadata.getLongitude());
}
Object[] e;
e = new Object[4];
e[0] = new Text(candidates.get(i).getComponents().getStreetName());
e[1] = new Text(candidates.get(i).getComponents().getCityName());
e[2] = new Text(candidates.get(i).getComponents().getZipCode());
e[3] = new Text(candidates.get(i).getComponents().getState());
ret.add(e);
}
counter = 0;
batch.clear();
} catch (Exception e) {
logger.error(e.getMessage(), e);
}
return ret;
} else {
return null;
}
}
}
I want a dispatcher thread that executes and retrieves results from a pool of worker threads. The dispatcher needs to continuously feed work to the worker threads. When ANY of the worker thread completes, the dispatcher needs to gather its results and re-dispatch (or create a new) worker thread. It seems to me like this should be obvious but I have been unable to find an example of a suitable pattern. A Thread.join() loop would be inadequate because that is really "AND" logic and I am looking for "OR" logic.
The best I could come up with is to have the dispatcher thread wait() and have the worker threads notify() when they are done. Though it seems like I would have to guard against two worker threads ending at the same time and causing the dispatcher thread to miss a notify(). Plus, this seems a little inelegant to me.
Even less elegant is the idea of the dispatcher thread periodically waking up and polling the worker thread pool and checking each thread to see if it has completed via isAlive().
I took a look at java.util.concurrent and didn't see anything that looked like it fit this pattern.
I feel that to implement what I mention above would involve a lot of defensive programming and reinventing the wheel. There's got to be something that I am missing. What can I leverage to implement this pattern?
This is the single-threaded version. putMissingToS3() would become the dispatcher thread, and the capability represented by uploadFileToBucket() would become the worker thread.
private void putMissingToS3()
{
int reqFilesToUpload = 0;
long reqSizeToUpload = 0L;
int totFilesUploaded = 0;
long totSizeUploaded = 0L;
int totFilesSkipped = 0;
long totSizeSkipped = 0L;
int rptLastFilesUploaded = 0;
long rptSizeInterval = 1000000000L;
long rptLastSize = 0L;
StopWatch rptTimer = new StopWatch();
long rptLastMs = 0L;
StopWatch globalTimer = new StopWatch();
StopWatch indvTimer = new StopWatch();
for (FileSystemRecord fsRec : fileSystemState.toList())
{
String reqKey = PathConverter.pathToKey(PathConverter.makeRelativePath(fileSystemState.getRootPath(), fsRec.getFullpath()));
LocalS3MetadataRecord s3Rec = s3Metadata.getRecord(reqKey);
// Just get a rough estimate of what the size of this upload will be
if (s3Rec == null)
{
++reqFilesToUpload;
reqSizeToUpload += fsRec.getSize();
}
}
long uploadTimeGuessMs = (long)((double)reqSizeToUpload/estUploadRateBPS*1000.0);
printAndLog("Estimated upload: " + natFmt.format(reqFilesToUpload) + " files, " + Utils.readableFileSize(reqSizeToUpload) +
", Estimated time " + Utils.readableElapsedTime(uploadTimeGuessMs));
globalTimer.start();
rptTimer.start();
for (FileSystemRecord fsRec : fileSystemState.toList())
{
String reqKey = PathConverter.pathToKey(PathConverter.makeRelativePath(fileSystemState.getRootPath(), fsRec.getFullpath()));
if (PathConverter.validate(reqKey))
{
LocalS3MetadataRecord s3Rec = s3Metadata.getRecord(reqKey);
//TODO compare and deal with size mismatches. Maybe go and look at last-mod dates.
if (s3Rec == null)
{
indvTimer.start();
uploadFileToBucket(s3, syncParms.getS3Bucket(), fsRec.getFullpath(), reqKey);
indvTimer.stop();
++totFilesUploaded;
totSizeUploaded += fsRec.getSize();
logOnly("Uploaded: Size=" + fsRec.getSize() + ", " + indvTimer.stopDeltaMs() + " ms, File=" + fsRec.getFullpath() + ", toKey=" + reqKey);
if (totSizeUploaded > rptLastSize + rptSizeInterval)
{
long invSizeUploaded = totSizeUploaded - rptLastSize;
long nowMs = rptTimer.intervalMs();
long invElapMs = nowMs - rptLastMs;
long remSize = reqSizeToUpload - totSizeUploaded;
double progessPct = (double)totSizeUploaded/reqSizeToUpload*100.0;
double mbps = (invElapMs > 0) ? invSizeUploaded/1e6/(invElapMs/1000.0) : 0.0;
long remMs = (long)((double)remSize/((double)invSizeUploaded/invElapMs));
printOnly("Progress: " + d2Fmt.format(progessPct) + "%, " + Utils.readableFileSize(totSizeUploaded) + " of " +
Utils.readableFileSize(reqSizeToUpload) + ", Rate " + d3Fmt.format(mbps) + " MB/s, " +
"Time rem " + Utils.readableElapsedTime(remMs));
rptLastMs = nowMs;
rptLastFilesUploaded = totFilesUploaded;
rptLastSize = totSizeUploaded;
}
}
}
else
{
++totFilesSkipped;
totSizeSkipped += fsRec.getSize();
logOnly("Skipped (Invalid chars): Size=" + fsRec.getSize() + ", " + fsRec.getFullpath() + ", toKey=" + reqKey);
}
}
globalTimer.stop();
double mbps = 0.0;
if (globalTimer.stopDeltaMs() > 0)
mbps = totSizeUploaded/1e6/(globalTimer.stopDeltaMs()/1000.0);
printAndLog("Actual upload: " + natFmt.format(totFilesUploaded) + " files, " + Utils.readableFileSize(totSizeUploaded) +
", Time " + Utils.readableElapsedTime(globalTimer.stopDeltaMs()) + ", Rate " + d3Fmt.format(mbps) + " MB/s");
if (totFilesSkipped > 0)
printAndLog("Skipped Files: " + natFmt.format(totFilesSkipped) + " files, " + Utils.readableFileSize(totSizeSkipped));
}
private void uploadFileToBucket(AmazonS3 amazonS3, String bucketName, String filePath, String fileKey)
{
File inFile = new File(filePath);
ObjectMetadata objectMetadata = new ObjectMetadata();
objectMetadata.addUserMetadata(Const.LAST_MOD_KEY, Long.toString(inFile.lastModified()));
objectMetadata.setLastModified(new Date(inFile.lastModified()));
PutObjectRequest por = new PutObjectRequest(bucketName, fileKey, inFile).withMetadata(objectMetadata);
// Amazon S3 never stores partial objects; if during this call an exception wasn't thrown, the entire object was stored.
amazonS3.putObject(por);
}
I think you are looking in the right package: you should use the ExecutorService API.
This removes the burden of waiting on and watching for thread notifications.
Example:
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.Executors;
public class ExecutorEx{
static class ThreadA implements Runnable{
int id;
public ThreadA(int id){
this.id = id;
}
public void run(){
//To simulate some work
try{Thread.sleep(Math.round(Math.random()*100));}catch(Exception e){}
// to show message
System.out.println(this.id + "--Test Message" + System.currentTimeMillis());
}
}
public static void main(String args[]) throws Exception{
int poolSize = 10;
ExecutorService pool = Executors.newFixedThreadPool(poolSize);
int i=0;
while(i<100){
pool.submit(new ThreadA(i));
i++;
}
pool.shutdown();
while(!pool.isTerminated()){
pool.awaitTermination(60, TimeUnit.SECONDS);
}
}
}
And if you want to return something from your thread, you will need to implement Callable instead of Runnable (call() instead of run()) and collect the returned values in an array of Future objects, which you can iterate over later.
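For example, a minimal Callable variant of the same idea (a sketch under the same pool setup as above):
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class CallableEx {
    static class TaskB implements Callable<Integer> {
        private final int id;
        TaskB(int id) { this.id = id; }
        public Integer call() {
            return id * id; // the value handed back through the Future
        }
    }
    public static void main(String[] args) throws Exception {
        ExecutorService pool = Executors.newFixedThreadPool(10);
        List<Future<Integer>> futures = new ArrayList<Future<Integer>>();
        for (int i = 0; i < 100; i++) {
            futures.add(pool.submit(new TaskB(i)));
        }
        for (Future<Integer> f : futures) {
            System.out.println(f.get()); // get() blocks until that task's result is ready
        }
        pool.shutdown();
    }
}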