Java Spark Dataframe fixed length file - java

I want to load a fixed-length file based on the column names and lengths given in a separate file. I am able to load the data and append a new column, but I am unable to retain the previously added columns; each column gets overwritten, and I want the complete list of columns. Below is the code I have implemented:
samplefile.txt:
00120181120xyz12341
00220180203abc56792
00320181203pqr25483
00120181120xyz12341
schema.json:
{"Column":"id","length":"3","flag":"0"}
{"Column":"date","length":"8","flag":"0"}
{"Column":"name","length":"3","flag":"1"}
{"Column":"salary","length":"5","flag":"2"}
Current Output:
+-------------------+------+
|                _c0|salary|
+-------------------+------+
|00120181120xyz12341| 12341|
|00220180203abc56792| 56792|
|00320181203pqr25483| 25483|
|00120181120xyz12341| 12341|
+-------------------+------+
Expected Output
+-------------------+------+----+--------+---+
|                _c0|salary|name|    date| id|
+-------------------+------+----+--------+---+
|00120181120xyz12341| 12341| xyz|20181120|001|
|00220180203abc56792| 56792| abc|20180203|002|
|00320181203pqr25483| 25483| pqr|20181203|003|
|00120181120xyz12341| 12341| xyz|20181120|001|
+-------------------+------+----+--------+---+
Code:
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
public class App {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("Develop")
                .master("local").getOrCreate();
        Dataset<Row> ds = spark
                .read()
                .format("csv")
                .option("header", "false")
                .load("C://Users//path//samplefile.txt");
        ds.show();
        Dataset<Row> SchemaFile = spark
                .read()
                .format("csv")
                .option("header", "true")
                .load("C://Users//path//schema.txt");
        SchemaFile.show();
        List<String> s = new ArrayList<String>();
        int lens = 1;
        List<Row> it = SchemaFile.select("Column", "length").collectAsList();
        List<StructField> fields = new ArrayList<>();
        Dataset<Row> ds1 = ds; // holds the result of each withColumn call
        for (Row fieldName : it) {
            System.out.println(fieldName.get(0));
            System.out.println(Integer.parseInt(fieldName.get(1).toString()));
            ds1 = ds.withColumn(
                    fieldName.get(0).toString(),
                    substrings(ds, "_c0", lens,
                            Integer.parseInt(fieldName.get(1).toString()),
                            fieldName.get(1).toString())); // selectExpr("substring("+"_c0"+","+lens+","+Integer.parseInt(fieldName.get(1).toString())+")");
            s.add(fieldName.get(0).toString());
            lens += Integer.parseInt((String) fieldName.get(1));
            System.out.println("Lengths:" + lens);
            ds1.show();
            StructField field = DataTypes.createStructField(
                    fieldName.toString(), DataTypes.StringType, true);
            fields.add(field);
        }
        StructType schema = DataTypes.createStructType(fields);
        System.out.println(schema);
        for (String s1 : s) {
            System.out.println(s1);
        }
    }

    private static Column substrings(Dataset<Row> ds, String string, int lens,
            int i, String cols) {
        return ds.col("_c0").substr(lens, i);
    }
}
Any kind of help and advice is appreciated.
Thanks in Advance.

I know your question is quite old, but maybe others will come across it and hope for an answer. I think you have just appended to the wrong dataset and therefore dropped the previously added columns.
Possible solution:
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import java.util.List;
public class FlfReader {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("FixedLengthFileReader")
                .master("local[*]").getOrCreate();
        Dataset<Row> ds = spark
                .read()
                .format("csv")
                .option("header", "false")
                .load(FlfReader.class.getClassLoader().getResource("samplefile.txt").getPath());
        ds.show();
        Dataset<Row> SchemaFile = spark
                .read()
                .format("json")
                .option("header", "true")
                .load(FlfReader.class.getClassLoader().getResource("schema.json").getPath());
        SchemaFile.show();
        int lengths = 1;
        List<Row> schemaFields = SchemaFile.select("Column", "length").collectAsList();
        for (Row fieldName : schemaFields) {
            int fieldLength = Integer.parseInt(fieldName.get(1).toString());
            ds = ds.withColumn(
                    fieldName.get(0).toString(),
                    colSubstring(ds, lengths, fieldLength));
            lengths += fieldLength;
        }
        ds.show();
    }

    private static Column colSubstring(Dataset<Row> ds, int startPos, int length) {
        return ds.col("_c0").substr(startPos, length);
    }
}
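The key difference to the code in the question is that the result of each withColumn call is assigned back to ds, so the parsed columns accumulate on the same dataset instead of being recomputed from the raw input on every iteration. With the sample file and schema above, the final ds.show() should produce the expected output with the id, date, name and salary columns next to _c0.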

Related

Is there any way in the Java API to pass a Dataset<Row> to a map() and return a Dataset<Row>?

I am using spark-sql-2.4.1v with Java 8. I have a use case as below:
Dataset<Row> ds = //a Dataset<Row> read from DB
I need to do some manipulations based on the entries of another dataset, i.e.
List<String> codesList = Arrays.asList("code1", "code2");
Dataset<String> codes = sparkSession.createDataset(codesList, Encoders.bean(String.class));
I need to process all the codes in parallel. To do that, I am trying the following:
Dataset<Row> ds_res = codes.map( x_cod -> //map throwing an error
calcFunction(sparkSession, filePath, ds ,x_cod );
}).reduce(new Function2<Dataset<Row> df1,Dataset<Row> df2) => df1.union(df2))
ds_res .write().path(filePath).mode("append").save();
public static Dataset<Row> calcFunction(sparkSession, filePath, ds ,x_cod ){
//some complex calculation based on x_cod
return ds_res ; // return ds_res for further processing
}
How to make this work in parallel on the cluster?
Encoding the List to a Dataset is the more feasible option. If you are planning to use a bean class, you can encode it to a Dataset<T> of that type:
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
import java.util.List;
public class ParallelizeArray {
    public static void main(String[] args) {
        final SparkSession sparkSess = Constant.getSparkSess();
        List<String> codesList = Arrays.asList("code1", "code2");
        final Dataset<String> dataFrame = sparkSess.createDataset(codesList, Encoders.STRING());
        dataFrame.write().mode(SaveMode.Append).csv("src/main/resources/paraArray");
    }
}
or use
final Encoder<Dataset> bean = Encoders.bean(Dataset.class);
Dataset<Row> ds_res = codes
        .map((MapFunction<String, Dataset>) x_cod -> calcFunction(sparkSess, filePath, ds, x_cod), bean)
        .reduce((ReduceFunction<Dataset>) (df1, df2) -> df1.union(df2));

public static Dataset<Row> calcFunction(SparkSession sparkSession, String filePath, Dataset<Row> ds, String x_cod) {
    Dataset<Row> ds_res = null; // some complex calculation based on x_cod
    return ds_res; // return ds_res for further processing
}
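Note that a Dataset (and the SparkSession) can only be used on the driver; it cannot be created or operated on inside functions such as map that run on the executors, which is typically why the map in the question fails. As an alternative, here is a minimal driver-side sketch, assuming calcFunction keeps the signature shown above and that sparkSess, filePath, ds and codesList are defined as in the question:
// Sketch only: loop over the codes on the driver and union the per-code results.
// Each calcFunction call is still executed as a distributed Spark job.
Dataset<Row> combined = null;
for (String x_cod : codesList) {
    Dataset<Row> part = calcFunction(sparkSess, filePath, ds, x_cod);
    combined = (combined == null) ? part : combined.union(part);
}
if (combined != null) {
    combined.write().mode(SaveMode.Append).save(filePath);
}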

Commit message in Spark Structured Streaming

I'm using Spark Structured Streaming (2.3) with Kafka 2.4.
I want to know how I can use the async and sync offset commit properties.
If I set enable.auto.commit to true, is it sync or async?
How can I define a callback in Spark Structured Streaming? Or how can I use sync or async commits in Spark Structured Streaming?
Thanks in advance.
My code:
package sparkProject;
import java.io.StringReader;
import java.util.*;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.Unmarshaller;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
public class XMLSparkStreamEntry {

    static StructType structType = new StructType();

    static {
        structType = structType.add("FirstName", DataTypes.StringType, false);
        structType = structType.add("LastName", DataTypes.StringType, false);
        structType = structType.add("Title", DataTypes.StringType, false);
        structType = structType.add("ID", DataTypes.StringType, false);
        structType = structType.add("Division", DataTypes.StringType, false);
        structType = structType.add("Supervisor", DataTypes.StringType, false);
    }

    static ExpressionEncoder<Row> encoder = RowEncoder.apply(structType);

    public static void main(String[] args) throws StreamingQueryException {
        SparkConf conf = new SparkConf();
        SparkSession spark = SparkSession.builder().config(conf).appName("Spark Program").master("local[*]")
                .getOrCreate();

        Dataset<Row> ds1 = spark.readStream().format("kafka").option("kafka.bootstrap.servers", "localhost:9092")
                .option("subscribe", "Kafkademo").load();

        Dataset<Row> ss = ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)");

        Dataset<Row> finalOP = ss.flatMap(new FlatMapFunction<Row, Row>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Iterator<Row> call(Row t) throws Exception {
                JAXBContext jaxbContext = JAXBContext.newInstance(FileWrapper.class);
                Unmarshaller unmarshaller = jaxbContext.createUnmarshaller();
                StringReader reader = new StringReader(t.getAs("value"));
                FileWrapper person = (FileWrapper) unmarshaller.unmarshal(reader);

                List<Employee> emp = new ArrayList<Employee>(person.getEmployees());
                List<Row> rows = new ArrayList<Row>();
                for (Employee e : emp) {
                    rows.add(RowFactory.create(e.getFirstname(), e.getLastname(), e.getTitle(), e.getId(),
                            e.getDivision(), e.getSupervisor()));
                }
                return rows.iterator();
            }
        }, encoder);

        Dataset<Row> wordCounts = finalOP.groupBy("firstname").count();

        StreamingQuery query = wordCounts.writeStream().outputMode("complete").format("console").start();
        System.out.println("SHOW SCHEMA");
        query.awaitTermination();
    }
}
Can anyone please check where and how I can implement async and sync offset commits in the above code?
Thanks in advance!
Please read https://www.waitingforcode.com/apache-spark-structured-streaming/apache-spark-structured-streaming-apache-kafka-offsets-management/read. It is an excellent source, although it requires a bit of reading between the lines.
In short:
Structured Streaming ignores the offsets commits in Apache Kafka.
Instead, it relies on its own offsets management on the driver side
which is responsible for distributing offsets to executors and for
checkpointing them at the end of the processing round (epoch or
micro-batch).
Batch Spark Structured Streaming and Kafka integration works differently again.
Spark Structured Streaming doesn't support the Kafka commit offset feature. The option suggested in the official docs is to enable checkpointing:
https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html
Another suggestion is to switch to Spark Streaming, which supports the Kafka commitAsync API:
https://spark.apache.org/docs/latest/streaming-kafka-0-10-integration.html
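As a rough sketch of the checkpointing approach (the checkpoint directory below is a placeholder and should point at reliable storage such as HDFS), the writeStream call from the question could be configured like this:
// Minimal sketch, assuming the wordCounts dataset from the question above.
StreamingQuery query = wordCounts.writeStream()
        .outputMode("complete")
        .format("console")
        .option("checkpointLocation", "/tmp/checkpoints/xml-stream") // hypothetical path
        .start();
query.awaitTermination();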

Issue in extracting 54K entries into dataset and writing it into csv file in apache spark

I need to write a large Dataset to a CSV file. Below is my sample code:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import org.springframework.core.io.support.PropertiesLoaderUtils;
import org.apache.spark.sql.api.java.UDF3;
import org.apache.spark.sql.api.java.UDF2;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import org.apache.spark.sql.Dataset;
import static org.apache.spark.sql.functions.callUDF;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.lower;
public class TestUdf3 {
    public static void main(String[] args) {
        System.setProperty("hadoop.home.dir", "F:\\JAVA\\winutils");
        JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("SparkJdbcDs").setMaster("local[*]"));
        SQLContext sqlContext = new SQLContext(sc);
        List<Row> manufactuerSynonymData = new ArrayList<Row>();
        try {
            SparkSession spark = SparkSession.builder().appName("JavaTokenizerExample").getOrCreate();

            // Load source excel file
            HashMap<String, String> options = new HashMap<String, String>();
            options.put("header", "true");
            options.put("path", "D:\\xls\\Source25K.csv");
            Dataset<Row> SourcePropertSet = sqlContext.load("com.databricks.spark.csv", options);

            Resource resource = new ClassPathResource("/ActaulManufacturerSynonym.properties");
            Properties allProperties = PropertiesLoaderUtils.loadProperties(resource);

            StructType schemaManufactuerSynonymDictionary = new StructType(new StructField[] {
                    new StructField("ManufacturerSynonymSource", DataTypes.StringType, false, Metadata.empty()),
                    new StructField("ManufacturerSynonymTarget", DataTypes.StringType, false, Metadata.empty()) });

            Set<String> setuniqueManufacturerEntries = allProperties.stringPropertyNames();
            Row individualRowEntry;
            for (String individualManufacturerEntry : setuniqueManufacturerEntries) {
                individualRowEntry = RowFactory.create(individualManufacturerEntry,
                        allProperties.getProperty(individualManufacturerEntry));
                manufactuerSynonymData.add(individualRowEntry);
            }

            Dataset<Row> SynonaymList = spark
                    .createDataFrame(manufactuerSynonymData, schemaManufactuerSynonymDictionary)
                    .withColumn("ManufacturerSynonymSource", lower(col("ManufacturerSynonymSource")));
            SynonaymList.show(90, false);

            UDF2<String, String, Boolean> contains = new UDF2<String, String, Boolean>() {
                private static final long serialVersionUID = -5239951370238629896L;

                @Override
                public Boolean call(String t1, String t2) throws Exception {
                    return t1.matches(t2);
                }
            };
            spark.udf().register("contains", contains, DataTypes.BooleanType);

            UDF3<String, String, String, String> replaceWithTerm = new UDF3<String, String, String, String>() {
                private static final long serialVersionUID = -2882956931420910207L;

                @Override
                public String call(String t1, String t2, String t3) throws Exception {
                    return t1.replaceAll(t2, t3);
                }
            };
            spark.udf().register("replaceWithTerm", replaceWithTerm, DataTypes.StringType);

            Dataset<Row> joined = SourcePropertSet
                    .join(SynonaymList, callUDF("contains", SourcePropertSet.col("manufacturersource"),
                            SynonaymList.col("ManufacturerSynonymSource")))
                    .withColumn("ManufacturerSource",
                            callUDF("replaceWithTerm", SourcePropertSet.col("manufacturersource"),
                                    SynonaymList.col("ManufacturerSynonymSource"),
                                    SynonaymList.col("ManufacturerSynonymTarget")));
            joined.show(54000);

            joined.repartition(1).select("*").write().format("com.databricks.spark.csv")
                    .option("delimiter", ",")
                    .option("header", "true")
                    .option("treatEmptyValuesAsNulls", "true")
                    .option("nullValue", "")
                    .save("D:\\xls\\synonym.csv");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
In the above code, rather than displaying the output in the console with the statement
joined.show(54000, false);
I need to write it to a CSV file directly.
It gives me runtime exceptions:
1. save("D:\xls\synonym.csv");
org.apache.spark.SparkException: Job aborted.
Caused by:
org.apache.spark.SparkException: Job aborted due to stage failure:
Task 0 in stage 3.0 failed 1 times, most recent failure: Lost task 0.0
in stage 3.0 (TID 3, localhost, executor driver):
org.apache.spark.SparkException: Failed to execute user defined
function($anonfun$apply$2: (string, string) => boolean)
2. return t1.matches(t2);
java.lang.NullPointerException
Caused by:
org.apache.spark.SparkException: Failed to execute user defined
function($anonfun$apply$2: (string, string) => boolean)
Can anybody suggest how to write a large dataset to a CSV file?
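One detail worth noting from the stack trace above: the job fails inside the contains UDF, and the NullPointerException suggests that t1 or t2 is null for some of the 54K rows. Below is a minimal null-guarded sketch of that UDF (an assumption based on the reported error, not a verified fix for this dataset):
UDF2<String, String, Boolean> contains = new UDF2<String, String, Boolean>() {
    private static final long serialVersionUID = -5239951370238629896L;

    @Override
    public Boolean call(String t1, String t2) throws Exception {
        if (t1 == null || t2 == null) {
            return false; // assumption: treat missing values as "no match"
        }
        return t1.matches(t2);
    }
};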

Apache Spark SQL context dropDuplicates

I'm trying to filter DataFrame content using Spark 1.5's dropDuplicates() method.
Using it with fully populated tables (no empty cells) gives the correct result, but when my CSV source contains empty cells (I'll provide you with the source file), Spark throws an ArrayIndexOutOfBoundsException.
What am I doing wrong? I've read the Spark SQL and DataFrames tutorial for version 1.6.2, but it does not describe DataFrame operations in detail. I'm also reading the book "Learning Spark: Lightning-Fast Big Data Analysis", but it's written for Spark 1.5 and the operations I need are not described there. I'd be glad to get an explanation or a link to a manual.
Thank you.
package data;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import java.util.Arrays;
public class TestDrop {
    public static void main(String[] args) {
        DropData dropData = new DropData("src/main/resources/distinct-test.csv");
        dropData.execute();
    }
}

class DropData {
    private String csvPath;
    private JavaSparkContext sparkContext;
    private SQLContext sqlContext;

    DropData(String csvPath) {
        this.csvPath = csvPath;
    }

    void execute() {
        initContext();
        DataFrame dataFrame = loadDataFrame();
        dataFrame.show();
        dataFrame.dropDuplicates(new String[]{"surname"}).show();
        // this one fails too: dataFrame.drop("surname")
    }

    private void initContext() {
        sparkContext = new JavaSparkContext(new SparkConf().setMaster("local[4]").setAppName("Drop test"));
        sqlContext = new SQLContext(sparkContext);
    }

    private DataFrame loadDataFrame() {
        JavaRDD<String> strings = sparkContext.textFile(csvPath);
        JavaRDD<Row> rows = strings.map(string -> {
            String[] cols = string.split(",");
            return RowFactory.create(cols);
        });
        StructType st = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("name", DataTypes.StringType, false),
                DataTypes.createStructField("surname", DataTypes.StringType, true),
                DataTypes.createStructField("age", DataTypes.StringType, true),
                DataTypes.createStructField("sex", DataTypes.StringType, true),
                DataTypes.createStructField("socialId", DataTypes.StringType, true)));
        return sqlContext.createDataFrame(rows, st);
    }
}
Sending a List instead of an Object[] results in rows containing a single column with a list inside. That's what I was doing wrong.
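A related detail for the empty-cell case (an observation, not part of the original fix): String.split(",") drops trailing empty strings, so rows whose last cells are empty yield fewer values than the schema expects, which can also surface as an ArrayIndexOutOfBoundsException. A minimal sketch of the row-mapping with both points addressed:
JavaRDD<Row> rows = strings.map(string -> {
    String[] cols = string.split(",", -1);      // -1 keeps trailing empty cells
    return RowFactory.create((Object[]) cols);  // spread into columns, not one list column
});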

Convert a JavaRDD String to JavaRDD Vector

I'm trying to load a CSV file as a JavaRDD<String> and then get the data into a JavaRDD<Vector>.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.feature.HashingTF;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
import org.apache.spark.mllib.stat.Statistics;
import breeze.collection.mutable.SparseArray;
import scala.collection.immutable.Seq;
import java.util.List;
public class Trial {

    public void start() throws InstantiationException, IllegalAccessException,
            ClassNotFoundException {
        run();
    }

    private void run() {
        SparkConf conf = new SparkConf().setAppName("csvparser");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        JavaRDD<String> data = jsc.textFile("C:/Users/kalraa2/Documents/trial.csv");
        JavaRDD<Vector> datamain = data.flatMap(null);
        MultivariateStatisticalSummary mat = Statistics.colStats(datamain.rdd());
        System.out.println(mat.mean());
    }

    private List<Vector> Seq(Vector dv) {
        // TODO Auto-generated method stub
        return null;
    }

    public static void main(String[] args) throws Exception {
        Trial trial = new Trial();
        trial.start();
    }
}
The program runs without any error, but I'm not able to get anything when trying to run it on the Spark machine. Can anyone tell me whether the conversion of the String RDD to a Vector RDD is correct?
My CSV file consists of a single column of floating-point numbers.
The null in this flatMap invocation might be a problem:
JavaRDD<Vector> datamain = data.flatMap(null);
I solved it by changing the code to this:
JavaRDD<Vector> datamain = data.map(new Function<String, Vector>() {
    public Vector call(String s) {
        String[] sarray = s.trim().split("\\r?\\n");
        double[] values = new double[sarray.length];
        for (int i = 0; i < sarray.length; i++) {
            values[i] = Double.parseDouble(sarray[i]);
            System.out.println(values[i]);
        }
        return Vectors.dense(values);
    }
});
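Note that textFile already splits the input into lines, so each s passed to call contains a single value and each resulting Vector has exactly one element.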
Assuming your trial.csv file looks like this
1.0
2.0
3.0
Taking the original code from your question, only a one-line change is required with Java 8:
SparkConf conf = new SparkConf().setAppName("csvparser").setMaster("local");
JavaSparkContext jsc = new JavaSparkContext(conf);
JavaRDD<String> data = jsc.textFile("C:/Users/kalraa2/Documents/trial.csv");
JavaRDD<Vector> datamain = data.map(s -> Vectors.dense(Double.parseDouble(s)));
MultivariateStatisticalSummary mat = Statistics.colStats(datamain.rdd());
System.out.println(mat.mean());
Prints 2.0
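This is expected: colStats computes per-column statistics, and the mean of the single column containing 1.0, 2.0 and 3.0 is 2.0.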
