[Summary] Common Spark DataFrame and RDD code in Java

Create a new Row in Java

import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

// Build a Row from a varargs list of values
Row row = RowFactory.create("odd", i);
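
A Row's values can then be read back by position; a small sketch, assuming the row created above:

Object num  = row.get(1);        // the boxed value of i
String type = row.getString(0);  // "odd", via the typed accessor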

Create a Seq in Java

import scala.collection.JavaConversions;
import scala.collection.Seq;
import static java.util.Arrays.asList;

// asScalaBuffer wraps the Java list as a Scala Buffer, which is a Seq
Seq<String> seq = JavaConversions.asScalaBuffer(asList("col_1", "col_2"));
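
Note that JavaConversions is deprecated as of Scala 2.12; on newer Scala versions the same conversion can be sketched with JavaConverters instead:

import scala.collection.JavaConverters;

// Wrap the Java list, then materialize it as a Scala Seq
Seq<String> seq2 = JavaConverters.asScalaBufferConverter(asList("col_1", "col_2"))
                                 .asScala()
                                 .toSeq();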

Create a DataFrame and join two DataFrames in Java

import java.util.ArrayList;
import java.util.List;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import static org.apache.spark.sql.types.DataTypes.createStructField;

// Build five rows of the form ("odd", "0") ... ("odd", "4")
List<Row> data = new ArrayList<>();
for (int i = 0; i < 5; i++) {
    List<String> mlist = new ArrayList<>();
    mlist.add("odd");
    mlist.add(String.valueOf(i));
    Row row = RowFactory.create(mlist.toArray());
    data.add(row);
}

// Schema: two non-nullable string columns
StructType schema = DataTypes.createStructType(new StructField[]{
        createStructField("types", DataTypes.StringType, false),
        createStructField("nums", DataTypes.StringType, false)
});



// A second data set with an extra column: ("odd", i, 5 - i)
List<Row> data2 = new ArrayList<>();
for (int i = 0; i < 5; i++) {
    List<String> mlist = new ArrayList<>();
    mlist.add("odd");
    mlist.add(String.valueOf(i));
    mlist.add(String.valueOf(5 - i));
    Row row = RowFactory.create(mlist.toArray());
    data2.add(row);
}

// Schema: three non-nullable string columns
StructType schema2 = DataTypes.createStructType(new StructField[]{
        createStructField("types", DataTypes.StringType, false),
        createStructField("nums", DataTypes.StringType, false),
        createStructField("ad", DataTypes.StringType, false)
});
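
The snippets below call methods on a SparkSession named spark. A minimal local setup (the app name "demo" is just a placeholder) looks like:

import org.apache.spark.sql.SparkSession;

SparkSession spark = SparkSession.builder()
        .appName("demo")      // placeholder app name
        .master("local[*]")   // run locally on all cores
        .getOrCreate();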


// Create DataFrames from the in-memory rows and their schemas
Dataset<Row> df = spark.createDataFrame(data, schema);
Dataset<Row> df2 = spark.createDataFrame(data2, schema2);
df.show();
df2.show();

// Inner join on an explicit condition; note that the result keeps
// both copies of the join columns "types" and "nums"
Dataset<Row> df_join = df.join(df2,
        df.col("types").equalTo(df2.col("types"))
          .and(df.col("nums").equalTo(df2.col("nums"))), "inner");
df_join.show();
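
To get rid of the duplicated join columns after a condition-based join, one option is to drop df2's copies; a sketch:

// Dataset.drop(Column) removes a specific side's column
Dataset<Row> df_dedup = df_join.drop(df2.col("types")).drop(df2.col("nums"));
df_dedup.show();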


// The "usingColumns" variant joins on the named columns (inner join
// by default) and keeps a single copy of "nums" and "types"
df_join = df.join(df2, JavaConversions.asScalaBuffer(asList("nums", "types")));
df_join.show();
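
When joining on a single column there is also a plain String overload, so no Scala Seq is needed; a sketch:

// Single-column "using" join; also keeps one copy of "nums"
Dataset<Row> df_join_single = df.join(df2, "nums");
df_join_single.show();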
