Java version
package source;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.configuration.Configuration;
import java.util.ArrayList;
public class JavaDataSetSourceApp {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        recursive(env);
    }

    /**
     * Sets the "recursive.file.enumeration" flag on a Configuration, reads the
     * given directory as a text source with those parameters, and prints every line.
     */
    private static void recursive(ExecutionEnvironment env) throws Exception {
        Configuration conf = new Configuration();
        conf.setBoolean("recursive.file.enumeration", true);
        DataSet<String> logs = env.readTextFile("F:\\data\\wordCount1\\Recursive").withParameters(conf);
        logs.print();
    }

    /** Placeholder for a CSV-reading demo; intentionally empty. */
    public static void readCSvFile(ExecutionEnvironment env) throws Exception {
    }

    /** Reads a text source with an explicit UTF-8 charset and prints every line. */
    private static void readTextFile(ExecutionEnvironment env) throws Exception {
        DataSource<String> source = env.readTextFile("F:\\data\\wordCount1", "UTF-8");
        source.print();
    }

    /** Builds a local list of the integers 1..10, wraps it as a DataSource, and prints it. */
    private static void fromCollection(ExecutionEnvironment env) throws Exception {
        ArrayList<Integer> numbers = new ArrayList<>();
        int n = 1;
        while (n <= 10) {
            numbers.add(n);
            n++;
        }
        DataSource<Integer> dataSource = env.fromCollection(numbers);
        dataSource.print();
    }
}
Scala version
package source
import org.apache.flink.api.scala._
object DataSetDataSourceApp {

  /** Row type for the student CSV: name, age, job (mapped by position). */
  case class StudentClass(name: String, age: Int, job: String)

  def main(args: Array[String]): Unit = {
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    println(env.getParallelism)
  }

  /** Reads a text source with an explicit UTF-8 charset and prints every line. */
  def textFile(env: ExecutionEnvironment): Unit = {
    val fileSource: DataSet[String] = env.readTextFile("F:\\data\\wordCount1\\", "UTF-8")
    fileSource.print()
  }

  /**
   * Demonstrates readCsvFile three ways: first two columns as a tuple,
   * all three columns as a tuple, and all three columns as a case class.
   *
   * Fixes vs. the original: misspelled local `filePateh`; an unused DataSet
   * (`csvSource1`) that was built but never executed; a `val` that bound the
   * Unit result of `.print()`; boxed java.lang.Integer replaced by Scala Int.
   */
  def readCsv(env: ExecutionEnvironment): Unit = {
    val filePath = "F:\\data\\wordCount1\\student.txt"
    // Project only columns 0 and 1 (name, age).
    env.readCsvFile[(String, Int)](filePath, ignoreFirstLine = true, includedFields = Array(0, 1)).print()
    // All three columns as a tuple.
    env.readCsvFile[(String, Int, String)](filePath, ignoreFirstLine = true, includedFields = Array(0, 1, 2)).print()
    // All three columns mapped onto the case class.
    env.readCsvFile[StudentClass](filePath, ignoreFirstLine = true, includedFields = Array(0, 1, 2)).print()
  }

  /** Builds the range 1..10 locally, wraps it as a DataSet, prints its parallelism and contents. */
  private def fromCollection(env: ExecutionEnvironment): Unit = {
    val data = 1 to 10
    val value: DataSet[Int] = env.fromCollection(data)
    println(value.getParallelism)
    value.print()
  }
}
Summary: the useful parts here are readCsvFile and readTextFile (the latter can also read compressed files).