from pyspark import SparkContext,SparkConf
# Spark configuration. NOTE: a real project would not hard-code master/app name like this.
sc = SparkContext("local", "context")
'''
RDD operations fall into two categories:
1. transformations: map, filter, groupBy, ...
2. actions: reduce, count, collect, ...
Key properties:
1) transformations are lazy — nothing actually happens until an action is called;
2) an action triggers the computation;
3) an action returns values to the driver or writes data to external storage.
'''
def word_map():
    """Demo of map(func): apply func to every element, producing a new RDD.

    Classic word-count first step: word => (word, 1).
    """
    a = sc.parallelize(["Hadoop", "HDFS", "Mapreduce", "Spark", "Mapreduce", "Spark"])
    b = a.map(lambda x: (x, 1))
    print(b.collect())
    # Output: [('Hadoop', 1), ('HDFS', 1), ('Mapreduce', 1), ('Spark', 1),
    #          ('Mapreduce', 1), ('Spark', 1)]
def my_filter():
    """Demo of filter(func): keep only the elements for which func returns True."""
    data = [1, 2, 3, 4, 5]
    rdd = sc.parallelize(data)
    # Chained-transformation style: square every element, then keep values > 10.
    new_rdd = rdd.map(lambda x: x ** 2).filter(lambda x: x > 10)
    print(new_rdd.collect())
    # Output: [16, 25]
def my_flatMap():
    """Demo of flatMap(func): like map, but each input item may map to 0..n output items.

    func should therefore return a sequence rather than a single item.
    """
    data = ["hello spark", "hello world", "hello world"]
    rdd = sc.parallelize(data)
    mapRdd = rdd.map(lambda x: x.split(" "))
    flatMapRdd = rdd.flatMap(lambda line: line.split(" "))
    # map keeps one output element per input line (a list of words per line):
    print(mapRdd.collect())      # [['hello', 'spark'], ['hello', 'world'], ['hello', 'world']]
    # flatMap flattens those lists into a single stream of words:
    print(flatMapRdd.collect())  # ['hello', 'spark', 'hello', 'world', 'hello', 'world']
def word_groupbyKey():
    """Demo of groupByKey: group the values of a (K, V) RDD by key.

    Each group comes back as a lazy ResultIterable; wrap it in list() to see the values.
    """
    data = ["hello spark", "hello world", "hello world"]
    rdd = sc.parallelize(data)
    mapRdd = rdd.flatMap(lambda line: line.split(" ")).map(lambda x: (x, 1))
    groupbyRdd = mapRdd.groupByKey()
    print(groupbyRdd.collect())
    # Output:
    # [('hello', <pyspark.resultiterable.ResultIterable object at 0x...>),
    #  ('spark', <pyspark.resultiterable.ResultIterable object at 0x...>),
    #  ('world', <pyspark.resultiterable.ResultIterable object at 0x...>)]
    print(groupbyRdd.map(lambda x: {x[0]: list(x[1])}).collect())
    # [{'hello': [1, 1, 1]}, {'spark': [1]}, {'world': [1, 1]}]
def word_reducebyKey():
    """Demo of reduceByKey: merge the values for each key with the given function.

    NOTE: contrary to the original note, reduceByKey does not guarantee any key
    ordering in the result; use sortByKey for that.
    """
    data = ["hello spark", "hello world", "hello world"]
    rdd = sc.parallelize(data)
    mapRdd = rdd.flatMap(lambda line: line.split(" ")).map(lambda x: (x, 1))
    reducebyRdd = mapRdd.reduceByKey(lambda a, b: a + b)
    print(reducebyRdd.collect())
    # [('hello', 3), ('spark', 1), ('world', 2)]
def word_sortbyKey():
    """Demo of sortByKey: sort a (K, V) RDD by key (ascending by default)."""
    data = ["hello spark", "hello world", "hello world"]
    rdd = sc.parallelize(data)
    mapRdd = rdd.flatMap(lambda line: line.split(" ")).map(lambda x: (x, 1))
    reducebyRdd = mapRdd.reduceByKey(lambda a, b: a + b)
    print(reducebyRdd.sortByKey().collect())
    # [('hello', 3), ('spark', 1), ('world', 2)]
    # Top-K trick: swap key and value, sort descending on the (former) value, swap back.
    print(reducebyRdd.map(lambda x: (x[1], x[0])).sortByKey(False).map(lambda x: (x[1], x[0])).collect())
    # [('hello', 3), ('world', 2), ('spark', 1)]
def my_union():
    """Demo of union: concatenate two RDDs into one (duplicates are kept)."""
    a = sc.parallelize([1, 2, 3])
    b = sc.parallelize([3, 4, 5])
    print(a.union(b).collect())
    # [1, 2, 3, 3, 4, 5]
def my_distinct():
    """Demo of distinct: return a new RDD containing only the distinct elements."""
    a = sc.parallelize([1, 2, 3])
    b = sc.parallelize([3, 4, 5])
    # union keeps the duplicate 3; distinct removes it.
    print(a.union(b).distinct().collect())
def my_join():
    """Demo of join on (K, V) and (K, W) RDDs, yielding (K, (V, W)) pairs.

    Outer joins are available as leftOuterJoin, rightOuterJoin, and fullOuterJoin;
    this demo uses fullOuterJoin, so unmatched sides appear as None.
    """
    a = sc.parallelize([("A", "a1"), ("C", "c1"), ("D", "d1"), ("F", "f1"), ("F", "f2")])
    b = sc.parallelize([("A", "a2"), ("C", "c2"), ("C", "c3"), ("E", "e1")])
    print(a.fullOuterJoin(b).collect())
def my_action():
    """Demo of action-type operations: reduce and foreach (actions trigger computation)."""
    data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    rdd = sc.parallelize(data)
    # reduce: aggregate all elements with the given function, returned to the driver.
    print(rdd.reduce(lambda x, y: x + y))  # 55
    # foreach: run a side-effecting function on each element (on the executors).
    # It returns None, so the original `print(rdd.foreach(...))`, which always
    # printed "None", is dropped; a discarded bare `rdd.collect()` is removed too.
    rdd.foreach(lambda x: print(x))
# Spark study notes (1): pySpark RDD programming
# Original article: blog.csdn.net/weixin_41993767/article/details/87391287
# (Trailing web-page text from the scrape removed — it was not valid Python.)