前言
主要涉及RDD批处理、Spark SQL、Spark Streaming、Spark ML的pyspark实践学习
一、RDD批处理
运行环境:个人电脑
from pyspark import SparkConf, SparkContext
# import matplotlib.pyplot as plt
# from matplotlib.pyplot import hist
import numpy as np
import os
# Configure the runtime environment for Spark on Windows.
# Use raw strings for the Windows paths: the originals relied on CPython
# keeping invalid escape sequences (e.g. "\P", "\s") literally, which emits
# DeprecationWarning/SyntaxWarning on modern Python and silently breaks if a
# path segment ever starts with a valid escape such as "\t" or "\n".
os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jdk1.8.0_171'
os.environ['SPARK_HOME'] = r'C:\spark-2.2.0-bin-hadoop2.7'
os.environ['HADOOP_HOME'] = r'C:\hadoop.dll-and-winutils.exe-for-hadoop2.7.3-on-windows_X64-master'
# Set up the Spark context and load the raw data.
# NOTE(review): the data path below is machine-specific and must be changed
# to match the local checkout of the ml-100k dataset.
conf = (
    SparkConf()
    .setMaster("local[*]")   # run locally using every available core
    .setAppName("First_App")
)
sc = SparkContext(conf=conf)
# 1. Explore the user data.
user_data = sc.textFile(r'D:\projects\sparklearn_data\ml-100k\u.user')
# First record looks like "1|24|M|technician|85711" — presumably
# id|age|gender|occupation|zip; verify against the dataset's README.
print(user_data.first())