Practice code for a Spark e-commerce project

I have just started learning Spark and came across an e-commerce project; below is the practice code for that project.

# -*- coding: utf-8 -*-
# Program function: read the e-commerce data and run the analysis queries
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Import data types
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

# You can run this Spark code with a local PySpark environment or with the
# PySpark environment inside a VM; both can be configured through os.environ
os.environ['SPARK_HOME'] = '/export/server/spark-2.3.0-bin-hadoop2.7'
PYSPARK_PYTHON = "/root/anaconda3/envs/pyspark_env/bin/python"
# When several Python versions are installed, not pinning one is likely to cause errors
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON


if __name__ == '__main__':
    # 1 - set up the Spark environment
    spark = SparkSession.builder \
        .appName('test') \
        .getOrCreate()
    sc = spark.sparkContext
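    # If you run this outside a configured cluster, you may need to pin a
    # master explicitly (a sketch; "local[*]" is an assumption for local testing):
    # spark = SparkSession.builder.appName('test').master('local[*]').getOrCreate()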
    # 2 - load the data
    data = spark.read.format("csv")\
        .option("header", True)\
        .option("inferSchema", "true")\
        .load("file:///tmp/pycharm_project_553/ECommerce/data/E_Commerce_Data_Clean.csv")

    data.printSchema()
    data.createOrReplaceTempView("data")
    data.show(2)
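
    # The queries below rely on the dataset's columns: InvoiceNo, StockCode,
    # Description, Quantity, InvoiceDate, UnitPrice, CustomerID and Country.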

    '''
    (1) Each customer is uniquely identified by CustomerID, so the number of
        customers is COUNT(DISTINCT CustomerID).
    (2) Group by Country,
    (3) then sort by customer count in descending order and take the 10
        countries with the most customers.
    '''


    def countryCustomer():
        countryCustomerDF = spark.sql(
            """
            select Country, count(distinct CustomerID) AS countOfCustomer
            from data
            group by Country
            order by countOfCustomer DESC 
            limit 10
            """
        )
        countryCustomerDF.show()
        return countryCustomerDF.collect()

    # countryCustomer()
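    # For reference, the same top-10 query expressed with the DataFrame API
    # (a sketch equivalent to the SQL above):
    # data.groupBy("Country") \
    #     .agg(F.countDistinct("CustomerID").alias("countOfCustomer")) \
    #     .orderBy(F.desc("countOfCustomer")).limit(10).show()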

    '''
    (1) SUM(Quantity) gives the total sales volume; returned items carry a
        negative Quantity, so the sum stays correct even with returns.
    (2) Group by Country,
    (3) sort by volume in descending order,
    (4) and take the 10 countries with the highest sales volume.
    '''
    def countryQuantity():
        countryQuantityDF = spark.sql("""
            select Country, sum(Quantity) as total from data
            group by Country
            order by total desc 
            limit 10
        """)
        countryQuantityDF.show()
        return countryQuantityDF.collect()
    # countryQuantity()


    '''
    (1) UnitPrice is the unit price and Quantity the quantity sold; returned
        records carry a negative Quantity, so SUM(UnitPrice*Quantity) yields
        the total revenue even with returns.
    (2) Group by Country to compute each country's total revenue.
    '''
    def countrySumOfPrice():
        countrySumOfPriceDF = spark.sql("""
            select Country, round(sum(UnitPrice*Quantity), 2) as sum_money from data
            group by Country order by sum_money desc
        """)
        countrySumOfPriceDF.show()
        return countrySumOfPriceDF.collect()
    # countrySumOfPrice()
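    # Variant (sketch): revenue from purchases only, excluding the returns,
    # assuming returns are exactly the rows with negative Quantity:
    # spark.sql("select Country, round(sum(UnitPrice*Quantity), 2) as sum_money "
    #           "from data where Quantity > 0 group by Country order by sum_money desc").show()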

    '''
    (1) Quantity is the quantity sold; returned records carry a negative
        Quantity, so SUM(Quantity) yields the total volume even with returns.
    (2) Group by the product code StockCode to get each product's sales volume.
    '''
    def stockQuantity():
        stockQuantityDF = spark.sql(
            """
            select StockCode, sum(Quantity) as cnt from data
            group by StockCode
            order by cnt desc
            limit 10
            """
        )
        stockQuantityDF.show()
        return stockQuantityDF.collect()
    # stockQuantity()


    '''
    (1) Description holds the product description, made up of several words;
        LOWER(Description) normalizes them to lower case:
        SELECT LOWER(Description) as description from data
    (2) The result is a DataFrame; use df.withColumn to create a "words" column,
        flattening the split words with the explode function.
    (3) Group by words, count, and sort by count in descending order.
    (4) Filter out empty strings; df.take(300) would return the rows
        (the code below only shows the top 20).
    '''

    def wordCount():
        df_words = spark.sql("select lower(Description) as description from data")
        df_news = df_words.withColumn("words", F.explode(F.split(F.col("description"), " ")))
        countOrderByDF = df_news.groupBy("words").count().orderBy("count", ascending=False)
        wordCountDF = countOrderByDF.filter(countOrderByDF["words"] != "")
        wordCountDF.show(20)
    # wordCount()
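    # Inside wordCount one might additionally drop common stop words before
    # showing the counts (a sketch; the word list is a hypothetical example):
    # stop_words = ["of", "the", "and"]
    # wordCountDF.filter(~F.col("words").isin(stop_words)).show(20)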


    '''
    3.3.4.6 Top 10 countries by number of return orders
    (1) InvoiceNo is the order number, so the total order count is
        COUNT(DISTINCT InvoiceNo).
    (2) Return orders have an invoice number starting with the letter C,
        e.g. C540250, so WHERE InvoiceNo LIKE 'C%' selects them.
    (3) Group by Country,
    (4) sort by return-order count in descending order,
    (5) and take the 10 countries with the most return orders.
    '''
    def countryReturnInvoice():
        result = spark.sql("""
            select Country, COUNT(Distinct InvoiceNo) as countInvoiceNo from data 
            where InvoiceNo like 'C%'
            group by Country
            order by countInvoiceNo desc
            limit 10      
        """)
        result.show()
    # countryReturnInvoice()

    # 3.3.4.7 Relationship between a product's average unit price and its sales volume
    '''
    (1) Because a product's UnitPrice changes over time, AVG(DISTINCT UnitPrice)
        is used to characterize it.
    (2) SUM(Quantity) then computes the sales volume.
    (3) Group the result by the product code StockCode.
    (4) Calling collect() returns the result as a list of rows.
    '''
    def unitPriceSales():
        result = spark.sql("""
            select StockCode, AVG(DISTINCT UnitPrice), SUM(Quantity)
            from data
            group by StockCode
        """)
        result.show(5)
        return result.collect()
    # unitPriceSales()
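    # To put a number on that relationship, one could compute the correlation
    # between average price and total volume (a sketch; the aliases are assumed):
    # df = spark.sql("select StockCode, avg(distinct UnitPrice) as avgUnitPrice, "
    #                "sum(Quantity) as sumQuantity from data group by StockCode")
    # print(df.stat.corr("avgUnitPrice", "sumQuantity"))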

    # Trend of monthly sales revenue over time
    def formatData():
        # Assumes InvoiceDate was read as a string like "12/1/2010 8:26"
        # (month/day/year followed by a time)
        tradeRDD = data.select("InvoiceDate", "Quantity", "UnitPrice").rdd
        # Keep only the date part, dropping the time
        result1 = tradeRDD.map(lambda line: (line["InvoiceDate"].split(" ")[0], line["Quantity"], line["UnitPrice"]))
        result2 = result1.map(lambda line: (line[0].split("/"), line[1], line[2]))
        # Reorder to (year, zero-padded month, zero-padded day, Quantity, UnitPrice)
        result3 = result2.map(lambda line: (line[0][2],
                                            line[0][0] if len(line[0][0]) == 2 else "0" + line[0][0],
                                            line[0][1] if len(line[0][1]) == 2 else "0" + line[0][1],
                                            line[1],
                                            line[2]))
        return result3
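
    # Worked example: assuming a row ("12/1/2010 8:26", 6, 2.55), formatData
    # yields ("2010", "12", "01", 6, 2.55), ready to be keyed as "2010-12".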


    def tradePrice():
        result3 = formatData()
        # Key by "year-month", value = Quantity * UnitPrice (revenue per record)
        result4 = result3.map(lambda line: (line[0] + "-" + line[1], line[3] * line[4]))
        # Sum the revenue per month and sort chronologically
        result5 = result4.reduceByKey(lambda a, b: a + b).sortByKey()
        schema = StructType([StructField("date", StringType(), True),
                             StructField("tradePrice", DoubleType(), True)])
        tradePriceDF = spark.createDataFrame(result5, schema)
        tradePriceDF.show(3)
    # tradePrice()

    # 8. Trend of daily sales volume over time
    def saleQuantity():
        result3 = formatData()
        # Key by "year-month-day", value = Quantity
        result4 = result3.map(lambda line: (line[0] + "-" + line[1] + "-" + line[2], line[3]))
        # Sum the volume per day and sort chronologically
        result5 = result4.reduceByKey(lambda a, b: a + b).sortByKey()
        schema = StructType([StructField("date", StringType(), True), StructField("saleQuantity", IntegerType(), True)])
        saleQuantityDF = spark.createDataFrame(result5, schema)
        saleQuantityDF.show(10)
    # saleQuantity()

    # Relationship between purchase-order and return-order counts by country
    '''
    (1) InvoiceNo is the order number; return orders have an invoice number
        starting with the letter C, e.g. C540250. COUNT(DISTINCT InvoiceNo)
        counts the orders.
    (2) WHERE InvoiceNo LIKE 'C%' and WHERE InvoiceNo NOT LIKE 'C%' then count
        the return orders and the purchase orders respectively.
    (3) Group each by Country; returnDF and buyDF are both DataFrames, holding
        the return orders and the purchase orders.
    (4) Join the two DataFrames on equal Country, giving one DataFrame.
    (5) That DataFrame has 4 columns: two duplicate Country columns, a
        return-order count and a purchase-order count; to cut the redundancy,
        select 3 fields to form buyReturnDF (resultDF in the code below).
    '''
    def buyReturn():
        # Purchase orders: invoice numbers that do NOT start with C
        buyDF = spark.sql("""
            select count(distinct InvoiceNo) as countInvoiceNo, Country
            from data
            where InvoiceNo not like 'C%'
            group by Country
        """)
        buyDF.show(2)

        # Return orders: invoice numbers that start with C
        returnDF = spark.sql("""
            select count(distinct InvoiceNo) as countInvoiceNo, Country
            from data
            where InvoiceNo like 'C%'
            group by Country
        """)
        returnDF.show(2)

        buyDF.createOrReplaceTempView("buy_data")
        returnDF.createOrReplaceTempView("return_data")

        # Join on Country and keep 3 fields to avoid the duplicate columns
        resultDF = spark.sql("""
            select t1.Country, t1.countInvoiceNo as buyCount, t2.countInvoiceNo as returnCount
            from buy_data as t1
            inner join return_data as t2
            on t1.Country = t2.Country
        """)

        resultDF.show(5)
    buyReturn()
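    # A possible follow-up (sketch): derive a per-country return rate from the
    # joined result inside buyReturn, using the column aliases produced above:
    # resultDF.withColumn("returnRate", F.col("returnCount") / F.col("buyCount")).show(5)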


Reposted from blog.csdn.net/HELLOWORLD2424/article/details/129797678