此处我们对数据进行以下的处理以方便后续的数据分析和可视化工作:
数据清洗
- 去除重复数据
- 去除购买人数为空的记录
- 类型转换:将购买人数转换为数值型数据
- 字段扩充:增加收入列,价格*购买人数=收入
- 字段扩充:增加商品价格分箱数据
- 提取省份名称字段。
- 对商品名称进行分词处理。
# Load the raw export; work on a copy so df_all keeps the untouched data.
df_all = pd.read_excel('热干面数据.xlsx')
df = df_all.copy()

# Drop exact duplicate rows.
df.drop_duplicates(inplace=True)

# Keep only rows whose purchase_num text contains '人付款'.
# na=False treats a missing purchase_num as "no match", so those rows are
# dropped instead of producing an NA mask that boolean indexing rejects.
# .copy() gives the column assignments below an independent frame to write to.
df = df[df['purchase_num'].str.contains('人付款', na=False)].copy()

# First number in the text (integer or decimal), converted to float.
df['num'] = (
    df['purchase_num']
    .str.extract(r'(\d+\.{0,1}\d*)', expand=False)
    .astype('float')
)

# Multiplier: 10000 when the text carries the unit '万', otherwise 1.
df['unit'] = df['purchase_num'].str.contains('万').map({True: 10000, False: 1})

# Replace the text column with the actual purchase count.
df['purchase_num'] = df['num'] * df['unit']

# Province is the first space-separated token of location; the .str[0]
# accessor leaves a missing location as NaN instead of raising.
df['province_name'] = df['location'].str.split(' ').str[0]

# Drop the helper columns.
df.drop(['num', 'unit'], axis=1, inplace=True)

# Renumber rows from 0 after the filtering above.
df = df.reset_index(drop=True)
df.head()
数据可视化
使用pyecharts进行动态数据可视化展示。我们主要对以下几个方面信息进行分析。
- 店铺销量排名Top10,看看哪些店铺销量高。
- 各省份销量排名Top10,看看销量最高的热干面都来自哪里。
- 全国省份销量地区分布
- 商品标题文本分析,看看热干面搜索的结果页面中,哪类关键词出现得比较多。
- 商品价格分布和各价格区间的销量表现。
店铺销量排名Top10
# 导入包
from pyecharts.charts import Bar
from pyecharts import options as opts
# Sum purchase_num per shop, then keep the ten largest totals (descending).
shop_sales = df.groupby('shop_name')['purchase_num'].sum()
shop_top10 = shop_sales.sort_values(ascending=False).head(10)

# Bar chart: shop names on the x axis, summed purchase_num on the y axis.
bar1 = (
    Bar(init_opts=opts.InitOpts(width='1350px', height='750px'))
    .add_xaxis(shop_top10.index.tolist())
    .add_yaxis('sales_num', shop_top10.values.tolist())
    .set_global_opts(
        title_opts=opts.TitleOpts(title='热干面店铺商品销量Top10'),
        # Axis labels are rotated by -15.
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
        # The visual map's upper bound is the largest value plotted.
        visualmap_opts=opts.VisualMapOpts(max_=shop_top10.values.max()),
    )
)
bar1.render()
各省份销量排名Top10
# Sum purchase_num per province, then keep the ten largest totals (descending).
province_sales = df.groupby('province_name')['purchase_num'].sum()
province_top10 = province_sales.sort_values(ascending=False).head(10)

# Chart options, named up front so the chart construction reads linearly.
bar2_size = opts.InitOpts(width='1350px', height='750px')
bar2_title = opts.TitleOpts(title='热干面商品销量省份排名Top10')
# The visual map's upper bound is the largest value plotted.
bar2_visualmap = opts.VisualMapOpts(max_=province_top10.values.max())

# Bar chart: province names on the x axis, summed purchase_num on the y axis.
bar2 = Bar(init_opts=bar2_size)
bar2.add_xaxis(province_top10.index.tolist())
bar2.add_yaxis('sales_num', province_top10.values.tolist())
bar2.set_global_opts(title_opts=bar2_title, visualmap_opts=bar2_visualmap)
bar2.render()
全国省份销量地区分布
from pyecharts.charts import Map
# Sum purchase_num per province, largest first.
province_num = df.groupby('province_name')['purchase_num'].sum().sort_values(ascending=False)

# [province, total] pairs in the list-of-lists form passed to Map.add.
province_pairs = [
    [name, total]
    for name, total in zip(province_num.index.tolist(), province_num.values.tolist())
]

# China map with one value per province.
map1 = (
    Map(init_opts=opts.InitOpts(width='1350px', height='750px'))
    .add("", province_pairs, maptype='china')
    .set_global_opts(
        title_opts=opts.TitleOpts(title='国内各省份热干面销量分布'),
        # Fixed upper bound for the visual map (not derived from the data).
        visualmap_opts=opts.VisualMapOpts(max_=300000),
        toolbox_opts=opts.ToolboxOpts(),
    )
)
map1.render()
商品价格分布和各价格区间的销量表现
def tranform_price(x):
    """Return the price-band label for price ``x``.

    Bands are closed on the right: ``x <= 20`` gives ``'0~20'``, then
    ``'20~50'``, ``'50~100'`` and ``'100~200'``. Any value that satisfies
    none of these comparisons gets ``'200~2500'``.
    """
    # (inclusive upper bound, label), checked in ascending order.
    bands = (
        (20, '0~20'),
        (50, '20~50'),
        (100, '50~100'),
        (200, '100~200'),
    )
    for upper, label in bands:
        if x <= upper:
            return label
    return '200~2500'
# Label every row with its price band.
df['price_cut'] = df['price'].apply(tranform_price)

# Number of rows per price band.
price_num = df['price_cut'].value_counts()
price_num

# Bar chart: price bands on the x axis, row counts on the y axis.
bar3 = (
    Bar(init_opts=opts.InitOpts(width='1350px', height='750px'))
    .add_xaxis(price_num.index.tolist())
    .add_yaxis('price_num', price_num.values.tolist())
    .set_global_opts(
        title_opts=opts.TitleOpts(title='不同价格区间的商品数量'),
        # Fixed upper bound for the visual map (not derived from the data).
        visualmap_opts=opts.VisualMapOpts(max_=1500),
    )
)
bar3.render()
from pyecharts.charts import Pie
# Sum purchase_num per price band.
price_cut_num = df.groupby('price_cut')['purchase_num'].sum()

# [band, total] pairs. .tolist() converts the index and values to native
# Python objects, matching how the other charts in this file pass their data.
data_pair = [
    list(z)
    for z in zip(price_cut_num.index.tolist(), price_cut_num.values.tolist())
]

# Ring-shaped pie (inner radius 35%, outer radius 55%).
pie1 = Pie(init_opts=opts.InitOpts(width='1350px', height='750px'))
# Labels use rich-text styles: the keys of `rich` ("a", "abg", "hr", "b",
# "per") are the style names referenced inside `formatter`.
pie1.add(
    series_name="sales",
    radius=["35%", "55%"],
    data_pair=data_pair,
    label_opts=opts.LabelOpts(
        position="outside",
        formatter="{a|{a}}{abg|}\n{hr|}\n {b|{b}: }{c} {per|{d}%} ",
        background_color="#eee",
        border_color="#aaa",
        border_width=1,
        border_radius=4,
        rich={
            "a": {"color": "#999", "lineHeight": 22, "align": "center"},
            "abg": {
                "backgroundColor": "#e3e3e3",
                "width": "100%",
                "align": "right",
                "height": 22,
                "borderRadius": [4, 4, 0, 0],
            },
            "hr": {
                "borderColor": "#aaa",
                "width": "100%",
                "borderWidth": 0.5,
                "height": 0,
            },
            "b": {"fontSize": 16, "lineHeight": 33},
            "per": {
                "color": "#eee",
                "backgroundColor": "#334455",
                "padding": [2, 4],
                "borderRadius": 2,
            },
        },
    ),
)
# Vertical legend on the left, toolbox enabled, chart title set.
pie1.set_global_opts(
    legend_opts=opts.LegendOpts(pos_left="left", pos_top='30%', orient="vertical"),
    toolbox_opts=opts.ToolboxOpts(),
    title_opts=opts.TitleOpts(title='热干面不同价格销量占比'),
)
# Tooltip is triggered per item and uses the formatter string below.
pie1.set_series_opts(
    tooltip_opts=opts.TooltipOpts(trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)")
)
pie1.render()
商品标题文本分析 - 词云图
# 分词
import jieba
import jieba.analyse
# Join every goods_name into one text, separated by '。'.
txt = df['goods_name'].str.cat(sep='。')

# Register '热干面' with jieba as a word.
jieba.add_word('热干面')

# Stop words: one entry per line of stop_words.txt, whitespace-stripped.
with open('stop_words.txt', 'r', encoding='utf-8') as f:
    stop_words = [line.strip() for line in f]

# Extra stop words appended to those read from the file.
stop_words += ['10', '12', '20', '200g', '500g', '900g', '300g']

# Top 100 keywords of the joined text as (word, weight) pairs.
word_num = jieba.analyse.extract_tags(
    txt,
    topK=100,
    withWeight=True,
    allowPOS=(),
)

# Keep only the pairs whose word is not a stop word.
word_num_selected = [pair for pair in word_num if pair[0] not in stop_words]

# Column 'num' holds the second element of each pair returned above.
key_words = pd.DataFrame(word_num_selected, columns=['words', 'num'])
from pyecharts.charts import WordCloud
from pyecharts.globals import SymbolType
# Word cloud built from the (word, num) pairs in key_words.
word1 = WordCloud(init_opts=opts.InitOpts(width='1350px', height='750px'))
word1.add(
    "",
    [*zip(key_words.words, key_words.num)],
    word_size_range=[20, 200],
    shape=SymbolType.DIAMOND,
)
word1.set_global_opts(
    # Pass the title by keyword, like every other chart in this file, so the
    # call does not depend on the positional order of TitleOpts parameters.
    title_opts=opts.TitleOpts(title='热干面店铺商品关键词分布'),
    toolbox_opts=opts.ToolboxOpts(),
)
word1.render()
在一个页面中生成所有图
from pyecharts.charts import Page
# Put all six charts on one page, in this order, and write it to one HTML file.
all_charts = (bar1, bar2, map1, bar3, pie1, word1)
page = Page()
page.add(*all_charts)
page.render('热干面数据分析.html')
https://mp.weixin.qq.com/s/GcVVfeU3SvpjNX6G1qMZrQ