-- 写在前面
-- 学以致用:本项目通过分析深圳市开放数据中的轨道交通客流情况,了解深漂的我们每天上下班都经历了些什么……
-- 本系列项目以 Spark 技术栈为主。
-- 花絮
-- List the databases that already exist before creating anything.
SHOW DATABASES;

-- One database per warehouse layer; each is created only when missing.
CREATE DATABASE IF NOT EXISTS szdw_ods;  -- ODS: raw tables
CREATE DATABASE IF NOT EXISTS szdw_dwd;  -- DWD: cleaned detail tables
CREATE DATABASE IF NOT EXISTS szdw_dws;  -- DWS: wide tables
CREATE DATABASE IF NOT EXISTS szdw_ads;  -- ADS: business-facing tables

-- Make the ODS database current and show the tables it holds.
USE szdw_ods;
SHOW TABLES;
-- Layer 1 (ODS): raw swipe records, loaded as-is with no transformation.
-- Sample record, fields in table column order:
-- OGT-101 |FFHEDIBCC|2018-09-01 00:00:00|地铁五号线 |0 |2018-08-31 23:11:06|665 |地铁出站 |700 |263031101|长龙 |
--
-- The table is dropped and recreated on every run. It is declared EXTERNAL,
-- so the DROP is expected to remove only the metadata and leave the data files.
--
-- deal_date is labelled as the swipe time rather than the entry time: the
-- sample record above is an exit record, and the table also holds bus records.
--
-- NOTE(review): in the sample record deal_money is 665 and deal_value is 700,
-- so the labels '应该收入' (amount due) and '实际收入' (amount received) may be
-- swapped -- confirm against the dataset's data dictionary before relying on them.
DROP TABLE IF EXISTS szdw_ods.subway_swipe_record_20180901;
CREATE EXTERNAL TABLE szdw_ods.subway_swipe_record_20180901 (
    car_no       STRING COMMENT '车',
    card_no      STRING COMMENT '卡号',
    close_date   STRING COMMENT '结算时间',
    company_name STRING COMMENT '线名',
    conn_mark    STRING COMMENT '联程标记',
    deal_date    STRING COMMENT '刷卡时间',
    deal_money   STRING COMMENT '应该收入',
    deal_type    STRING COMMENT '出行类型',
    deal_value   STRING COMMENT '实际收入',
    equ_no       STRING COMMENT '闸机号',
    station      STRING COMMENT '站名'
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
-- Load the CSV file from the local filesystem into the ODS table.
-- OVERWRITE replaces whatever the table held before the load.
LOAD DATA LOCAL INPATH
    '/Users/liuge36/IdeaProjects/liuge-flink/data/csv/part-00000-0116a51f-7a9c-4916-b2cd-546edce131f5-c000.csv'
OVERWRITE INTO TABLE szdw_ods.subway_swipe_record_20180901;
-- Preview ten loaded rows (all eleven columns, in table order).
SELECT
    car_no,
    card_no,
    close_date,
    company_name,
    conn_mark,
    deal_date,
    deal_money,
    deal_type,
    deal_value,
    equ_no,
    station
FROM szdw_ods.subway_swipe_record_20180901
LIMIT 10;

-- List the distinct trip types present in the raw data.
SELECT collect_set(deal_type)
FROM szdw_ods.subway_swipe_record_20180901;
-- Result recorded by the author: ["地铁入站","地铁出站","巴士"]
-- (subway entry, subway exit, bus)
-- 第二层:DWD 清洗降维层
-- 区分维表(dim_)和事实表(fact_)。为了使粒度更加细化,我们把进站和出站记录分开;巴士数据暂不考虑。
-- DWD fact table: subway entry and exit swipe records, partitioned by dw_dt.
-- Columns mirror the ODS table szdw_ods.subway_swipe_record_20180901.
--
-- deal_date is labelled as the swipe time rather than the entry time,
-- because this table holds both entry ('地铁入站') and exit ('地铁出站') rows.
--
-- The table is dropped and recreated on every run. It is declared EXTERNAL,
-- so the DROP is expected to remove only the metadata and leave the data files.
DROP TABLE IF EXISTS szdw_dwd.fact_subway_in_out_detail;
CREATE EXTERNAL TABLE szdw_dwd.fact_subway_in_out_detail (
    car_no       STRING COMMENT '车',
    card_no      STRING COMMENT '卡号',
    close_date   STRING COMMENT '结算时间',
    company_name STRING COMMENT '线名',
    conn_mark    STRING COMMENT '联程标记',
    deal_date    STRING COMMENT '刷卡时间',
    deal_money   STRING COMMENT '应该收入',
    deal_type    STRING COMMENT '出行类型',
    deal_value   STRING COMMENT '实际收入',
    equ_no       STRING COMMENT '闸机号',
    station      STRING COMMENT '站名'
)
PARTITIONED BY (dw_dt STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
-- Fill the 2018-09-01 partition from the ODS table.
-- Keeps rows whose deal_date lies strictly between 06:14:00 and 23:59:00 on
-- 2018-09-01 and whose deal_type is not '巴士' (bus). The result is written
-- in ascending deal_date order.
INSERT OVERWRITE TABLE szdw_dwd.fact_subway_in_out_detail
PARTITION (dw_dt = '2018-09-01')
SELECT
    car_no,
    card_no,
    close_date,
    company_name,
    conn_mark,
    deal_date,
    deal_money,
    deal_type,
    deal_value,
    equ_no,
    station
FROM szdw_ods.subway_swipe_record_20180901
WHERE unix_timestamp(deal_date, 'yyyy-MM-dd HH:mm:ss')
          > unix_timestamp('2018-09-01 06:14:00', 'yyyy-MM-dd HH:mm:ss')
  AND unix_timestamp(deal_date, 'yyyy-MM-dd HH:mm:ss')
          < unix_timestamp('2018-09-01 23:59:00', 'yyyy-MM-dd HH:mm:ss')
  AND deal_type <> '巴士'
ORDER BY deal_date;

-- Row-count check for the partition; the author recorded 780937.
SELECT COUNT(*)
FROM szdw_dwd.fact_subway_in_out_detail
WHERE dw_dt = '2018-09-01';
-- DWD fact table: subway entry swipe records only, partitioned by dw_dt.
-- Same column layout as szdw_dwd.fact_subway_in_out_detail.
-- The table is dropped and recreated on every run.
DROP TABLE IF EXISTS szdw_dwd.fact_subway_in_detail;
CREATE EXTERNAL TABLE szdw_dwd.fact_subway_in_detail (
    car_no       STRING COMMENT '车',
    card_no      STRING COMMENT '卡号',
    close_date   STRING COMMENT '结算时间',
    company_name STRING COMMENT '线名',
    conn_mark    STRING COMMENT '联程标记',
    deal_date    STRING COMMENT '进站时间',
    deal_money   STRING COMMENT '应该收入',
    deal_type    STRING COMMENT '出行类型',
    deal_value   STRING COMMENT '实际收入',
    equ_no       STRING COMMENT '闸机号',
    station      STRING COMMENT '站名'
)
PARTITIONED BY (dw_dt STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
-- Fill the 2018-09-01 partition with the entry records ('地铁入站') taken
-- from the same partition of the combined entry/exit table.
-- The result is written in ascending deal_date order.
INSERT OVERWRITE TABLE szdw_dwd.fact_subway_in_detail
PARTITION (dw_dt = '2018-09-01')
SELECT
    car_no,
    card_no,
    close_date,
    company_name,
    conn_mark,
    deal_date,
    deal_money,
    deal_type,
    deal_value,
    equ_no,
    station
FROM szdw_dwd.fact_subway_in_out_detail
WHERE dw_dt = '2018-09-01'
  AND deal_type = '地铁入站'
ORDER BY deal_date;

-- Row-count check for the partition; the author recorded 415386.
SELECT COUNT(*)
FROM szdw_dwd.fact_subway_in_detail
WHERE dw_dt = '2018-09-01';
-- DWD fact table: subway exit swipe records only, partitioned by dw_dt.
-- Same column layout as szdw_dwd.fact_subway_in_out_detail.
--
-- deal_date is labelled as the exit time: this table is loaded only with
-- '地铁出站' (subway exit) rows, so the former "entry time" label was wrong.
--
-- The table is dropped and recreated on every run. It is declared EXTERNAL,
-- so the DROP is expected to remove only the metadata and leave the data files.
DROP TABLE IF EXISTS szdw_dwd.fact_subway_out_detail;
CREATE EXTERNAL TABLE szdw_dwd.fact_subway_out_detail (
    car_no       STRING COMMENT '车',
    card_no      STRING COMMENT '卡号',
    close_date   STRING COMMENT '结算时间',
    company_name STRING COMMENT '线名',
    conn_mark    STRING COMMENT '联程标记',
    deal_date    STRING COMMENT '出站时间',
    deal_money   STRING COMMENT '应该收入',
    deal_type    STRING COMMENT '出行类型',
    deal_value   STRING COMMENT '实际收入',
    equ_no       STRING COMMENT '闸机号',
    station      STRING COMMENT '站名'
)
PARTITIONED BY (dw_dt STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
-- Fill the 2018-09-01 partition with the exit records ('地铁出站') taken
-- from the same partition of the combined entry/exit table.
-- The result is written in ascending deal_date order.
INSERT OVERWRITE TABLE szdw_dwd.fact_subway_out_detail
PARTITION (dw_dt = '2018-09-01')
SELECT
    car_no,
    card_no,
    close_date,
    company_name,
    conn_mark,
    deal_date,
    deal_money,
    deal_type,
    deal_value,
    equ_no,
    station
FROM szdw_dwd.fact_subway_in_out_detail
WHERE dw_dt = '2018-09-01'
  AND deal_type = '地铁出站'
ORDER BY deal_date;

-- Row-count check for the partition; the author recorded 365551.
SELECT COUNT(*)
FROM szdw_dwd.fact_subway_out_detail
WHERE dw_dt = '2018-09-01';
-- DWS 宽表
-- szdw_ods.subway_swipe_record_20180901
-- szdw_dwd.fact_subway_in_out_detail
-- szdw_dwd.fact_subway_in_detail
-- szdw_dwd.fact_subway_out_detail
-- szdw_dws.subway_card_record_day_wide
-- DWS wide table: a card number, eight string-array columns and a record
-- count, partitioned by dw_dt. Each *_arr column is named after the detail
-- column it gathers. `count` is back-quoted because it is also a function name.
-- The table is dropped and recreated on every run.
DROP TABLE IF EXISTS szdw_dws.subway_card_record_day_wide;
CREATE EXTERNAL TABLE szdw_dws.subway_card_record_day_wide (
    card_no          STRING,
    deal_date_arr    ARRAY<STRING>,
    deal_value_arr   ARRAY<STRING>,
    deal_type_arr    ARRAY<STRING>,
    company_name_arr ARRAY<STRING>,
    station_arr      ARRAY<STRING>,
    conn_mark_arr    ARRAY<STRING>,
    deal_money_arr   ARRAY<STRING>,
    equ_no_arr       ARRAY<STRING>,
    `count`          INT
)
PARTITIONED BY (dw_dt STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
-- Fill the 2018-09-01 partition with one row per card_no: every detail
-- column is gathered into an array and the card's records are counted.
-- Rows are written with the highest record count first.
-- NOTE(review): nothing orders the rows before collect_list, so the order of
-- elements inside each array is presumably arrival order -- confirm whether
-- downstream logic needs the arrays sorted by deal_date.
INSERT OVERWRITE TABLE szdw_dws.subway_card_record_day_wide
PARTITION (dw_dt = '2018-09-01')
SELECT
    card_no,
    collect_list(deal_date),
    collect_list(deal_value),
    collect_list(deal_type),
    collect_list(company_name),
    collect_list(station),
    collect_list(conn_mark),
    collect_list(deal_money),
    collect_list(equ_no),
    COUNT(*) AS swipe_cnt
FROM szdw_dwd.fact_subway_in_out_detail
WHERE dw_dt = '2018-09-01'
GROUP BY card_no
ORDER BY swipe_cnt DESC;

-- Row-count check for the partition; the author recorded 412082.
SELECT COUNT(*)
FROM szdw_dws.subway_card_record_day_wide
WHERE dw_dt = '2018-09-01';

-- Preview ten rows of the wide table (all columns, partition column last).
SELECT
    card_no,
    deal_date_arr,
    deal_value_arr,
    deal_type_arr,
    company_name_arr,
    station_arr,
    conn_mark_arr,
    deal_money_arr,
    equ_no_arr,
    `count`,
    dw_dt
FROM szdw_dws.subway_card_record_day_wide
LIMIT 10;
--ADS 业务表, 当天的表现