深圳地铁客流大数据 Spark 技术栈

写在前面

学以致用,本项目通过对深圳市开放数据中的轨道交通客流情况进行分析,了解深漂的我们每天在上下班途中都经历了些什么…

本系列项目以 Spark 技术栈为主。

花絮

-- Warehouse bootstrap.
-- Lists the databases that already exist, then makes sure the four layer
-- databases are present before any table is created in them.
SHOW DATABASES;

-- ODS layer: raw tables
CREATE DATABASE IF NOT EXISTS szdw_ods;
-- DWD layer: cleaned detail tables
CREATE DATABASE IF NOT EXISTS szdw_dwd;
-- DWS layer: wide tables
CREATE DATABASE IF NOT EXISTS szdw_dws;
-- ADS layer: business-facing tables
CREATE DATABASE IF NOT EXISTS szdw_ads;

-- Switch to the ODS database and show the tables it currently holds.
USE szdw_ods;

SHOW TABLES;

-- 1. ODS: raw swipe records, loaded as-is with no transformation.
-- Every column is kept as STRING and the file is read as comma-delimited text.
-- Sample record (11 values, matching the 11 columns below):
-- OGT-101  |FFHEDIBCC|2018-09-01 00:00:00|地铁五号线  |0        |2018-08-31 23:11:06|665       |地铁出站 |700       |263031101|长龙   |
-- NOTE(review): the sample row is an exit record, so deal_date is not only an
-- entry time as its column comment says - confirm and relabel if needed.
-- NOTE(review): in the sample row deal_money (665) is lower than deal_value (700),
-- which suggests the two income column comments may be swapped - confirm.
DROP TABLE IF EXISTS szdw_ods.subway_swipe_record_20180901;
CREATE EXTERNAL TABLE szdw_ods.subway_swipe_record_20180901 (
    car_no       STRING COMMENT '车',
    card_no      STRING COMMENT '卡号',
    close_date   STRING COMMENT '结算时间',
    company_name STRING COMMENT '线名',
    conn_mark    STRING COMMENT '联程标记',
    deal_date    STRING COMMENT '进站时间',
    deal_money   STRING COMMENT '应该收入',
    deal_type    STRING COMMENT '出行类型',
    deal_value   STRING COMMENT '实际收入',
    equ_no       STRING COMMENT '闸机号',
    station      STRING COMMENT '站名'
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Load the local CSV extract into the ODS table, replacing whatever it held.
LOAD DATA LOCAL INPATH '/Users/liuge36/IdeaProjects/liuge-flink/data/csv/part-00000-0116a51f-7a9c-4916-b2cd-546edce131f5-c000.csv'
OVERWRITE INTO TABLE szdw_ods.subway_swipe_record_20180901;

-- Spot-check a handful of loaded rows.
SELECT *
FROM szdw_ods.subway_swipe_record_20180901
LIMIT 10;

-- Collect the distinct deal_type values present in the raw data.
SELECT collect_set(deal_type)
FROM szdw_ods.subway_swipe_record_20180901;

-- Recorded result: ["地铁入站","地铁出站","巴士"]

第二层:DWD 清洗降维层
区分维表 dim_ 和事实表 fact_,为了使粒度更加细化,我们把进站和出站记录分开,巴士数据暂不考虑。

-- DWD fact table: subway entry and exit swipes together, partitioned by day.
-- Same columns as the ODS swipe table plus the dw_dt partition key.
-- NOTE(review): this table holds both entries and exits, so the deal_date
-- column comment (entry time) looks too narrow - confirm.
DROP TABLE IF EXISTS szdw_dwd.fact_subway_in_out_detail;
CREATE EXTERNAL TABLE szdw_dwd.fact_subway_in_out_detail (
    car_no       STRING COMMENT '车',
    card_no      STRING COMMENT '卡号',
    close_date   STRING COMMENT '结算时间',
    company_name STRING COMMENT '线名',
    conn_mark    STRING COMMENT '联程标记',
    deal_date    STRING COMMENT '进站时间',
    deal_money   STRING COMMENT '应该收入',
    deal_type    STRING COMMENT '出行类型',
    deal_value   STRING COMMENT '实际收入',
    equ_no       STRING COMMENT '闸机号',
    station      STRING COMMENT '站名'
)
PARTITIONED BY (dw_dt STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Fill the 2018-09-01 partition from ODS.
-- Bus records are excluded, and only swipes strictly after 06:14:00 and
-- strictly before 23:59:00 on 2018-09-01 are kept. Rows are written in
-- deal_date order.
INSERT OVERWRITE TABLE szdw_dwd.fact_subway_in_out_detail
PARTITION (dw_dt = '2018-09-01')
SELECT
    car_no,
    card_no,
    close_date,
    company_name,
    conn_mark,
    deal_date,
    deal_money,
    deal_type,
    deal_value,
    equ_no,
    station
FROM szdw_ods.subway_swipe_record_20180901
WHERE deal_type <> '巴士'
  AND unix_timestamp(deal_date, 'yyyy-MM-dd HH:mm:ss') > unix_timestamp('2018-09-01 06:14:00', 'yyyy-MM-dd HH:mm:ss')
  AND unix_timestamp(deal_date, 'yyyy-MM-dd HH:mm:ss') < unix_timestamp('2018-09-01 23:59:00', 'yyyy-MM-dd HH:mm:ss')
ORDER BY deal_date;

-- Row-count check for the partition. Recorded result: 780937
SELECT count(1)
FROM szdw_dwd.fact_subway_in_out_detail
WHERE dw_dt = '2018-09-01';



-- DWD fact table: subway entry swipes only, partitioned by day.
-- Column layout is identical to fact_subway_in_out_detail.
DROP TABLE IF EXISTS szdw_dwd.fact_subway_in_detail;
CREATE EXTERNAL TABLE szdw_dwd.fact_subway_in_detail (
    car_no       STRING COMMENT '车',
    card_no      STRING COMMENT '卡号',
    close_date   STRING COMMENT '结算时间',
    company_name STRING COMMENT '线名',
    conn_mark    STRING COMMENT '联程标记',
    deal_date    STRING COMMENT '进站时间',
    deal_money   STRING COMMENT '应该收入',
    deal_type    STRING COMMENT '出行类型',
    deal_value   STRING COMMENT '实际收入',
    equ_no       STRING COMMENT '闸机号',
    station      STRING COMMENT '站名'
)
PARTITIONED BY (dw_dt STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Fill the 2018-09-01 partition with the entry rows of the combined
-- in/out table for the same day, written in deal_date order.
INSERT OVERWRITE TABLE szdw_dwd.fact_subway_in_detail
PARTITION (dw_dt = '2018-09-01')
SELECT
    car_no,
    card_no,
    close_date,
    company_name,
    conn_mark,
    deal_date,
    deal_money,
    deal_type,
    deal_value,
    equ_no,
    station
FROM szdw_dwd.fact_subway_in_out_detail
WHERE dw_dt = '2018-09-01'
  AND deal_type = '地铁入站'
ORDER BY deal_date;

-- Row-count check for the partition. Recorded result: 415386
SELECT count(1)
FROM szdw_dwd.fact_subway_in_detail
WHERE dw_dt = '2018-09-01';





-- DWD fact table: subway exit swipes only, partitioned by day.
-- Column layout is identical to fact_subway_in_out_detail.
-- The deal_date column comment is set to exit time here, because the insert
-- below only keeps rows whose deal_type is the subway-exit value, so labelling
-- the column as entry time would be wrong in the table metadata.
DROP TABLE IF EXISTS szdw_dwd.fact_subway_out_detail;
CREATE EXTERNAL TABLE szdw_dwd.fact_subway_out_detail (
    car_no       STRING COMMENT '车',
    card_no      STRING COMMENT '卡号',
    close_date   STRING COMMENT '结算时间',
    company_name STRING COMMENT '线名',
    conn_mark    STRING COMMENT '联程标记',
    deal_date    STRING COMMENT '出站时间',
    deal_money   STRING COMMENT '应该收入',
    deal_type    STRING COMMENT '出行类型',
    deal_value   STRING COMMENT '实际收入',
    equ_no       STRING COMMENT '闸机号',
    station      STRING COMMENT '站名'
)
PARTITIONED BY (dw_dt STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Fill the 2018-09-01 partition with the exit rows of the combined
-- in/out table for the same day, written in deal_date order.
INSERT OVERWRITE TABLE szdw_dwd.fact_subway_out_detail
PARTITION (dw_dt = '2018-09-01')
SELECT
    car_no,
    card_no,
    close_date,
    company_name,
    conn_mark,
    deal_date,
    deal_money,
    deal_type,
    deal_value,
    equ_no,
    station
FROM szdw_dwd.fact_subway_in_out_detail
WHERE deal_type = '地铁出站'
  AND dw_dt = '2018-09-01'
ORDER BY deal_date;

-- Row-count check for the partition. Recorded result: 365551
SELECT count(1)
FROM szdw_dwd.fact_subway_out_detail
WHERE dw_dt = '2018-09-01';



-- DWS 宽表
-- 已有的表:
--   szdw_ods.subway_swipe_record_20180901
--   szdw_dwd.fact_subway_in_out_detail
--   szdw_dwd.fact_subway_in_detail
--   szdw_dwd.fact_subway_out_detail
-- 本层要建的宽表:
--   szdw_dws.subway_card_record_day_wide

-- DWS wide table: one row per card per day.
-- Each *_arr column is an array of strings holding one swipe attribute,
-- and count is the integer swipe count stored alongside them.
-- The table is partitioned by dw_dt and stored as comma-delimited text.
DROP TABLE IF EXISTS szdw_dws.subway_card_record_day_wide;
CREATE EXTERNAL TABLE szdw_dws.subway_card_record_day_wide (
    card_no          STRING,
    deal_date_arr    ARRAY<STRING>,
    deal_value_arr   ARRAY<STRING>,
    deal_type_arr    ARRAY<STRING>,
    company_name_arr ARRAY<STRING>,
    station_arr      ARRAY<STRING>,
    conn_mark_arr    ARRAY<STRING>,
    deal_money_arr   ARRAY<STRING>,
    equ_no_arr       ARRAY<STRING>,
    `count`          INT
)
PARTITIONED BY (dw_dt STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';



-- Build the per-card wide rows for 2018-09-01 from the combined in/out table.
--
-- Why a struct is collected instead of one collect_list per column:
--   * collect_list gives no ordering guarantee, so separate lists are not
--     guaranteed to come out in swipe-time order.
--   * collect_list skips NULL inputs, so a NULL in one column would shift
--     that array out of step with the others.
-- Collecting one struct per swipe keeps all attributes of a swipe together.
-- sort_array then orders the structs by their first field, deal_date, whose
-- yyyy-MM-dd HH:mm:ss text sorts chronologically. Projecting a field from the
-- sorted array yields one array per attribute, all in the same order.
--
-- NOTE(review): written for Spark SQL, which this project targets. Sorting an
-- array of structs may not be supported by older Hive versions - confirm
-- before running on plain Hive.
--
-- Output rows are written from the most-swiped card to the least.
INSERT OVERWRITE TABLE szdw_dws.subway_card_record_day_wide
PARTITION (dw_dt = '2018-09-01')
SELECT
    per_card.card_no,
    per_card.swipes.deal_date,
    per_card.swipes.deal_value,
    per_card.swipes.deal_type,
    per_card.swipes.company_name,
    per_card.swipes.station,
    per_card.swipes.conn_mark,
    per_card.swipes.deal_money,
    per_card.swipes.equ_no,
    per_card.c
FROM (
    SELECT
        card_no,
        sort_array(
            collect_list(
                named_struct(
                    'deal_date', deal_date,
                    'deal_value', deal_value,
                    'deal_type', deal_type,
                    'company_name', company_name,
                    'station', station,
                    'conn_mark', conn_mark,
                    'deal_money', deal_money,
                    'equ_no', equ_no
                )
            )
        ) AS swipes,
        count(1) AS c
    FROM szdw_dwd.fact_subway_in_out_detail
    WHERE dw_dt = '2018-09-01'
    GROUP BY card_no
) AS per_card
ORDER BY per_card.c DESC;

-- Row-count check for the partition, one row per card. Recorded result: 412082
SELECT count(1)
FROM szdw_dws.subway_card_record_day_wide
WHERE dw_dt = '2018-09-01';

-- Spot-check a few wide rows.
SELECT *
FROM szdw_dws.subway_card_record_day_wide
LIMIT 10;


--ADS 业务表, 当天的表现







猜你喜欢

转载自blog.csdn.net/liuge36/article/details/111026544