hadoop——hive视频观看热度,Top N案例(youtube)

  • 数据准备

user.txt

0.txt

字段以及字段名解析

 

user表

字段 备注 字段类型

uploader 上传者用户名 string

videos 上传视频数 int

friends 朋友数量 int

 

视频表:

字段 备注 详细描述

video id 视频唯一 id 11 位字符串

uploader 视频上传者 上传视频的用户名 String

age 视频年龄 视频上传日期和 2007 年 2 月

15 日之间的整数天(Youtube的独特设定)

category 视频类别 上传视频指定的视频分类

length 视频长度 整形数字标识的视频长度

views 观看次数 视频被浏览的次数

rate 视频评分 满分 5 分

ratings 流量 视频的流量,整型数字

comments 评论数 一个视频的整数评论数

related ids 相关视频 id 相关视频的 id,最多 20 个

  •  数据清洗
  1. 通过mapreduce将数据清洗出来,通过观察原始数据形式,可以发现,视频可以有多个所属分类,每个所属分类用&符号分割,且分割的两边有空格字符,同时相关视频也是可以有多个元素,多个相关视频又用“\t”进行分割。为了分析数据时方便对存在多个子元素的数据进行操作,我们首先进行数据重组清洗操作。即:将所有的类别用“&”分割,同时去掉两边空格,多个相关视频 id 也使用“&”进行分割。将数据放到hdfs指定的文件夹里面。

ETL数据清洗

 

ETLUtils.java

 

package ETLUtils;

/**
 * Utility for the YouTube video ETL step: normalizes one raw input line.
 *
 * Raw lines are tab-separated; field 3 (category) may contain spaces around
 * the "&" separators, and fields from index 9 onward are related-video ids
 * that must be joined with "&" instead of "\t" so Hive can parse them as a
 * single array column.
 */
public class ETLUtils {

    /**
     * Cleans a single raw record line.
     *
     * @param str one tab-separated raw line
     * @return the cleaned line, or {@code null} when the line has fewer than
     *         9 fields and is considered invalid
     */
    public static String getETCString(String str){
        String[] fields = str.split("\t");

        // 1. Reject invalid records BEFORE touching individual fields; the
        //    original order dereferenced fields[3] first and could throw
        //    ArrayIndexOutOfBoundsException on lines with < 4 fields.
        if (fields.length < 9) {
            return null;
        }

        // 2. Strip spaces from the category field ("People & Blogs" -> "People&Blogs").
        fields[3] = fields[3].replaceAll(" ", "");

        // 3. Keep "\t" between the first 10 fields; join the related-video
        //    ids (index 9 and later) with "&".
        StringBuilder cleaned = new StringBuilder();
        for (int i = 0; i < fields.length; i++) {
            cleaned.append(fields[i]);
            if (i < 9) {
                cleaned.append("\t");
            } else if (i != fields.length - 1) {
                cleaned.append("&");
            }
        }
        return cleaned.toString();
    }

}

 

ETLMapper

 

package mapper;

import ETLUtils.ETLUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class ETLMapper extends Mapper<Object,Text,NullWritable,Text> {

    Text text=new Text();
    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {

        String lines=ETLUtils.getETCString(value.toString());

        if(StringUtils.isBlank(lines)) return;

        text.set(lines);

        context.write(NullWritable.get(),text);

    }
}

 

 

ETLRunner

 

package runner;

import java.io.IOException;

import mapper.ETLMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class ETLRunner implements Tool {

    private Configuration conf=null;


    @Override
    public void setConf(Configuration conf) {
        this.conf=conf;
    }

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    @Override
    public int run(String[] args) throws Exception {

        conf=this.getConf();
        //传输路径变量
        conf.set("inpath",args[0]);
        conf.set("outpath",args[1]);

        Job job=Job.getInstance(conf,"youtub_etl_video");
        job.setJarByClass(ETLRunner.class);

        job.setMapperClass(ETLMapper.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(0);

        this.initInputPath(job);
        this.initOutputPath(job);


        return job.waitForCompletion(true)? 0:1;
    }

    private void initInputPath(Job job) throws IOException {

        Configuration conf=job.getConfiguration();
        String inpaths=conf.get("inpath");

        //获取抽象文件系统对象
        FileSystem fs=FileSystem.get(conf);
        //创建hdfs路径实体对象
        Path inpath =new Path(inpaths);
        //判断文件系统存在该路径
        if(fs.exists(inpath)){
           //设置输入路径
            FileInputFormat.addInputPath(job,inpath);
        }else{
            throw new RuntimeException("HDFS目录不存在"+inpaths);
        }
    }

    private  void initOutputPath(Job job) throws IOException {

        Configuration conf=job.getConfiguration();

        String outpath=conf.get("outpath");

        FileSystem fs=FileSystem.get(conf);

        Path opath=new Path(outpath);

        if(fs.exists(opath)){
            //存在输出路径,删除输出路径
            fs.delete(opath,true);
        }
            FileOutputFormat.setOutputPath(job,opath);

    }

    public static void main(String[] args) {

        try {
            int result=ToolRunner.run(new ETLRunner(),args);
            if(result==0){
                System.out.println("Success!");
            }else{
                System.out.println("Fail!");
            }
            System.exit(result);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

}

 

  1. 建立四张表打算用orc的表进行操作(操作效率比较高),但是orc类型的表只能通过insert的形式插入数据,所以需要建立两张ori的表再进行插入操作。

-- Staging table for the ETL-cleaned video data; TEXTFILE so the cleaned
-- HDFS files can be ingested directly with LOAD DATA.
CREATE TABLE youtube_ori (
    videoId   string,
    uploader  string,
    age       int,
    category  array<string>,
    length    int,
    views     int,
    rate      float,
    ratings   int,
    comments  int,
    relatedId array<string>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
COLLECTION ITEMS TERMINATED BY "&"
STORED AS TEXTFILE;

 

创建user ori表

-- Staging table for the uploader/user data (TEXTFILE for LOAD DATA).
CREATE TABLE youtube_user_ori (
    uploader string,
    videos   int,
    friends  int)
CLUSTERED BY (uploader) INTO 24 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
STORED AS TEXTFILE;

 

 

-- 创建orc表(HiveQL 注释应使用 "--" 而非 "//")

-- Analysis copy of youtube_ori stored as ORC (columnar, faster queries);
-- ORC tables are populated via INSERT ... SELECT, not LOAD DATA.
CREATE TABLE youtube_orc (
    videoId   string,
    uploader  string,
    age       int,
    category  array<string>,
    length    int,
    views     int,
    rate      float,
    ratings   int,
    comments  int,
    relatedId array<string>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
COLLECTION ITEMS TERMINATED BY "&"
STORED AS ORC;

 

-- Analysis copy of youtube_user_ori stored as ORC.
CREATE TABLE youtube_user_orc (
    uploader string,
    videos   int,
    friends  int)
CLUSTERED BY (uploader) INTO 24 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
STORED AS ORC;

 

 

 

 

  1. 导入数据到表ori中

 

-- Load the map-only ETL job's output part file into the video staging table.
load data inpath '/output/part-m-00000' into table youtube_ori;

-- Load the raw user file into the user staging table.
load data inpath '/output/user.txt' into table youtube_user_ori;

  1. 将表导入到orc中

用insert into table A select * from B导入到orc表

 

  •  数据分析

字段:

视频id  上传者   视频年龄 视频类别 视频长度 观看次数 视频评分 流量 评论数 相关视频

videoId,uploader,age,category,length,views,rate,ratings,comments,relatedId

 

  1. 统计视频观看数top10

-- Task 1: the 10 most-viewed videos (columns listed explicitly).
CREATE TABLE viewstop10 AS
SELECT
    videoId, uploader, age, category, length,
    views, rate, ratings, comments, relatedId
FROM youtube_orc
ORDER BY views DESC
LIMIT 10;

 

  1.  统计类别热度top10

-- Task 2: the 10 hottest categories, heat = number of videos per category.
CREATE TABLE hotTop10 AS
SELECT
    t1.category_name AS category,
    COUNT(t1.videoId) AS hot
FROM (
    -- One row per (video, single category) pair.
    SELECT videoId, category_name
    FROM youtube_orc
    LATERAL VIEW explode(category) t_catetory AS category_name
) t1
GROUP BY t1.category_name
ORDER BY hot DESC
LIMIT 10;

 

  1.  统计出视频观看数最高的 20 个视频的所属类别以及类别包含这 Top20 视频的个数

-- Task 3: categories of the 20 most-viewed videos, and how many of the
-- Top-20 fall into each category.
CREATE TABLE top20views_countCategory AS
SELECT
    category_name,
    COUNT(videoId) AS vcount
FROM (
    SELECT videoId, category
    FROM (SELECT * FROM youtube_orc ORDER BY views DESC LIMIT 20) t1
) t2
LATERAL VIEW explode(category) t_catetory AS category_name
GROUP BY category_name
ORDER BY vcount DESC;

 

  1. 统计视频观看数 Top50 所关联视频的所属类别的热度排名

-- Task 4: category heat ranking of the videos RELATED TO the Top-50
-- most-viewed videos. The original query never touched relatedId — it
-- exploded the Top-50 videos' OWN categories. Fixed: explode relatedId,
-- join back to youtube_orc for the related videos' categories.
SELECT
    category_name,
    COUNT(t5.videoId) AS mcount
FROM (
    SELECT t4.videoId, category_name
    FROM (
        -- Distinct related videos of the Top-50, joined back for their category arrays.
        SELECT DISTINCT t2.videoId, t3.category
        FROM (
            SELECT explode(relatedId) AS videoId
            FROM (SELECT * FROM youtube_orc ORDER BY views DESC LIMIT 50) t1
        ) t2
        INNER JOIN youtube_orc t3 ON t2.videoId = t3.videoId
    ) t4
    LATERAL VIEW explode(t4.category) t_category AS category_name
) t5
GROUP BY category_name
ORDER BY mcount DESC;

 

  1. 统计每个类别中的视频热度 Top10,以 Music 为例

-- Flattened video table: the category array is replaced by a single
-- categoryId string (one row per video-category pair) so per-category
-- rankings can filter with a simple WHERE.
CREATE TABLE youtube_category (
    videoId    string,
    uploader   string,
    age        int,
    categoryId string,
    length     int,
    views      int,
    rate       float,
    ratings    int,
    comments   int,
    relatedId  array<string>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
COLLECTION ITEMS TERMINATED BY "&"
STORED AS ORC;

 

-- Populate the flattened table: explode the category array of youtube_orc
-- into one row per (video, category). The original statement was missing
-- its terminating semicolon, which breaks the script when another
-- statement follows.
INSERT INTO TABLE youtube_category
SELECT
    videoId,
    uploader,
    age,
    categoryId,
    length,
    views,
    rate,
    ratings,
    comments,
    relatedId
FROM youtube_orc
LATERAL VIEW explode(category) catetory AS categoryId;

 

-- Task 5: Top 10 videos by views within the Music category.
CREATE TABLE musicTop10 AS
SELECT videoId, categoryId, views
FROM youtube_category
WHERE categoryId = "Music"
ORDER BY views DESC
LIMIT 10;

 

  1. 统计每个类别中视频流量 Top10,以 Music 为例

-- Task 6: Top 10 Music videos by traffic (the ratings column).
CREATE TABLE ratingsTop10 AS
SELECT videoId, views, ratings
FROM youtube_category
WHERE categoryId = "Music"
ORDER BY ratings DESC
LIMIT 10;

 

  1. 统计上传视频最多的用户 Top10 以及他们上传的观看次数在前 20 的视频

-- Task 7: the 10 uploaders with the most uploads, and their videos ranked
-- by view count (top 20). "前 20" means the MOST-viewed 20, so the sort
-- must be descending — the original `order by t2.views` sorted ascending
-- and returned the LEAST-viewed videos.
SELECT t2.videoId, t2.uploader, t2.views, t1.videos
FROM (
    SELECT * FROM youtube_user_orc ORDER BY videos DESC LIMIT 10
) t1
JOIN youtube_orc t2 ON t1.uploader = t2.uploader
ORDER BY t2.views DESC
LIMIT 20;

 

  1. 统计每个类别视频观看数 Top10

-- Task 8: Top 10 videos by views within EVERY category
-- (row_number restarts per category partition).
CREATE TABLE categoryId_views AS
SELECT *
FROM (
    SELECT
        videoId,
        categoryId,
        views,
        row_number() OVER (PARTITION BY categoryId ORDER BY views DESC) rant
    FROM youtube_category
) t1
WHERE rant <= 10;

  •  

猜你喜欢

转载自blog.csdn.net/qq_37001101/article/details/84959293
今日推荐