hadoop——hive视频观看热度,Top N案例(youtube)

  • 数据准备

user.txt

0.txt

字段以及字段名解析

 

user表

字段 备注 字段类型

uploader 上传者用户名 string

videos 上传视频数 int

friends 朋友数量 int

 

视频表:

字段 备注 详细描述

video id 视频唯一 id 11 位字符串

uploader 视频上传者 上传视频的用户名 String

age 视频年龄 视频上传日期和 2007 年 2 月

15 日之间的整数天(Youtube的独特设定)

category 视频类别 上传视频指定的视频分类

length 视频长度 整形数字标识的视频长度

views 观看次数 视频被浏览的次数

rate 视频评分 满分 5 分

ratings 流量 视频的流量,整型数字

comments 评论数 一个视频的整数评论数

related ids 相关视频 id 相关视频的 id,最多 20 个

  •  数据清洗
  1. 通过mapreduce将数据清洗出来,通过观察原始数据形式,可以发现,视频可以有多个所属分类,每个所属分类用&符号分割,且分割的两边有空格字符,同时相关视频也是可以有多个元素,多个相关视频又用“\t”进行分割。为了分析数据时方便对存在多个子元素的数据进行操作,我们首先进行数据重组清洗操作。即:将所有的类别用“&”分割,同时去掉两边空格,多个相关视频 id 也使用“&”进行分割。将数据放到hdfs指定的文件夹里面。

ETL数据清洗

 

ETLUtils.java

 

package ETLUtils;

/**
 * Utility for the YouTube video ETL step: normalizes one raw input line.
 *
 * Raw lines are tab-separated; field 3 (category) may contain spaces around
 * the "&" separators, and fields from index 9 onward are related-video ids
 * that must be joined with "&" instead of "\t" so Hive can parse them as a
 * single array column.
 */
public class ETLUtils {

    /**
     * Cleans a single raw record line.
     *
     * @param str one tab-separated raw line
     * @return the cleaned line, or {@code null} when the line has fewer than
     *         9 fields and is considered invalid
     */
    public static String getETCString(String str){
        String[] fields = str.split("\t");

        // 1. Reject invalid records BEFORE touching individual fields; the
        //    original order dereferenced fields[3] first and could throw
        //    ArrayIndexOutOfBoundsException on lines with < 4 fields.
        if (fields.length < 9) {
            return null;
        }

        // 2. Strip spaces from the category field ("People & Blogs" -> "People&Blogs").
        fields[3] = fields[3].replaceAll(" ", "");

        // 3. Keep "\t" between the first 10 fields; join the related-video
        //    ids (index 9 and later) with "&".
        StringBuilder cleaned = new StringBuilder();
        for (int i = 0; i < fields.length; i++) {
            cleaned.append(fields[i]);
            if (i < 9) {
                cleaned.append("\t");
            } else if (i != fields.length - 1) {
                cleaned.append("&");
            }
        }
        return cleaned.toString();
    }

}

 

ETLMapper

 

package mapper;

import ETLUtils.ETLUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class ETLMapper extends Mapper<Object,Text,NullWritable,Text> {

    Text text=new Text();
    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {

        String lines=ETLUtils.getETCString(value.toString());

        if(StringUtils.isBlank(lines)) return;

        text.set(lines);

        context.write(NullWritable.get(),text);

    }
}

 

 

ETLRunner

 

package runner;

import java.io.IOException;

import mapper.ETLMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class ETLRunner implements Tool {

    private Configuration conf=null;


    @Override
    public void setConf(Configuration conf) {
        this.conf=conf;
    }

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    @Override
    public int run(String[] args) throws Exception {

        conf=this.getConf();
        //传输路径变量
        conf.set("inpath",args[0]);
        conf.set("outpath",args[1]);

        Job job=Job.getInstance(conf,"youtub_etl_video");
        job.setJarByClass(ETLRunner.class);

        job.setMapperClass(ETLMapper.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(0);

        this.initInputPath(job);
        this.initOutputPath(job);


        return job.waitForCompletion(true)? 0:1;
    }

    private void initInputPath(Job job) throws IOException {

        Configuration conf=job.getConfiguration();
        String inpaths=conf.get("inpath");

        //获取抽象文件系统对象
        FileSystem fs=FileSystem.get(conf);
        //创建hdfs路径实体对象
        Path inpath =new Path(inpaths);
        //判断文件系统存在该路径
        if(fs.exists(inpath)){
           //设置输入路径
            FileInputFormat.addInputPath(job,inpath);
        }else{
            throw new RuntimeException("HDFS目录不存在"+inpaths);
        }
    }

    private  void initOutputPath(Job job) throws IOException {

        Configuration conf=job.getConfiguration();

        String outpath=conf.get("outpath");

        FileSystem fs=FileSystem.get(conf);

        Path opath=new Path(outpath);

        if(fs.exists(opath)){
            //存在输出路径,删除输出路径
            fs.delete(opath,true);
        }
            FileOutputFormat.setOutputPath(job,opath);

    }

    public static void main(String[] args) {

        try {
            int result=ToolRunner.run(new ETLRunner(),args);
            if(result==0){
                System.out.println("Success!");
            }else{
                System.out.println("Fail!");
            }
            System.exit(result);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

}

 

  1. 建立四张表打算用orc的表进行操作(操作效率比较高),但是orc类型的表只能通过insert的形式插入数据,所以需要建立两张ori的表再进行插入操作。

-- Staging table for the ETL-cleaned video data; TEXTFILE so the cleaned
-- HDFS files can be ingested directly with LOAD DATA.
CREATE TABLE youtube_ori (
    videoId   string,
    uploader  string,
    age       int,
    category  array<string>,
    length    int,
    views     int,
    rate      float,
    ratings   int,
    comments  int,
    relatedId array<string>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
COLLECTION ITEMS TERMINATED BY "&"
STORED AS TEXTFILE;

 

创建user ori表

-- Staging table for the uploader/user data (TEXTFILE for LOAD DATA).
CREATE TABLE youtube_user_ori (
    uploader string,
    videos   int,
    friends  int)
CLUSTERED BY (uploader) INTO 24 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
STORED AS TEXTFILE;

 

 

-- 创建orc表(HiveQL 注释应使用 "--" 而非 "//")

-- Analysis copy of youtube_ori stored as ORC (columnar, faster queries);
-- ORC tables are populated via INSERT ... SELECT, not LOAD DATA.
CREATE TABLE youtube_orc (
    videoId   string,
    uploader  string,
    age       int,
    category  array<string>,
    length    int,
    views     int,
    rate      float,
    ratings   int,
    comments  int,
    relatedId array<string>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
COLLECTION ITEMS TERMINATED BY "&"
STORED AS ORC;

 

-- Analysis copy of youtube_user_ori stored as ORC.
CREATE TABLE youtube_user_orc (
    uploader string,
    videos   int,
    friends  int)
CLUSTERED BY (uploader) INTO 24 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
STORED AS ORC;

 

 

 

 

  1. 导入数据到表ori中

 

-- Load the map-only ETL job's output part file into the video staging table.
load data inpath '/output/part-m-00000' into table youtube_ori;

-- Load the raw user file into the user staging table.
load data inpath '/output/user.txt' into table youtube_user_ori;

  1. 将表导入到orc中

用insert into table A select * from B导入到orc表

 

  •  数据分析

字段:

视频id  上传者   视频年龄 视频类别 视频长度 观看次数 视频评分 流量 评论数 相关视频

videoId,uploader,age,category,length,views,rate,ratings,comments,relatedId

 

  1. 统计视频观看数top10

-- Task 1: the 10 most-viewed videos (columns listed explicitly).
CREATE TABLE viewstop10 AS
SELECT
    videoId, uploader, age, category, length,
    views, rate, ratings, comments, relatedId
FROM youtube_orc
ORDER BY views DESC
LIMIT 10;

 

  1.  统计类别热度top10

-- Task 2: the 10 hottest categories, heat = number of videos per category.
CREATE TABLE hotTop10 AS
SELECT
    t1.category_name AS category,
    COUNT(t1.videoId) AS hot
FROM (
    -- One row per (video, single category) pair.
    SELECT videoId, category_name
    FROM youtube_orc
    LATERAL VIEW explode(category) t_catetory AS category_name
) t1
GROUP BY t1.category_name
ORDER BY hot DESC
LIMIT 10;

 

  1.  统计出视频观看数最高的 20 个视频的所属类别以及类别包含这 Top20 视频的个数

-- Task 3: categories of the 20 most-viewed videos, and how many of the
-- Top-20 fall into each category.
CREATE TABLE top20views_countCategory AS
SELECT
    category_name,
    COUNT(videoId) AS vcount
FROM (
    SELECT videoId, category
    FROM (SELECT * FROM youtube_orc ORDER BY views DESC LIMIT 20) t1
) t2
LATERAL VIEW explode(category) t_catetory AS category_name
GROUP BY category_name
ORDER BY vcount DESC;

 

  1. 统计视频观看数 Top50 所关联视频的所属类别的热度排名

-- Task 4: category heat ranking of the videos RELATED TO the Top-50
-- most-viewed videos. The original query never touched relatedId — it
-- exploded the Top-50 videos' OWN categories. Fixed: explode relatedId,
-- join back to youtube_orc for the related videos' categories.
SELECT
    category_name,
    COUNT(t5.videoId) AS mcount
FROM (
    SELECT t4.videoId, category_name
    FROM (
        -- Distinct related videos of the Top-50, joined back for their category arrays.
        SELECT DISTINCT t2.videoId, t3.category
        FROM (
            SELECT explode(relatedId) AS videoId
            FROM (SELECT * FROM youtube_orc ORDER BY views DESC LIMIT 50) t1
        ) t2
        INNER JOIN youtube_orc t3 ON t2.videoId = t3.videoId
    ) t4
    LATERAL VIEW explode(t4.category) t_category AS category_name
) t5
GROUP BY category_name
ORDER BY mcount DESC;

 

  1. 统计每个类别中的视频热度 Top10,以 Music 为例

-- Flattened video table: the category array is replaced by a single
-- categoryId string (one row per video-category pair) so per-category
-- rankings can filter with a simple WHERE.
CREATE TABLE youtube_category (
    videoId    string,
    uploader   string,
    age        int,
    categoryId string,
    length     int,
    views      int,
    rate       float,
    ratings    int,
    comments   int,
    relatedId  array<string>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
COLLECTION ITEMS TERMINATED BY "&"
STORED AS ORC;

 

-- Populate the flattened table: explode the category array of youtube_orc
-- into one row per (video, category). The original statement was missing
-- its terminating semicolon, which breaks the script when another
-- statement follows.
INSERT INTO TABLE youtube_category
SELECT
    videoId,
    uploader,
    age,
    categoryId,
    length,
    views,
    rate,
    ratings,
    comments,
    relatedId
FROM youtube_orc
LATERAL VIEW explode(category) catetory AS categoryId;

 

-- Task 5: Top 10 videos by views within the Music category.
CREATE TABLE musicTop10 AS
SELECT videoId, categoryId, views
FROM youtube_category
WHERE categoryId = "Music"
ORDER BY views DESC
LIMIT 10;

 

  1. 统计每个类别中视频流量 Top10,以 Music 为例

-- Task 6: Top 10 Music videos by traffic (the ratings column).
CREATE TABLE ratingsTop10 AS
SELECT videoId, views, ratings
FROM youtube_category
WHERE categoryId = "Music"
ORDER BY ratings DESC
LIMIT 10;

 

  1. 统计上传视频最多的用户 Top10 以及他们上传的观看次数在前 20 的视频

-- Task 7: the 10 uploaders with the most uploads, and their videos ranked
-- by view count (top 20). "前 20" means the MOST-viewed 20, so the sort
-- must be descending — the original `order by t2.views` sorted ascending
-- and returned the LEAST-viewed videos.
SELECT t2.videoId, t2.uploader, t2.views, t1.videos
FROM (
    SELECT * FROM youtube_user_orc ORDER BY videos DESC LIMIT 10
) t1
JOIN youtube_orc t2 ON t1.uploader = t2.uploader
ORDER BY t2.views DESC
LIMIT 20;

 

  1. 统计每个类别视频观看数 Top10

-- Task 8: Top 10 videos by views within EVERY category
-- (row_number restarts per category partition).
CREATE TABLE categoryId_views AS
SELECT *
FROM (
    SELECT
        videoId,
        categoryId,
        views,
        row_number() OVER (PARTITION BY categoryId ORDER BY views DESC) rant
    FROM youtube_category
) t1
WHERE rant <= 10;

  •  

猜你喜欢

转载自blog.csdn.net/qq_37001101/article/details/84959293
今日推荐