数据开发面试准备

1、DataNode、NameNode、ResourceManager、NodeManager 的区别和工作流程
http://www.aboutyun.com/thread-7778-1-1.html
2、Hadoop MapReduce 的流程，Combiner 的作用
http://blog.csdn.net/lisonglisonglisong/article/details/47125319
3、基本的 Hadoop WordCount 程序的编写
http://blog.jobbole.com/82607/
http://blog.csdn.net/Jerome_s/article/details/26441151
package com.test.mapreduce.web;

import java.io.IOException;  
import java.util.StringTokenizer;  

import org.apache.hadoop.conf.Configuration;  
import org.apache.hadoop.fs.Path;  
import org.apache.hadoop.io.IntWritable;  
import org.apache.hadoop.io.Text;  
import org.apache.hadoop.mapreduce.Job;  
import org.apache.hadoop.mapreduce.Mapper;  
import org.apache.hadoop.mapreduce.Reducer;  
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
import org.apache.hadoop.util.GenericOptionsParser;  

public class WordCount {  

    public static class WordMapper extends  
            Mapper<Object, Text, Text, IntWritable> {  

        private final static IntWritable one = new IntWritable(1);  
        private Text word = new Text();  

        public void map(Object key, Text value, Context context)  
                throws IOException, InterruptedException {  
            String line = value.toString();  
            StringTokenizer itr = new StringTokenizer(line);  
            while (itr.hasMoreTokens()) {  
                word.set(itr.nextToken().toLowerCase());  
                context.write(word, one);  
            }  
        }  
    }  

    public static class WordReducer extends  
            Reducer<Text, IntWritable, Text, IntWritable> {  
        private IntWritable result = new IntWritable();  

        public void reduce(Text key, Iterable<IntWritable> values,  
                Context context) throws IOException, InterruptedException {  
            int sum = 0;  
            for (IntWritable val : values) {  
                sum += val.get();  
            }  
            result.set(sum);  
            context.write(key, new IntWritable(sum));  
        }  
    }  

    public static void main(String[] args) throws Exception {  
        Configuration conf = new Configuration();  
        String[] otherArgs = new GenericOptionsParser(conf, args)  
                .getRemainingArgs();  
        if (otherArgs.length != 2) {  
            System.err.println("Usage: wordcount <in> <out>");  
            System.exit(2);  
        }  
        Job job = new Job(conf, "word count");  
        job.setJarByClass(WordCount.class);  
        job.setMapperClass(WordMapper.class);  
        job.setCombinerClass(WordReducer.class);  
        job.setReducerClass(WordReducer.class);  
        job.setOutputKeyClass(Text.class);  
        job.setOutputValueClass(IntWritable.class);  
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));  
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));  
        System.exit(job.waitForCompletion(true) ? 0 : 1);  
    }  
}  

---------------------------------------------------------------------------------
package cn.dataguru.hadoop;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

//位置数据
//IMSI|IMEI|UPDATETYPE|CGI|TIME
//上网数据
//IMSI|IMEI|CGI|TIME|CGI|URL

/**  
 * 汇总基站数据表
 * 计算每个用户在不同的时间段不同的基站停留的时长
 * 输入参数 < input path > < output path > < date > < timepoint >
 * 参数示例： “/base /output 2012-09-12 09-17-24"
 * 意味着以“/base”为输入，"/output"为输出，指定计算2012年09月12日的数据，并分为00-07，07-17，17-24三个时段
 * 输出格式 “IMSI|CGI|TIMFLAG|STAY_TIME”
 */  
public class BaseStationDataPreprocess extends Configured implements Tool 
{
    /**  
     * 计数器
     * 用于计数各种异常数据
     */  
    enum Counter 
    {
        TIMESKIP,       //时间格式有误
        OUTOFTIMESKIP,  //时间不在参数指定的时间段内
        LINESKIP,       //源文件行有误
        USERSKIP        //某个用户某个时间段被整个放弃
    }

    /**  
     * 读取一行数据
     * 以“IMSI+时间段”作为 KEY 发射出去
     */
    public static class Map extends Mapper<LongWritable, Text, Text, Text> 
    {
        String date;
        String [] timepoint;
        boolean dataSource;

        /** 
         * 初始化
         */ 
        public void setup ( Context context ) throws IOException
        {
            this.date = context.getConfiguration().get("date");                         //读取日期
            this.timepoint = context.getConfiguration().get("timepoint").split("-");    //读取时间分割点

            //提取文件名
            FileSplit fs = (FileSplit)context.getInputSplit();
            String fileName = fs.getPath().getName();
            if( fileName.startsWith("POS") )
                dataSource = true;
            else if ( fileName.startsWith("NET") )
                dataSource = false;
            else
                throw new IOException("File Name should starts with POS or NET");
        }

        /**  
         * MAP任务
         * 读取基站数据
         * 找出数据所对应时间段
         * 以IMSI和时间段作为 KEY
         * CGI和时间作为 VALUE
         */ 
        public void map ( LongWritable key, Text value, Context context ) throws IOException, InterruptedException 
        {
            String line = value.toString();
            TableLine tableLine = new TableLine();

            //读取行
            try
            {
                tableLine.set(line, this.dataSource, this.date, this.timepoint );
            }
            catch ( LineException e )
            {
                if(e.getFlag()==-1)
                    context.getCounter(Counter.OUTOFTIMESKIP).increment(1);
                else
                    context.getCounter(Counter.TIMESKIP).increment(1);
                return;
            }
            catch (Exception e)
            {
                context.getCounter(Counter.LINESKIP).increment(1);
                return;
            }

            context.write( tableLine.outKey(), tableLine.outValue() );
        }
    }

    /**  
     * 统计同一个IMSI在同一时间段
     * 在不同CGI停留的时长
     */ 
    public static class Reduce extends Reducer<Text, Text, NullWritable, Text> 
    {
        private String date;
        private SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        /**  
         * 初始化
         */ 
        public void setup ( Context context )
        {
            this.date = context.getConfiguration().get("date");                         //读取日期
        }

        public void reduce ( Text key, Iterable<Text> values, Context context ) throws IOException, InterruptedException
        {
            String imsi = key.toString().split("\\|")[0];
            String timeFlag = key.toString().split("\\|")[1];

            //用一个TreeMap记录时间
            TreeMap<Long, String> uploads = new TreeMap<Long, String>();
            String valueString;

            for ( Text value : values )
            {
                valueString = value.toString();
                try 
                {
                    uploads.put( Long.valueOf( valueString.split("\\|")[1] ), valueString.split("\\|")[0] );
                }
                catch ( NumberFormatException e ) 
                {
                    context.getCounter(Counter.TIMESKIP).increment(1);
                    continue;
                }
            }

            try 
            {
                //在最后添加“OFF”位置
                Date tmp = this.formatter.parse( this.date + " " + timeFlag.split("-")[1] + ":00:00" );
                uploads.put ( ( tmp.getTime() / 1000L ), "OFF");        

                //汇总数据
                HashMap<String, Float> locs = getStayTime(uploads);

                //输出
                for( Entry<String, Float> entry : locs.entrySet() )
                {
                    StringBuilder builder = new StringBuilder();
                    builder.append(imsi).append("|");
                    builder.append(entry.getKey()).append("|");
                    builder.append(timeFlag).append("|");
                    builder.append(entry.getValue());

                    context.write( NullWritable.get(), new Text(builder.toString()) );
                }
            }
            catch ( Exception e )
            {
                context.getCounter(Counter.USERSKIP).increment(1);
                return;
            }
        }

        /**
         * 获得位置停留信息
         */
        private HashMap<String, Float> getStayTime(TreeMap<Long, String> uploads)
        {
            Entry<Long, String> upload, nextUpload;
            HashMap<String, Float> locs = new HashMap<String, Float>();
            //初始化
            Iterator<Entry<Long, String>> it = uploads.entrySet().iterator();
            upload = it.next();
            //计算
            while( it.hasNext() )
            {
                nextUpload = it.next();
                float diff = (float) (nextUpload.getKey()-upload.getKey()) / 60.0f;
                if( diff <= 60.0 )                                  //时间间隔过大则代表关机
                {
                    if( locs.containsKey( upload.getValue() ) )
                        locs.put( upload.getValue(), locs.get(upload.getValue())+diff );
                    else
                        locs.put( upload.getValue(), diff );
                }
                upload = nextUpload;
            }
            return locs;
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();

        conf.set("date", args[2]);
        conf.set("timepoint", args[3]);

        Job job = new Job(conf, "BaseStationDataPreprocess");
        job.setJarByClass(BaseStationDataPreprocess.class);

        FileInputFormat.addInputPath( job, new Path(args[0]) );         //输入路径
        FileOutputFormat.setOutputPath( job, new Path(args[1]) );       //输出路径

        job.setMapperClass( Map.class );                                //调用上面Map类作为Map任务代码
        job.setReducerClass ( Reduce.class );                           //调用上面Reduce类作为Reduce任务代码
        job.setOutputFormatClass( TextOutputFormat.class );
        job.setOutputKeyClass( Text.class );
        job.setOutputValueClass( Text.class );

        job.waitForCompletion(true);

        return job.isSuccessful() ? 0 : 1;
    }

    public static void main(String[] args) throws Exception 
    {
        if ( args.length != 4 )
        {
            System.err.println("");
            System.err.println("Usage: BaseStationDataPreprocess < input path > < output path > < date > < timepoint >");
            System.err.println("Example: BaseStationDataPreprocess /user/james/Base /user/james/Output 2012-09-12 07-09-17-24");
            System.err.println("Warning: Timepoints should be begined with a 0+ two digit number and the last timepoint should be 24");
            System.err.println("Counter:");
            System.err.println("\t"+"TIMESKIP"+"\t"+"Lines which contain wrong date format");
            System.err.println("\t"+"OUTOFTIMESKIP"+"\t"+"Lines which contain times that out of range");
            System.err.println("\t"+"LINESKIP"+"\t"+"Lines which are invalid");     
            System.err.println("\t"+"USERSKIP"+"\t"+"Users in some time are invalid");      
            System.exit(-1);
        }

        //运行任务
        int res = ToolRunner.run(new Configuration(), new BaseStationDataPreprocess(), args);

        System.exit(res);
    }

}
4、面试题
http://blog.csdn.net/qq_26442553/article/details/78718796
数据开发面试准备

猜你喜欢