[大数据] 超详细的MapReduce WordCount 统计微博评论最多的用户

开发: C++知识库 Java知识库 JavaScript Python PHP知识库 人工智能 区块链 大数据 移动开发 嵌入式 开发工具 数据结构与算法 开发测试 游戏开发 网络协议 系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑 笔记本 显卡 显示器 固态硬盘 硬盘 耳机 手机 iphone vivo oppo 小米 华为 单反 装机 图拉丁

-> 大数据 -> 超详细的MapReduce WordCount 统计微博评论最多的用户 -> 正文阅读

[大数据]超详细的MapReduce WordCount 统计微博评论最多的用户

超详细的MapReduce WordCount 统计微博评论最多的用户

使用 fastjson 解析每一行的 JSON（代码假定每一行是一个由评论对象组成的 JSON 数组）

// Parse the text of the current input value with fastjson and treat the result
// as a list of string-keyed maps. The cast is unchecked: it assumes the JSON
// value on the line is an array whose elements are objects.
List<Map<String,Object>> parses = (List<Map<String,Object>>) JSON.parse(value.toString());

提取userId

// For every parsed record, emit its "userId" under the constant key 1.
for (Map<String, Object> pars : parses) {
            // The cast assumes "userId" is stored as a JSON string.
            String new_value = (String) pars.get("userId");
            // Every record is written with the same key (1); the user id travels as the value.
            context.write(new IntWritable(1),new Text(new_value));
        }

Mapper完整代码

package anu.mapereduce;

import com.alibaba.fastjson.JSON;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.List;
import java.util.Map;

/**
 * Mapper that extracts the {@code userId} of every comment found on an input line.
 *
 * <p>Each input line is expected to hold a JSON array of comment objects. For every
 * object that carries a {@code userId}, one record is emitted with the constant key
 * {@code 1} and the user id as the value. Using a single shared key sends every user
 * id to the same reduce call, which is what allows the reducer to rank all users.
 *
 * @author yucheng_gu
 */
public class MainMapper extends Mapper<LongWritable, Text,IntWritable,Text >{

    /** Constant output key shared by every record; reused to avoid per-record allocation. */
    private final IntWritable singleKey = new IntWritable(1);

    /** Reusable output value; safe because {@code context.write} serializes it immediately. */
    private final Text userIdOut = new Text();

    /**
     * Parses one line of input and emits {@code (1, userId)} for each comment object.
     *
     * @param key     byte offset of the line in the input file (used only in error messages)
     * @param value   the line's text, expected to be a JSON array of objects
     * @param context Hadoop context used to emit the output records
     * @throws IOException          if the line is valid JSON but not a JSON array, or on write failure
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        Object parsed = JSON.parse(value.toString());
        if (parsed == null) {
            // Blank lines (e.g. a trailing newline) parse to null; there is nothing to emit.
            return;
        }
        if (!(parsed instanceof List)) {
            // Fail with a clear message instead of an opaque ClassCastException.
            throw new IOException("Expected a JSON array at input offset " + key.get()
                    + " but found " + parsed.getClass().getName());
        }
        for (Object element : (List<?>) parsed) {
            if (!(element instanceof Map)) {
                // Not a comment object, so it cannot be attributed to a user.
                continue;
            }
            Object userId = ((Map<?, ?>) element).get("userId");
            if (userId == null) {
                // A comment without a userId cannot be counted for anyone.
                continue;
            }
            // toString() accepts both string and numeric ids.
            userIdOut.set(userId.toString());
            context.write(singleKey, userIdOut);
        }
    }
}

Reduce 阶段：统计每个用户出现的次数

// Count how many times each value (as a String) occurs in `values`.
Map<String,Integer> navs = new HashMap<>();
        for (Text value : values) {
            Integer integer = navs.get(value.toString());
            if (integer == null){
                // First time this value is seen.
                navs.put(value.toString(),1);
            }else {
                // Seen before: increment the stored count.
                navs.put(value.toString(),integer+1);
            }
        }

按评论数量从高到低对所有用户排序

// Build `llas`: the keys of `navs` ordered from the highest count to the lowest.
// Each pass of the outer loop picks the highest-count key that is not yet in `llas`.
List<String> llas = new ArrayList<>();
        // `keys_l` itself is never read; the loop only serves to run one pass per key.
        for (String keys_l : navs.keySet()) {
            // Highest count found so far in this pass, and the key that holds it.
            Integer is_v = 0;
            // Placeholder that is kept only if no remaining count exceeds 0.
            String nname = "null";
            // Copy of the entries that have not been selected in earlier passes.
            Map<String,Integer> new_navs=new HashMap<>();
            for (String keyaa : navs.keySet()) {
                if (! llas.contains(keyaa)){
                    new_navs.put(keyaa,navs.get(keyaa));
                }
            }
            // Find the remaining key with the strictly largest count.
            for (String keys : new_navs.keySet()) {
                if(new_navs.get(keys)>is_v){
                    is_v = new_navs.get(keys);
                    nname = keys;
                }
            }
            llas.add(nname);
        }

输出数据

// Write one (key, count) record per entry of `llas`, in list order;
// the count is looked up again in `navs`.
for (String lla : llas) {
            context.write(new Text(lla),new IntWritable(navs.get(lla)));
        }

Reduce完整代码

package anu.mapereduce;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.*;

/**
 * Reducer that counts how many comments each user made and writes the users
 * ordered from the most comments to the fewest.
 *
 * <p>The values of one reduce call are user ids (one per comment). The reducer
 * tallies them in memory, sorts the tallies, and emits {@code (userId, count)}.
 */
public class MainReduce extends Reducer< IntWritable,Text,Text, IntWritable> {

    /**
     * Counts the occurrences of every user id in {@code values} and emits them
     * in descending order of count; users with equal counts are ordered by id so
     * that the output is reproducible.
     *
     * @param key     the grouping key of this reduce call (not used for the result)
     * @param values  user ids, one per comment
     * @param context Hadoop context used to emit {@code (userId, count)} records
     * @throws IOException          on write failure
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Tally comments per user. toString() copies the text, which matters because
        // Hadoop reuses the same Text instance while iterating over the values.
        Map<String, Integer> counts = new HashMap<>();
        for (Text value : values) {
            String userId = value.toString();
            Integer seen = counts.get(userId);
            counts.put(userId, seen == null ? 1 : seen + 1);
        }

        // One O(n log n) sort replaces the repeated "find the remaining maximum" scan.
        List<Map.Entry<String, Integer>> ranked = new ArrayList<>(counts.entrySet());
        Collections.sort(ranked, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
                // Higher counts first; Integer.compare avoids subtraction overflow.
                int byCount = Integer.compare(right.getValue(), left.getValue());
                if (byCount != 0) {
                    return byCount;
                }
                // Deterministic tie-break instead of HashMap iteration order.
                return left.getKey().compareTo(right.getKey());
            }
        });

        // Reuse the output writables; context.write serializes them immediately.
        Text outKey = new Text();
        IntWritable outValue = new IntWritable();
        for (Map.Entry<String, Integer> entry : ranked) {
            outKey.set(entry.getKey());
            outValue.set(entry.getValue());
            context.write(outKey, outValue);
        }
    }
}

为了方便调试，这里不依赖集群，而是直接在本地运行；本地运行环境的具体配置方法可自行搜索。

WordCountRunner 启动类的完整代码

package anu.mapereduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;

/**
 * Driver that configures and runs the "most active commenter" MapReduce job.
 *
 * <p>Usage: {@code WordCountRunner [inputPath [outputPath]]}. When an argument is
 * omitted the built-in default path is used, so running without arguments behaves
 * as it always has.
 *
 * @author yucheng_gu
 */
public class WordCountRunner{

    /** Local Hadoop installation used when {@code hadoop.home.dir} is not already set. */
    private static final String DEFAULT_HADOOP_HOME = "D:\\LocalServer\\hadoop-2.9.2";

    /** Input file used when no input path is passed on the command line. */
    private static final String DEFAULT_INPUT_PATH =
            "D:\\javaproject\\20210722_GOUP_11_GYC\\MapperReuceDemo01\\src\\main\\resources\\datas.json";

    /** Output directory used when no output path is passed on the command line. */
    private static final String DEFAULT_OUTPUT_PATH =
            "D:\\javaproject\\20210722_GOUP_11_GYC\\MapperReuceDemo01\\src\\main\\resources\\input";

    /**
     * Builds the job, submits it, waits for it to finish and exits with status
     * 0 on success or 1 on failure.
     *
     * @param args optional {@code [inputPath, outputPath]}; missing entries fall back to the defaults
     * @throws InterruptedException   if the wait for job completion is interrupted
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException if a job class cannot be loaded
     */
    public static void main(String[] args) throws InterruptedException, IOException, ClassNotFoundException {
        // Point Hadoop at the local installation, but respect a value supplied with -D.
        if (System.getProperty("hadoop.home.dir") == null) {
            System.setProperty("hadoop.home.dir", DEFAULT_HADOOP_HOME);
        }
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;

        Configuration configuration = new Configuration();
        // Create the job object; the second argument is the job name.
        Job myWordCount = Job.getInstance(configuration, "MyWordCount");
        // Lets Hadoop locate the jar that contains the job classes when run on a cluster.
        myWordCount.setJarByClass(WordCountRunner.class);

        // Step 1: how the input is read.
        myWordCount.setInputFormatClass(TextInputFormat.class);

        // Step 2: the map phase and its output (k2, v2) types.
        myWordCount.setMapperClass(MainMapper.class);
        myWordCount.setMapOutputKeyClass(IntWritable.class);
        myWordCount.setMapOutputValueClass(Text.class);

        // Steps 3-6 (partition, sort, combine, group) keep their defaults.

        // Step 7: the reduce phase and its output (k3, v3) types.
        myWordCount.setReducerClass(MainReduce.class);
        myWordCount.setOutputKeyClass(Text.class);
        myWordCount.setOutputValueClass(IntWritable.class);

        // Step 8: how the output is written.
        myWordCount.setOutputFormatClass(TextOutputFormat.class);

        // Input location, and output directory (the job fails if the output directory already exists).
        FileInputFormat.setInputPaths(myWordCount, new Path(inputPath));
        FileOutputFormat.setOutputPath(myWordCount, new Path(outputPath));

        // Block until the job finishes, printing progress.
        boolean succeeded = myWordCount.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}