MapReduce Word Count (WordCount) Example Code


pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.lihaoze</groupId>
  <artifactId>hadoop</artifactId>
  <version>1.0.0</version>
  <packaging>jar</packaging>

  <name>hadoop</name>
  <url>http://maven.apache.org</url>

  <properties>
    <jdk.version>1.8</jdk.version>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
    <maven.test.failure.ignore>true</maven.test.failure.ignore>
    <maven.test.skip>true</maven.test.skip>
  </properties>
  <dependencies>
    <!-- junit-jupiter-api -->
    <dependency>
      <groupId>org.junit.jupiter</groupId>
      <artifactId>junit-jupiter-api</artifactId>
      <version>5.9.2</version>
      <scope>test</scope>
    </dependency>
    <!-- junit-jupiter-engine -->
    <dependency>
      <groupId>org.junit.jupiter</groupId>
      <artifactId>junit-jupiter-engine</artifactId>
      <version>5.9.2</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.projectlombok</groupId>
      <artifactId>lombok</artifactId>
      <version>1.18.26</version>
    </dependency>
    <dependency>
      <groupId>org.apache.logging.log4j</groupId>
      <artifactId>log4j-slf4j-impl</artifactId>
      <version>2.20.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>3.3.5</version>
    </dependency>
    <dependency>
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
      <version>31.1-jre</version>
    </dependency>
    <!-- commons-pool2 -->
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-pool2</artifactId>
      <version>2.11.1</version>
    </dependency>
    <dependency>
      <groupId>com.janeluo</groupId>
      <artifactId>ikanalyzer</artifactId>
      <version>2012_u6</version>
    </dependency>
    <dependency>
      <groupId>com.github.binarywang</groupId>
      <artifactId>java-testdata-generator</artifactId>
      <version>1.1.2</version>
    </dependency>

  </dependencies>
  <build>
    <finalName>${project.artifactId}</finalName>
    <!--<outputDirectory>../package</outputDirectory>-->
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.11.0</version>
        <configuration>
          <!-- source file encoding for compilation -->
          <encoding>UTF-8</encoding>
          <!-- JDK version used for compilation -->
          <source>${jdk.version}</source>
          <target>${jdk.version}</target>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-clean-plugin</artifactId>
        <version>3.2.0</version>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-resources-plugin</artifactId>
        <version>3.3.1</version>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-war-plugin</artifactId>
        <version>3.3.2</version>
      </plugin>
      <!-- skip JUnit tests when packaging -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-surefire-plugin</artifactId>
        <version>2.22.2</version>
        <configuration>
          <skip>true</skip>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
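
Of the dependencies above, only hadoop-client (plus the log4j-slf4j-impl logging binding) is actually used by the WordCount code below; the other entries (lombok, guava, commons-pool2, ikanalyzer, java-testdata-generator and the JUnit artifacts) appear to belong to other examples in the same project and could be omitted for a minimal WordCount build.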

Mapper class: WordCountMapper

package com.lihaozhe.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @author 李昊哲
 * @version 1.0
 * @Description
 * @createTime 2023/4/12 09:35
 * <p>
 * A custom Mapper must extend Hadoop's Mapper class and declare the input and output
 * data types that match the job at hand.
 * <p>
 * Input types:
 * KEYIN   byte offset of the current line within the file, a number (LongWritable)
 * VALUEIN one line of the input file, text (Text)
 * <p>
 * Output types:
 * KEYOUT   type of the output key, a single word (Text)
 * VALUEOUT type of the output value, the marker 1 attached to each word (IntWritable)
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        // take one line of input and split it into words on spaces
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            // emit each word with a count of 1
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
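
As written, map allocates a new Text and a new IntWritable for every word, which creates avoidable garbage on large inputs. A common optimization, shown here as a minimal sketch of my own rather than part of the original code, is to reuse the output objects across calls and to skip empty tokens produced by consecutive spaces (package and imports identical to the class above):

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // reusable output key/value objects; reuse is safe because context.write
    // serializes the pair into the output buffer before map is called again
    private final Text outKey = new Text();
    private final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        for (String word : value.toString().split(" ")) {
            if (word.isEmpty()) {
                continue; // ignore empty tokens caused by consecutive spaces
            }
            outKey.set(word);
            context.write(outKey, one);
        }
    }
}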

Reducer class: WordCountReduce

package com.lihaozhe.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @author 李昊哲
 * @version 1.0
 * @Description
 * @createTime 2023/4/12 09:46
 */
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable value : values) {
            // accumulate the counts collected for this word
            total += value.get();
        }
        // emit the word together with its total count
        context.write(key, new IntWritable(total));
    }
}
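
Because the aggregation here is a plain sum, which is commutative and associative, the same class can also serve as a combiner to pre-aggregate counts on the map side; the driver below contains the corresponding job.setCombinerClass(WordCountReduce.class) line, commented out. As a worked example, if the key "hello" arrives with the value list [1, 1, 1], total accumulates to 3 and the pair (hello, 3) is written out.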

Driver class: WordCountDriver

package com.lihaozhe.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @author 李昊哲
 * @version 1.0
 * @Description
 * @createTime 2023/4/12 09:52
 */
public class WordCountDriver {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        System.setProperty("HADOOP_USER_NAME", "root");
        Configuration conf = new Configuration();
        // cross-platform submission from Windows
        conf.set("mapreduce.app-submission.cross-platform", "true");
        // conf.addResource("core-site.xml");
        // conf.addResource("mapred-site.xml");
        // conf.addResource("yarn-site.xml");
        // conf.addResource("hdfs-site.xml");
        // input and output paths on HDFS
        Path src = new Path("/wordcount/input/wcdata.txt");
        Path dst = new Path("/wordcount/result");
        FileSystem fs = FileSystem.get(conf);
        // delete the output directory first if it already exists
        if (fs.exists(dst)) {
            fs.delete(dst, true);
        }
        // create the Job instance
        Job job = Job.getInstance(conf, "词频统计");
        // specify the driver class of this Job
        // comment out this line when submitting from the local machine
        job.setJarByClass(WordCountDriver.class);
        // enable this line when submitting from the local machine
        // job.setJar("D:\\dev\\java\\code\\hadoop\\target\\hadoop.jar");
        // specify the Mapper and Reducer of this Job
        job.setMapperClass(WordCountMapper.class);
        // job.setCombinerClass(WordCountReduce.class);
        job.setReducerClass(WordCountReduce.class);
        // specify the key and value types of the map-side output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // specify the key and value types of the final (reduce-side) output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // FileInputFormat.addInputPath(job, new Path(args[0]));
        // FileOutputFormat.setOutputPath(job, new Path(args[1]));
        FileInputFormat.setInputPaths(job, src);
        FileOutputFormat.setOutputPath(job, dst);
        // submit the Job and exit with its completion status
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
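
To run this against a cluster, package the project first so that the jar referenced by the commented-out job.setJar line exists (the finalName setting in the pom makes it target/hadoop.jar). Assuming the input file /wordcount/input/wcdata.txt is present on HDFS, a successful run with the default single reducer leaves the result in /wordcount/result/part-r-00000, one word and its count per line separated by a tab; for example, an input line of "hello hadoop hello" would yield hadoop 1 and hello 2.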
