MapReduce 单词统计 WordCount 案例代码
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.lihaoze</groupId>
<artifactId>hadoop</artifactId>
<version>1.0.0</version>
<packaging>jar</packaging>
<name>hadoop</name>
<url>http://maven.apache.org</url>
<!-- Build properties: Java 8 toolchain, UTF-8 everywhere, tests skipped during packaging -->
<properties>
<jdk.version>1.8</jdk.version>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<maven.test.failure.ignore>true</maven.test.failure.ignore>
<maven.test.skip>true</maven.test.skip>
</properties>
<dependencies>
<!-- junit-jupiter-api -->
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
<version>5.9.2</version>
<scope>test</scope>
</dependency>
<!-- junit-jupiter-engine -->
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<version>5.9.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.26</version>
</dependency>
<!-- Bridges SLF4J calls to Log4j 2 (Hadoop logs via SLF4J) -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j-impl</artifactId>
<version>2.20.0</version>
</dependency>
<!-- Hadoop client-side API — provides MapReduce/HDFS classes used by the WordCount code -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.3.5</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>31.1-jre</version>
</dependency>
<!-- commons-pool2 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-pool2</artifactId>
<version>2.11.1</version>
</dependency>
<!-- IK Analyzer — Chinese word segmentation library -->
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
</dependency>
<dependency>
<groupId>com.github.binarywang</groupId>
<artifactId>java-testdata-generator</artifactId>
<version>1.1.2</version>
</dependency>
</dependencies>
<build>
<finalName>${project.artifactId}</finalName>
<!--<outputDirectory>../package</outputDirectory>-->
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.11.0</version>
<configuration>
<!-- Compiler source-file encoding -->
<encoding>UTF-8</encoding>
<!-- JDK version used for compilation -->
<source>${jdk.version}</source>
<target>${jdk.version}</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-clean-plugin</artifactId>
<version>3.2.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>3.3.1</version>
</plugin>
<!-- NOTE(review): packaging is "jar", so the war plugin appears unused — confirm before removing -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-war-plugin</artifactId>
<version>3.3.2</version>
</plugin>
<!-- Compilation level -->
<!-- Skip JUnit tests when packaging: begin -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.2</version>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</build>
</project>
map类 WordCountMapper
package com.lihaozhe.wordcount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Mapper for the WordCount job.
 *
 * <p>Input (one call per line of the source file):
 * <ul>
 *   <li>KEYIN  {@link LongWritable} — byte offset of the line in the file</li>
 *   <li>VALUEIN {@link Text} — the line's text</li>
 * </ul>
 * Output (one record per word):
 * <ul>
 *   <li>KEYOUT {@link Text} — a single word</li>
 *   <li>VALUEOUT {@link IntWritable} — the constant count 1</li>
 * </ul>
 *
 * @author 李昊哲
 * @version 1.0
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** Reused output key — the framework serializes on write(), so one instance is safe. */
    private final Text outKey = new Text();

    /** Constant value 1 emitted for every word; reused to avoid per-word allocation. */
    private static final IntWritable ONE = new IntWritable(1);

    /**
     * Splits the incoming line into words and emits (word, 1) for each.
     *
     * @param key     byte offset of the line (unused)
     * @param value   the line of text
     * @param context MapReduce context used to emit output records
     * @throws IOException          on framework I/O failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split on runs of whitespace so tabs and consecutive spaces
        // do not produce empty tokens (split(" ") would).
        String[] words = value.toString().split("\\s+");
        for (String word : words) {
            // A leading separator still yields one empty token — skip it.
            if (!word.isEmpty()) {
                outKey.set(word);
                context.write(outKey, ONE);
            }
        }
    }
}
reduce类 WordCountReduce
package com.lihaozhe.wordcount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer for the WordCount job: sums the per-word counts emitted by
 * {@code WordCountMapper} and writes (word, total).
 *
 * @author 李昊哲
 * @version 1.0
 */
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    /** Reused output value — avoids allocating an IntWritable per key group. */
    private final IntWritable result = new IntWritable();

    /**
     * Accumulates all counts for one word and emits the total.
     *
     * @param key     the word
     * @param values  all counts emitted for this word (each typically 1)
     * @param context MapReduce context used to emit the (word, total) record
     * @throws IOException          on framework I/O failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int total = 0;
        // Accumulate the counts for this word.
        for (IntWritable value : values) {
            total += value.get();
        }
        // key is already a Text — write it directly instead of copying it
        // with new Text(key) on every group.
        result.set(total);
        context.write(key, result);
    }
}
驱动类 WordCountDriver
package com.lihaozhe.wordcount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Driver for the WordCount job: configures and submits the MapReduce job,
 * reading from {@code /wordcount/input/wcdata.txt} on HDFS and writing to
 * {@code /wordcount/result} (deleted first if it already exists).
 *
 * @author 李昊哲
 * @version 1.0
 */
public class WordCountDriver {
    /**
     * Builds and submits the job, then exits with 0 on success, 1 on failure.
     *
     * @param args unused — input/output paths are hard-coded below
     * @throws IOException            on HDFS or job-submission I/O failure
     * @throws InterruptedException   if the job wait is interrupted
     * @throws ClassNotFoundException if a job class cannot be resolved
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // Run HDFS operations as the cluster's root user.
        System.setProperty("HADOOP_USER_NAME", "root");
        Configuration conf = new Configuration();
        // Allow cross-platform job submission (e.g. from Windows).
        conf.set("mapreduce.app-submission.cross-platform", "true");
        // conf.addResource("core-site.xml");
        // conf.addResource("mapred-site.xml");
        // conf.addResource("yarn-site.xml");
        // conf.addResource("hdfs-site.xml");
        // Input file and output directory on HDFS.
        Path src = new Path("/wordcount/input/wcdata.txt");
        Path dst = new Path("/wordcount/result");
        FileSystem fs = FileSystem.get(conf);
        // MapReduce refuses to run if the output directory exists — remove it first.
        if (fs.exists(dst)) {
            fs.delete(dst, true);
        }
        // Create the job.
        Job job = Job.getInstance(conf, "词频统计");
        // Locate the job jar via the driver class.
        // Comment this line out for local submission.
        job.setJarByClass(WordCountDriver.class);
        // Enable this line for local submission.
        // job.setJar("D:\\dev\\java\\code\\hadoop\\target\\hadoop.jar");
        // Wire up the Mapper and Reducer.
        job.setMapperClass(WordCountMapper.class);
        // job.setCombinerClass(WordCountReduce.class);
        job.setReducerClass(WordCountReduce.class);
        // Declare the map-output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Declare the final (reducer) output key/value types as well —
        // previously missing; only worked because TextOutputFormat does
        // not type-check its records.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // FileInputFormat.addInputPath(job, new Path(args[0]));
        // FileOutputFormat.setOutputPath(job, new Path(args[1]));
        FileInputFormat.setInputPaths(job, src);
        FileOutputFormat.setOutputPath(job, dst);
        // Submit and block until completion.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
文章由极客之音整理,本文链接:https://www.bmabk.com/index.php/post/188724.html