平时咱们使用ElasticSearch时,大多是对单个字段进行去重,对多个字段的组合进行去重的场景相对少见。
ElasticSearch单字段去重详见博文:ElasticSearch单字段查询去重详解_IT之一小佬的博客-CSDN博客
本博文将详细介绍如何对多个字段进行去重。本文的示例数据沿用上文单字段去重博文中的数据。
1、聚合获取多字段去重数量
# Count the distinct (age, gender) combinations among documents whose provience is 北京.
# The two values are joined with '#' so that adjacent values cannot run together and
# make two different pairs look like the same string.
GET person_info/_search
{
  "size": 0,
  "query": {
    "match": {
      "provience.keyword": "北京"
    }
  },
  "aggs": {
    "age_aggs": {
      "cardinality": {
        "script": {
          "lang": "painless",
          "source": "doc['age'].value + '#' + doc['gender'].value"
        }
      }
    }
  }
}
运行结果:
{
"took" : 10,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"age_aggs" : {
"value" : 3
}
}
}
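注意:cardinality 聚合基于 HyperLogLog++ 算法,返回的是近似值;当去重后的数量超过 precision_threshold(默认 3000)时,结果可能会有小小的误差。这一误差来自 cardinality 聚合本身,与是否使用 script 无关;另外 script 需要对每个文档逐一执行,数据量大时性能也会明显下降。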
2、聚合去重查询/过滤重复数据
2.1 聚合(Aggregations)
# Bucket the documents whose provience is 北京 by age (up to 10 buckets); no hits are returned.
GET person_info/_search
{
  "size": 0,
  "query": {
    "match": {
      "provience.keyword": "北京"
    }
  },
  "aggs": {
    "age_aggs": {
      "terms": {
        "size": 10,
        "field": "age"
      }
    }
  }
}
运行结果:
{
"took" : 80,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"age_aggs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 25,
"doc_count" : 2
},
{
"key" : 26,
"doc_count" : 1
},
{
"key" : 27,
"doc_count" : 1
}
]
}
}
}
2.2 top_hits指标聚合器
top_hits指标聚合器跟踪要聚合的最相关文档,可以有效地用于通过存储桶聚合器按某些字段对结果集进行分组。
直接使用top_hits返回全部字段:
# Bucket the documents whose provience is 北京 by age and, through a top_hits
# sub-aggregation, return one complete document for each age bucket.
GET person_info/_search
{
  "size": 0,
  "query": {
    "match": {
      "provience.keyword": "北京"
    }
  },
  "aggs": {
    "age_aggs": {
      "terms": {
        "size": 10,
        "field": "age"
      },
      "aggs": {
        "age_top": {
          "top_hits": {
            "size": 1,
            "sort": [
              {
                "age": {
                  "order": "desc"
                }
              }
            ]
          }
        }
      }
    }
  }
}
运行结果:
{
"took" : 647,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"age_aggs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 25,
"doc_count" : 2,
"age_top" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "hFHKl4YBPv2uoOpTcHMg",
"_score" : null,
"_source" : {
"id" : 1,
"name" : "刘一",
"age" : 25,
"gender" : "男",
"email" : "111@qq.com",
"provience" : "北京",
"address" : "北京市朝阳区",
"status" : "正常"
},
"sort" : [
25
]
}
]
}
}
},
{
"key" : 26,
"doc_count" : 1,
"age_top" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "ilHKl4YBPv2uoOpTcHMi",
"_score" : null,
"_source" : {
"id" : 1,
"name" : "陈二",
"age" : 26,
"gender" : "女",
"email" : "111@qq.com",
"provience" : "北京",
"address" : "北京市朝阳区",
"status" : "正常"
},
"sort" : [
26
]
}
]
}
}
},
{
"key" : 27,
"doc_count" : 1,
"age_top" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "hlHKl4YBPv2uoOpTcHMi",
"_score" : null,
"_source" : {
"id" : 1,
"name" : "张三",
"age" : 27,
"gender" : "男",
"email" : "111@qq.com",
"provience" : "北京",
"address" : "北京市朝阳区",
"status" : "正常"
},
"sort" : [
27
]
}
]
}
}
}
]
}
}
}
使用_source includes返回需要的字段:
# Bucket the documents whose provience is 北京 by age and return one document per
# bucket, limited to the fields listed under _source.includes.
GET person_info/_search
{
  "size": 0,
  "query": {
    "match": {
      "provience.keyword": "北京"
    }
  },
  "aggs": {
    "age_aggs": {
      "terms": {
        "size": 10,
        "field": "age"
      },
      "aggs": {
        "age_top": {
          "top_hits": {
            "size": 1,
            "_source": {
              "includes": [
                "name",
                "age",
                "gender",
                "provience",
                "address"
              ]
            },
            "sort": [
              {
                "age": {
                  "order": "desc"
                }
              }
            ]
          }
        }
      }
    }
  }
}
运行结果:
{
"took" : 115,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"age_aggs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 25,
"doc_count" : 2,
"age_top" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "hFHKl4YBPv2uoOpTcHMg",
"_score" : null,
"_source" : {
"address" : "北京市朝阳区",
"gender" : "男",
"provience" : "北京",
"name" : "刘一",
"age" : 25
},
"sort" : [
25
]
}
]
}
}
},
{
"key" : 26,
"doc_count" : 1,
"age_top" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "ilHKl4YBPv2uoOpTcHMi",
"_score" : null,
"_source" : {
"address" : "北京市朝阳区",
"gender" : "女",
"provience" : "北京",
"name" : "陈二",
"age" : 26
},
"sort" : [
26
]
}
]
}
}
},
{
"key" : 27,
"doc_count" : 1,
"age_top" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "hlHKl4YBPv2uoOpTcHMi",
"_score" : null,
"_source" : {
"address" : "北京市朝阳区",
"gender" : "男",
"provience" : "北京",
"name" : "张三",
"age" : 27
},
"sort" : [
27
]
}
]
}
}
}
]
}
}
}
2.3 使用script进行聚合
常规的 terms 聚合只能按单个字段分桶,无法直接按多个字段的组合进行去重,所以需要加入脚本:将 terms 中的内容修改如下,把三个字段(age、gender、name)用 # 拼接成一个 key。
# Bucket the documents whose provience is 北京 by a scripted key built from age,
# gender and name joined with '#', and return one document per bucket.
GET person_info/_search
{
  "size": 0,
  "query": {
    "match": {
      "provience.keyword": "北京"
    }
  },
  "aggs": {
    "age_aggs": {
      "terms": {
        "script": {
          "source": "doc['age'].value + '#' + doc['gender'].value + '#' + doc['name.keyword']",
          "lang": "painless"
        }
      },
      "aggs": {
        "age_top": {
          "top_hits": {
            "size": 1,
            "_source": {
              "includes": [
                "name",
                "age",
                "gender",
                "provience",
                "address"
              ]
            },
            "sort": [
              {
                "age": {
                  "order": "desc"
                }
              }
            ]
          }
        }
      }
    }
  }
}
运行结果:
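- key:拼接的条件(脚本中的 doc['name.keyword'] 没有取 .value,拼接进去的是字段值列表,所以 name 部分带有方括号,如 [刘一];改为 doc['name.keyword'].value 即可得到不带方括号的值)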
- doc_count:每组重复的数目
{
"took" : 52,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"age_aggs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "25#男#[刘一]",
"doc_count" : 1,
"age_top" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "hFHKl4YBPv2uoOpTcHMg",
"_score" : null,
"_source" : {
"address" : "北京市朝阳区",
"gender" : "男",
"provience" : "北京",
"name" : "刘一",
"age" : 25
},
"sort" : [
25
]
}
]
}
}
},
{
"key" : "25#男#[王五]",
"doc_count" : 1,
"age_top" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "iFHKl4YBPv2uoOpTcHMi",
"_score" : null,
"_source" : {
"address" : "北京市朝阳区",
"gender" : "男",
"provience" : "北京",
"name" : "王五",
"age" : 25
},
"sort" : [
25
]
}
]
}
}
},
{
"key" : "26#女#[陈二]",
"doc_count" : 1,
"age_top" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "ilHKl4YBPv2uoOpTcHMi",
"_score" : null,
"_source" : {
"address" : "北京市朝阳区",
"gender" : "女",
"provience" : "北京",
"name" : "陈二",
"age" : 26
},
"sort" : [
26
]
}
]
}
}
},
{
"key" : "27#男#[张三]",
"doc_count" : 1,
"age_top" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "hlHKl4YBPv2uoOpTcHMi",
"_score" : null,
"_source" : {
"address" : "北京市朝阳区",
"gender" : "男",
"provience" : "北京",
"name" : "张三",
"age" : 27
},
"sort" : [
27
]
}
]
}
}
}
]
}
}
}
参考博文:
Elasticsearch Painless Script入门教程 – CodeAntenna
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
文章由极客之音整理,本文链接:https://www.bmabk.com/index.php/post/142825.html