1、SQL去重
在SQL中,用dinstinct语句进行去重:
- 获取去重后的结果:SELECT DISTINCT name, sex FROM person;
- 统计去重后的数量:SELECT COUNT(DISTINCT name, sex) FROM person;
2、ES数据构建
2.1 创建索引
from elasticsearch import Elasticsearch
# 连接es
es = Elasticsearch(hosts=["192.168.124.49:9200"], sniffer_timeout=60, timeout=30)
body = {
"mappings": {
"properties": {
"id": {
"type": "integer"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"age": {
"type": "integer"
},
"gender": {
"type": "keyword"
},
"email": {
"type": "text"
},
"province": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"address": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"state": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
# 创建 index
es.indices.create(index="person_info", body=body)
2.2 查看索引
2.3 使用kibana批量生成数据
POST person_info/_bulk
{"index": {"_index": "person_info"}}
{"id": 1, "name": "刘一", "age": 25, "gender": "男", "email": "111@qq.com", "provience": "北京", "address": "北京市朝阳区", "status": "正常"}
{"index": {"_index": "person_info"}}
{"id": 1, "name": "陈二", "age": 26, "gender": "女", "email": "111@qq.com", "provience": "山东", "address": "山东省青岛市", "status": "正常"}
{"index": {"_index": "person_info"}}
{"id": 1, "name": "张三", "age": 27, "gender": "男", "email": "111@qq.com", "provience": "北京", "address": "北京市朝阳区", "status": "正常"}
{"index": {"_index": "person_info"}}
{"id": 1, "name": "李四", "age": 28, "gender": "男", "email": "111@qq.com", "provience": "山东", "address": "山东省济南市", "status": "正常"}
{"index": {"_index": "person_info"}}
{"id": 1, "name": "王五", "age": 25, "gender": "男", "email": "111@qq.com", "provience": "北京", "address": "北京市朝阳区", "status": "正常"}
{"index": {"_index": "person_info"}}
{"id": 1, "name": "刘一", "age": 26, "gender": "男", "email": "111@qq.com", "provience": "山东", "address": "山东省青岛市", "status": "正常"}
{"index": {"_index": "person_info"}}
{"id": 1, "name": "陈二", "age": 26, "gender": "女", "email": "111@qq.com", "provience": "北京", "address": "北京市朝阳区", "status": "正常"}
2.4 查看生成的数据
3、ES获取去重结果
3.1 collapse折叠功能(ES5.3之后支持)
- 推荐。原因:性能高,占内存小
注意:去重的字段不能是text类型。如果xxxfield的mapping要有keyword,且通过xxxfield.keyword去重。
注意:如果去重字段是其他可以直接去重的类型,比如:数字类型、keyword、日期等,则直接用字段名就可以。即:如果本处xxxfield是keyword,则xxxfield.keyword处写成xxxfield就行。
查询province为北京的信息:
GET person_info/_search
{
"query": {
"match": {
"provience.keyword": "北京"
}
}
}
运行结果:
{
"took" : 15,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : 0.5753642,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "hFHKl4YBPv2uoOpTcHMg",
"_score" : 0.5753642,
"_source" : {
"id" : 1,
"name" : "刘一",
"age" : 25,
"gender" : "男",
"email" : "111@qq.com",
"provience" : "北京",
"address" : "北京市朝阳区",
"status" : "正常"
}
},
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "hlHKl4YBPv2uoOpTcHMi",
"_score" : 0.5753642,
"_source" : {
"id" : 1,
"name" : "张三",
"age" : 27,
"gender" : "男",
"email" : "111@qq.com",
"provience" : "北京",
"address" : "北京市朝阳区",
"status" : "正常"
}
},
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "iFHKl4YBPv2uoOpTcHMi",
"_score" : 0.5753642,
"_source" : {
"id" : 1,
"name" : "王五",
"age" : 25,
"gender" : "男",
"email" : "111@qq.com",
"provience" : "北京",
"address" : "北京市朝阳区",
"status" : "正常"
}
},
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "ilHKl4YBPv2uoOpTcHMi",
"_score" : 0.5753642,
"_source" : {
"id" : 1,
"name" : "陈二",
"age" : 26,
"gender" : "女",
"email" : "111@qq.com",
"provience" : "北京",
"address" : "北京市朝阳区",
"status" : "正常"
}
}
]
}
}
查询province为北京,且根据年龄去重的信息:
# collapse获取去重结果
GET person_info/_search
{
"query": {
"match": {
"provience.keyword": "北京"
}
},
"collapse": {
"field": "age"
}
}
运行结果:
{
"took" : 14,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "hFHKl4YBPv2uoOpTcHMg",
"_score" : 0.5753642,
"_source" : {
"id" : 1,
"name" : "刘一",
"age" : 25,
"gender" : "男",
"email" : "111@qq.com",
"provience" : "北京",
"address" : "北京市朝阳区",
"status" : "正常"
},
"fields" : {
"age" : [
25
]
}
},
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "hlHKl4YBPv2uoOpTcHMi",
"_score" : 0.5753642,
"_source" : {
"id" : 1,
"name" : "张三",
"age" : 27,
"gender" : "男",
"email" : "111@qq.com",
"provience" : "北京",
"address" : "北京市朝阳区",
"status" : "正常"
},
"fields" : {
"age" : [
27
]
}
},
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "ilHKl4YBPv2uoOpTcHMi",
"_score" : 0.5753642,
"_source" : {
"id" : 1,
"name" : "陈二",
"age" : 26,
"gender" : "女",
"email" : "111@qq.com",
"provience" : "北京",
"address" : "北京市朝阳区",
"status" : "正常"
},
"fields" : {
"age" : [
26
]
}
}
]
}
}
3.2 字段聚合+top_hits聚合
- 不推荐。原因:性能差,占内存大
查询province为北京,且根据年龄去重的信息:
# 聚合获取去重结果
GET person_info/_search
{
"query": {
"match": {
"provience.keyword": "北京"
}
},
"size": 0,
"aggs": {
"age_aggs": {
"terms": {
"field": "age",
"size": 10
},
"aggs": {
"age_top": {
"top_hits": {
"sort": [{
"age": {
"order": "desc"
}
}],
"size": 1
}
}
}
}
}
}
运行结果:
{
"took" : 230,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"age_aggs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 25,
"doc_count" : 2,
"age_top" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "hFHKl4YBPv2uoOpTcHMg",
"_score" : null,
"_source" : {
"id" : 1,
"name" : "刘一",
"age" : 25,
"gender" : "男",
"email" : "111@qq.com",
"provience" : "北京",
"address" : "北京市朝阳区",
"status" : "正常"
},
"sort" : [
25
]
}
]
}
}
},
{
"key" : 26,
"doc_count" : 1,
"age_top" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "ilHKl4YBPv2uoOpTcHMi",
"_score" : null,
"_source" : {
"id" : 1,
"name" : "陈二",
"age" : 26,
"gender" : "女",
"email" : "111@qq.com",
"provience" : "北京",
"address" : "北京市朝阳区",
"status" : "正常"
},
"sort" : [
26
]
}
]
}
}
},
{
"key" : 27,
"doc_count" : 1,
"age_top" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "hlHKl4YBPv2uoOpTcHMi",
"_score" : null,
"_source" : {
"id" : 1,
"name" : "张三",
"age" : 27,
"gender" : "男",
"email" : "111@qq.com",
"provience" : "北京",
"address" : "北京市朝阳区",
"status" : "正常"
},
"sort" : [
27
]
}
]
}
}
}
]
}
}
}
4、ES统计去重后的数量
- 聚合+cardinality聚合函数
查询province为北京,且根据年龄去重的数量:
# 聚合获取去重数量
GET person_info/_search
{
"query": {
"match": {
"provience.keyword": "北京"
}
},
"size": 0,
"aggs": {
"age_aggs": {
"cardinality": {
"field": "age"
}
}
}
}
运行结果:
{
"took" : 68,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"age_aggs" : {
"value" : 3
}
}
}
参考博文:
ElasticSearch–去重查询/根据字段去重–方法/实例_IT利刃出鞘的博客-CSDN博客_elasticsearch统计去重后的数量准确值
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
文章由极客之音整理,本文链接:https://www.bmabk.com/index.php/post/142826.html