问题描述
我已经花了一个星期,试图通过 Elasticsearch 的嵌套聚合(nested aggregation)从索引中获取正确的数据。下面是我的索引映射和两个示例文档。我想要得到的是:
- 匹配字段 xforms.sentence.tokens.value 等于 24 的所有文档
- 在这些匹配的文档中,按 xforms.sentence.tokens.tag 分组,统计 xforms.sentence.tokens.value 等于 24 的 token 数量
例如,对于下面插入的示例文档,我期望的输出是:
{"JJ": 1, "NN": 1}
{
"_doc": {
"_meta": {},"_source": {},"properties": {
"originalText": {
"type": "text"
},"testDataId": {
"type": "text"
},"xforms": {
"type": "nested","properties": {
"sentence": {
"type": "nested"
},"predicate": {
"type": "nested"
}
}
},"corpusId": {
"type": "text"
},"row": {
"type": "text"
},"batchId": {
"type": "text"
},"processor": {
"type": "text"
}
}
}
}
插入的示例文档如下:
{
"_id": "28","_source": {
"testDataId": "5e97e9bef033448b893e485baa0fdf15","originalText": "Some text with the word 24","xforms": [{
"sentence": {
"tokens": [{
"lemma": "Some","index": 1,"after": " ","tag": "JJ","value": "Some"
},{
"lemma": "text","index": 2,"tag": "NN","value": "text"
},{
"lemma": "with","index": 3,"value": "with"
},{
"lemma": "the","index": 4,"after": "","tag": "CD","value": "the"
},{
"lemma": "word","index": 5,"tag": "CC","value": "word"
},{
"lemma": "24","index": 6,"value": "24"
}
],"type": "RAW"
},"originalSentence": "Some text with the word 24 in it","id": "e724611d8c024bcb8f0158b60e3df87e"
}]
}
},{
"_id": "56","_source": {
"testDataId": "5e97e9bef033448b893e485baa0fad15","originalText": "24 word","xforms": [{
"sentence": {
"tokens": [{
"lemma": "24","value": "24"
},{
"lemma": "word","value": "word"
}
],"originalSentence": "24 word","id": "e724611d8c024bcb8f0158b60e3d123"
}]
}
}
解决方法
在 @Gibbs 的回答基础上补充一点:@N Kiram,您还需要将 tokens 也设置为 nested 类型:
{
"xforms":{
"type":"nested","properties":{
"sentence":{
"type":"nested","properties":{
"tokens":{ <----
"type":"nested"
}
}
},"predicate":{
"type":"nested"
}
}
}
}
只有这样,您的聚合(agg)才会得到正确的计数:
{
"aggregations":{
"xforms":{
"doc_count":8,"inner":{
"doc_count":2,"tag_count":{
"doc_count_error_upper_bound":0,"sum_other_doc_count":0,"buckets":[
{
"key":"JJ","doc_count":1
},{
"key":"NN","doc_count":1
}
]
}
}
}
}
}
附注:必须重新索引(reindex)数据,更改后的映射才会生效。
所用的嵌套聚合查询如下:
{
"aggs": {
"xforms": {
"nested": { //Nested aggregation
"path": "xforms.sentence"
},"aggs": {
"inner": { //Counting only within the matching doc
"filter": {
"bool": {
"filter": { //Filtering docs with value=24
"terms": {
"xforms.sentence.tokens.value": [
"24"
]
}
}
}
},"aggs" : {
"tag_count":{ //On filtered doc,doing terms aggregation on tag's keyword version as tag is of type text
"terms":{
"field":"xforms.sentence.tokens.tag.keyword"
}
}
}
}
}
}
}
}
该查询返回如下输出:
"aggregations": {
"xforms": {
"doc_count": 2,"inner": {
"doc_count": 2,"tag_count": {
"doc_count_error_upper_bound": 0,"sum_other_doc_count": 0,"buckets": [
{
"key": "JJ","doc_count": 2
},{
"key": "NN","doc_count": 2
},{
"key": "CC","doc_count": 1
},{
"key": "CD","doc_count": 1
}
]
}
}
}
}