Elasticsearch composite group by queries across the documents - java

We have an elastic search document which has a dimension called city. Each document will have only one value for city field. I have a scenario where I need to query the person based on the city or cities.
Documents in Elasticsearch
{
person_id: "1",
property_value : 25000,
city: "Bangalore"
}
{
person_id: "2",
property_value : 100000,
city: "Bangalore"
}
{
person_id: "1",
property_value : 15000,
city: "Delhi"
}
Note: The aggregation should be performed on property_value and group by on person_id.
For eg.,
If I query for Bangalore it should return document with person_id 1 and 2.
If I query for both Delhi and Bangalore it should return this
{
person_id: "1",
property_value : 40000,
city: ["Bangalore", "Delhi"]
}

Looking at your data, I've come up with a sample mapping, request query and the response.
Mapping:
PUT my_index_city
{
"mappings": {
"properties": {
"person_id":{
"type": "keyword"
},
"city":{
"type":"text",
"fields":{
"keyword":{
"type": "keyword"
}
}
},
"property_value":{
"type": "long"
}
}
}
}
Sample Request:
Note that I've made use of the query_string query to filter the documents whose city is Bangalore or Delhi (the terms are combined with OR by default).
For aggregation I've made use of Terms Aggregation on person_id and Sum Aggregation on the property_value field.
POST my_index_city/_search
{
"size": 0,
"query": {
"query_string": {
"default_field": "city",
"query": "Bangalore Delhi"
}
},
"aggs": {
"my_person": {
"terms": {
"field": "person_id",
"size": 10,
"min_doc_count": 2
},
"aggs": {
"sum_property_value": {
"sum": {
"field": "property_value"
}
}
}
}
}
}
Sample Response:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"my_person" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1",
"doc_count" : 2,
"sum_property_value" : {
"value" : 40000.0
}
}
]
}
}
}
Note: This query would only work if the person_id has multiple documents but each document having unique/different city value.
What I mean to say is, if the person_id has multiple documents with same city, the aggregation would not give right answer.
Updated Answer:
There is no direct way to achieve what you are looking for unless you modify the mapping. What I've done is, made use of nested datatype and ingested all the documents for person_id as a single document.
Mapping:
PUT my_sample_city_index
{
  "mappings": {
    "properties": {
      "person_id": {
        "type": "keyword"
      },
      "property_details": {
        "type": "nested",
        "properties": {
          "city": {
            "type": "text",
            "fields": {
              "keyword": {
                "type": "keyword"
              }
            }
          },
          "property_value": {
            "type": "long"
          }
        }
      }
    }
  }
}
Sample Documents:
POST my_sample_city_index/_doc/1
{
"person_id": "1",
"property_details":[
{
"property_value" : 25000,
"city": "Bangalore"
},
{
"property_value" : 15000,
"city": "Delhi"
}
]
}
POST my_sample_city_index/_doc/2
{
"person_id": "2",
"property_details":[
{
"property_value" : 100000,
"city": "Bangalore"
}
]
}
Aggregation Query:
POST my_sample_city_index/_search
{
  "size": 0,
  "query": {
    "nested": {
      "path": "property_details",
      "query": {
        "query_string": {
          "default_field": "property_details.city",
          "query": "bangalore delhi"
        }
      }
    }
  },
  "aggs": {
    "persons": {
      "terms": {
        "field": "person_id",
        "size": 10
      },
      "aggs": {
        "property_sum": {
          "nested": {
            "path": "property_details"
          },
          "aggs": {
            "total_sum": {
              "sum": {
                "field": "property_details.property_value"
              }
            }
          }
        }
      }
    }
  }
}
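Note that I've first applied a Terms Aggregation on person_id, inside which I've applied a Nested Aggregation on property_details, and within that a Sum metric aggregation on property_value.
This should also work correctly if a person has multiple properties in the same city. Be aware, however, that the nested query only selects which persons match; the nested aggregation then sums every property of those persons, including properties in cities that were not queried. If only the queried cities should be summed, wrap the sum in a filter aggregation on property_details.city inside the nested aggregation.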
Response:
{
"took" : 31,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"persons" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1",
"doc_count" : 1,
"property_sum" : {
"doc_count" : 2,
"total_sum" : {
"value" : 40000.0
}
}
},
{
"key" : "2",
"doc_count" : 1,
"property_sum" : {
"doc_count" : 1,
"total_sum" : {
"value" : 100000.0
}
}
}
]
}
}
}
Let me know if this helps!

Related

Opensearch - get inner aggregations from aggregations using opensearch-java client

There is this OpenSearch query, constructed using opensearch-java:
GET eventsearch/_search
{
"aggregations": {
"WEB": {
"aggregations": {
"eventDate": {
"date_histogram": {
"extended_bounds": {
"max": "2022-12-01T00:00:00Z",
"min": "2022-01-01T00:00:00Z"
},
"field": "eventDate",
"fixed_interval": "1d",
"min_doc_count": 0
}
}
},
"filter": {
"term": {
"channel": {
"value": "WEB",
"case_insensitive": true
}
}
}
}
},
"query": {
"bool": {
"filter": [
{
"range": {
"eventDate": {
"from": "2022-01-01T00:00:00Z",
"to": "2022-12-01T00:00:00Z"
}
}
}
],
"must": [
{
"match_all": {}
}
]
}
},
"size": 0
}
Running query, the response is this:
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 26,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"WEB" : {
"doc_count" : 25,
"eventDate" : {
"buckets" : [
{
"key_as_string" : "2022-01-01T00:00:00.000Z",
"key" : 1640995200000,
"doc_count" : 0
},
{
"key_as_string" : "2022-01-02T00:00:00.000Z",
"key" : 1641081600000,
"doc_count" : 0
},
{
"key_as_string" : "2022-01-03T00:00:00.000Z",
"key" : 1641168000000,
"doc_count" : 0
},
{
"key_as_string" : "2022-01-04T00:00:00.000Z",
"key" : 1641254400000,
"doc_count" : 0
},
....................
]
}
}
}
}
In java I need to perform this query and get the results from there.
But after calling opensearchclient.search and then the "aggregations" method on the response, I receive this (image attached).
If I try to get the "WEB" from the Map, there is no other "eventDate" aggregation to fetch.
Is there a way to fetch this inner aggregation using opensearch-java client? I had no luck with documentation.
opensearch-java 2.1.0
There is currently no feature like this; there is an open bug for it, with the fix merged but not yet released.
https://github.com/opensearch-project/opensearch-java/issues/197

Is it possible to create a map in runtime mode, which will be filled in passing through all documents and returned at the end ES

For ex: I have 2 documents with this body:
{
"id": "doc_one",
"name": "test_name",
"date_creation": "some_date_cr_1",
"date_updation": "some_date_up_1"
}
And the second doc:
{
"id": "doc_two",
"name": "test_name",
"date_creation": "some_date_cr_2",
"date_updation": "some_date_up_2"
}
What I want to do: create two runtime fields, or a Map('date_creation', count_of_docs_where_field_is_not_null_AND_the_condition_is_met).
For example: when I process the 1st doc, date_creation IS NOT NULL and the condition startDate<=date_creation<=endDate is met, so I create a counter count = 0 and do count++ each time this case occurs. Once I have gone through all the docs, I put the final count value into the map as the result: Map('date_creation', final_count), and do the same for the other field in the same map.
I tried to use a script, but it returns a Map for each doc, for example:
{
"_index": "my_index_001",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"fields": {
"my_doubled_field": [
{
"NEW": 2
}
]
}
},
{
"_index": "my_index_001",
"_type": "_doc",
"_id": "2",
"_score": 1.0,
"fields": {
"my_doubled_field": [
{
"NEW": 2
}
]
}
}
I have indexed the 3 documents below, one of which doesn't have the date_creation field:
POST sample/_doc
{
"id": "doc_two",
"name": "test_name",
"date_updation": "some_date_up_2"
}
POST sample/_doc
{
"id": "doc_one",
"name": "test_name",
"date_creation": "some_date_cr_1",
"date_updation": "some_date_up_1"
}
POST sample/_doc
{
"id": "doc_two",
"name": "test_name",
"date_creation": "some_date_cr_2",
"date_updation": "some_date_up_2"
}
Now you can use filter aggregation from elasticsearch as shown below:
{
"size": 0,
"aggs": {
"date_creation": {
"filter": {
"range": {
"date_creation": {
"gte": "2020-01-09T10:20:10"
}
}
}
},
"date_updation": {
"filter": {
"range": {
"date_updation": {
"gte": "2020-01-09T10:20:10"
}
}
}
}
}
}
Response:
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"date_updation" : {
"meta" : { },
"doc_count" : 3
},
"date_creation" : {
"meta" : { },
"doc_count" : 2
}
}
You can see that the date_updation field is present in 3 docs, so its count is 3, and the date_creation field is present in 2 docs, so its count is 2.

must_not is not giving expected result in Elasticsearch for empty field

This is my sample es index document:
"hits" : [
{
"_index" : "project_note",
"_type" : "project_note",
"_id" : "19",
"_score" : 1.0,
"_source" : {
"createTime" : "2021-10-04T13:43:55.330",
"createTimeInMs" : 1633333435330,
"createdBy" : "test",
"editTime" : "2021-10-04T13:43:55.330",
"editTimeInMs" : 1633333435330,
"editedBy" : "test",
"versionId" : 1,
"id" : "19",
"organizationId" : "28",
"accessLevel" : "PUBLIC",
"status" : "ACTIVE",
"projectId" : "95",
"userId" : 129,
"noteType" : "SYSTEM_GENERATED",
"projectDemographicLogId" : "1"
},
{
"_index" : "project_note",
"_type" : "project_note",
"_id" : "19",
"_score" : 1.0,
"_source" : {
"createTime" : "2021-10-04T13:43:55.330",
"createTimeInMs" : 1633333435330,
"createdBy" : "test",
"editTime" : "2021-10-04T13:43:55.330",
"editTimeInMs" : 1633333435330,
"editedBy" : "test",
"versionId" : 1,
"id" : "19",
"organizationId" : "28",
"accessLevel" : "PUBLIC",
"status" : "ACTIVE",
"projectId" : "95",
"userId" : 129
}
]
In the first doc, it has noteType but in the second, I don't have that field stored in db.
I want to exclude the documents where noteType==null or noteType is absent.
But, I am getting only the docs which have noteType="SYSTEM_GENERATED"
My approach:
{
"query":
{
"bool" : {
"must" : [
{
"term" : {
"projectId" : {
"value" : "95",
"boost" : 1.0
}
}
},
{
"range" : {
"createTimeInMs" : {
"from" : null,
"to" : 1633594455000,
"include_lower" : true,
"include_upper" : true,
"boost" : 1.0
}
}
}
],
"must_not" : [
{
"term" : {
"noteType" : {
"value" : "SYSTEM_GENERATED",
"boost" : 1.0
}
}
}
],
"adjust_pure_negative" : true,
"boost" : 1.0
}
}
}
Equivalent java code:
// Build the bool query shown above: two mandatory clauses plus an optional exclusion.
BoolQueryBuilder queryBuilder= QueryBuilders.boolQuery();
// must: exact match on the requested project id.
queryBuilder.must(QueryBuilders.termQuery("projectId", requestInfo.getProjectId()));
// must: createTimeInMs less than or equal to the requested timestamp (no lower bound).
queryBuilder.must(rangeQuery("createTimeInMs").lte(requestInfo.getCreateTimeInMs()));
// must_not: unless log notes are requested, exclude documents whose noteType
// equals Defs.SYSTEM_NOTE_TYPE.
// NOTE(review): presumably Defs.SYSTEM_NOTE_TYPE is "SYSTEM_GENERATED", as in the JSON above — confirm.
if(!requestInfo.isIncludeLog()) {
queryBuilder.mustNot(QueryBuilders.termQuery("noteType", Defs.SYSTEM_NOTE_TYPE));
}
If only the must_not part of the query is used (excluding the must part), with the term query targeting the noteType.keyword sub-field:
{
"query": {
"bool": {
"must_not": [
{
"term": {
"noteType.keyword": {
"value": "SYSTEM_GENERATED",
"boost": 1.0
}
}
}
],
"adjust_pure_negative": true,
"boost": 1.0
}
}
}
The search result is similar to what you expect to get
"hits": [
{
"_index": "69477995",
"_type": "_doc",
"_id": "2",
"_score": 0.0,
"_source": {
"createTime": "2021-09-26T15:54:08.373",
"createTimeInMs": 1632650048373,
"createdBy": "test",
"editTime": "2021-09-26T15:54:08.373",
"editTimeInMs": 1632650048373,
"editedBy": "test",
"versionId": 1,
"id": "18",
"note": "note-1, simple note ",
"organizationId": "28",
"accessLevel": "PUBLIC",
"status": "ACTIVE",
"taskId": "5",
"userId": 129
}
}
]

How to return all fields instead of just id and count after sortByCount operation in Mongodb/Java?

I need to do sortByCount and return all the fields instead of just _id and count.
sortByCount returns:
{ "_id" : "1", "count" : 4 }
{ "_id" : "2", "count" : 3 }
{ "_id" : "3", "count" : 2 }
{ "_id" : "4", "count" : 2 }
{ "_id" : "5", "count" : 1 }
But, I need a complete document like below:
{
"_id": 1,
"title": "The Pillars of Society",
"artist": "Grosz",
"year": 1926,
"tags": ["painting", "satire", "Expressionism", "caricature"]
} {
"_id": 2,
"title": "Melancholy III",
"artist": "Munch",
"year": 1902,
"tags": ["woodcut", "Expressionism"]
} {
"_id": 3,
"title": "Dancer",
"artist": "Miro",
"year": 1925,
"tags": ["oil", "Surrealism", "painting"]
} {
"_id": 4,
"title": "The Great Wave off Kanagawa",
"artist": "Hokusai",
"tags": ["woodblock", "ukiyo-e"]
} {
"_id": 5,
"title": "The Persistence of Memory",
"artist": "Dali",
"year": 1931,
"tags": ["Surrealism", "painting", "oil"]
}
Is there any way to replace root after sortByCount? In Java, I don't see any push method after sortByCount
$sortByCount is essentially a combination of $group followed by $sort on count field of group stage. If you really want the entire documents, you can try this:
// Hand-rolled equivalent of $sortByCount that also keeps selected fields:
// group, count the members of each group, collect their title/artist,
// then order the groups by descending count.
db.collection.aggregate([
    { "$group": {
        "_id": { "id": "$_id" },
        "count": { "$sum": 1 },
        // Keep the fields needed from every document in the group.
        "reqItems": { "$push": { "title": "$title", "artist": "$artist" } }
    }},
    { "$sort": { "count": -1 } }
])
Playground link

Get multiple fields count from one query in MongoDB?

I have a collection of events its structure is as follows :
{
"_id" : ObjectId("537b3ff288f4ca2f471afcae"),
"Name" : "PREMISES MAP DELETED",
"ScreenName" : "AccessPointActivity",
"Timestamp" : NumberLong("1392113758000"),
"EventParams" : "null",
"TracInfo" : {
"ApplicationId" : "fa41f204bfc711e3b9f9c8cbb8c502c4",
"DeviceId" : "2_1VafJVPu4yfdbMWO1XGROjK6iQZhq4hAVCQL837W",
"UserId" : "pawan",
"SessionId" : "a8UHE16mowNwNGyuLXbW",
"WiFiAP" : "null",
"WiFiStrength" : 0,
"BluetoothID" : "null",
"BluetoothStrength" : 0,
"NetworkType" : "null",
"NetworkSubType" : "null",
"NetworkCarrier" : "Idea",
"Age" : 43,
"Gender" : "Female",
"OSVersion" : "16",
"Manufacturer" : "samsung",
"Resolution" : "600*976",
"Platform" : "Android",
"Latitude" : 40.42,
"Longitude" : -74,
"City" : "Monmouth County",
"CityLowerCase" : "monmouth county",
"Country" : "United States",
"CountryLowerCase" : "united states",
"Region" : "New Jersey",
"RegionLowerCase" : "new jersey",
"Time_zone" : "null",
"PinCode" : "07732",
"Locale" : ", Paradise Trailer Park",
"Accuracy" : 0,
"Timestamp" : NumberLong("1392113758000")
}
}
There are many events on different screens.
My expected output is as follows :
{
ApplicationId:"fa41f204bfc711e3b9f9c8cbb8c502c4",
EventName:"PREMISES MAP DELETED",
Eventcount:300,
ScreenviewCount:20,
DeviceCount:10,
UserCount:3
}
EventCount : It is count of EventName
ScreenviewCount : It is the count of distinct screenName distinct per session
DeviceCount : It is the count of distinct deviceId
UserCount : It is the count of distinct UserId
There will be multiple events on multiple screens (ScreenName).
Currently I am using the following approach:
Using aggregation to get each event name and it count
eg :
{
_id:
{
ApplicationId:"fa41f204bfc711e3b9f9c8cbb8c502c4",
EventName:"PREMISES MAP DELETED"
}
EventCount:300
}
For each event name from above aggregation result I call following queries in while loop until aggregation output has documents:
a) Distinct query using eventName from aggregation output for screenview count(on event collection).
b) Distinct query eventName from aggregation output for device count(on event collection).
c) Distinct query eventName from aggregation output for user count(on event collection).
The problem is that it is slow, as it runs 3 distinct queries for each result of the aggregation output.
Is there any way to do it in a single aggregation call, or in some other way?
Thank you in advance!!!
The general case here that you seemed to have missed is that to get the "distinct" values of various fields in your document under the "event" totals you can use the $addToSet operator.
A "set" by definition has all of its values "unique/distinct", so you just want to hold all those possible values in a "set" for your grouping level and then get the "size" of the array produced, which is exactly what the $size operator introduced in MongoDB 2.6 does.
// Count events plus the distinct screen views, devices and users for each
// (ApplicationId, EventName) pair in a single aggregation call.
// Requires MongoDB 2.6+ for the $size operator.
db.collection.aggregate([
    // Collect the distinct values behind each counter into sets; $addToSet
    // discards duplicates, so each set's length is the distinct count.
    { "$group": {
        "_id": {
            "ApplicationId": "$TracInfo.ApplicationId",
            "EventName": "$Name"
        },
        // A screen view is counted once per (ScreenName, SessionId) pair.
        "oScreenViewCount": {
            "$addToSet": {
                "ScreenName": "$ScreenName",
                "SessionId": "$TracInfo.SessionId"
            }
        },
        "oDeviceCount": { "$addToSet": "$TracInfo.DeviceId" },
        "oUserCount": { "$addToSet": "$TracInfo.UserId" },
        "oEventCount": { "$sum": 1 }
    }},
    // Field names are case-sensitive: every "$o..." reference here must match
    // the accumulator name from the $group stage exactly, otherwise the
    // projected field is silently omitted.
    { "$project": {
        "_id": 0,
        "ApplicationId": "$_id.ApplicationId",
        "EventName": "$_id.EventName",
        "EventCount": "$oEventCount",
        "ScreenViewCount": { "$size": "$oScreenViewCount" },
        "DeviceCount": { "$size": "$oDeviceCount" },
        "UserCount": { "$size": "$oUserCount" }
    }}
])
Versions pre MongoDB 2.6 require a little more work, using $unwind and $group to count the arrays:
// Pre-2.6 variant of the same report: without $size, each set is turned into
// a count by unwinding it and re-grouping with { "$sum": 1 }.
db.collection.aggregate([
    // Collect the distinct values behind each counter into sets.
    { "$group": {
        "_id": {
            "ApplicationId": "$TracInfo.ApplicationId",
            "EventName": "$Name"
        },
        // A screen view is counted once per (ScreenName, SessionId) pair.
        "oScreenViewCount": {
            "$addToSet": {
                "ScreenName": "$ScreenName",
                "SessionId": "$TracInfo.SessionId"
            }
        },
        "oDeviceCount": { "$addToSet": "$TracInfo.DeviceId" },
        "oUserCount": { "$addToSet": "$TracInfo.UserId" },
        "oEventCount": { "$sum": 1 }
    }},

    // Replace the screen-view set with its size; carry the rest unchanged.
    { "$unwind": "$oScreenViewCount" },
    { "$group": {
        "_id": "$_id",
        "oScreenViewCount": { "$sum": 1 },
        "oDeviceCount": { "$first": "$oDeviceCount" },
        "oUserCount": { "$first": "$oUserCount" },
        "oEventCount": { "$first": "$oEventCount" }
    }},

    // Replace the device set with its size. The unwound elements are id
    // strings, so count them with 1 rather than summing the values.
    { "$unwind": "$oDeviceCount" },
    { "$group": {
        "_id": "$_id",
        "oScreenViewCount": { "$first": "$oScreenViewCount" },
        "oDeviceCount": { "$sum": 1 },
        "oUserCount": { "$first": "$oUserCount" },
        "oEventCount": { "$first": "$oEventCount" }
    }},

    // Replace the user set with its size, again counting one per element.
    { "$unwind": "$oUserCount" },
    { "$group": {
        "_id": "$_id",
        "oScreenViewCount": { "$first": "$oScreenViewCount" },
        "oDeviceCount": { "$first": "$oDeviceCount" },
        "oUserCount": { "$sum": 1 },
        "oEventCount": { "$first": "$oEventCount" }
    }},

    // Flatten the grouping key and give the counters their final names.
    { "$project": {
        "_id": 0,
        "ApplicationId": "$_id.ApplicationId",
        "EventName": "$_id.EventName",
        "EventCount": "$oEventCount",
        "ScreenViewCount": "$oScreenViewCount",
        "DeviceCount": "$oDeviceCount",
        "UserCount": "$oUserCount"
    }}
])
The end usage of $project in the second listing and all the general usage of the "o" prefixed names is really just for prettying up the result at the end and making sure the output field order is the same as in your sample result.
As a general disclaimer, your question lacks the information to determine the exact fields or combinations that are used for these totals, but the principles and approach are sound and should be near enough to the same implementation.
So essentially, you are getting the "distinct" values within the "group" by using $addToSet for whatever the field or combination is and then you are determining the "count" of those "sets" by whatever means is available to you.
Much better than issuing many queries and merging results in client code.

Categories

Resources