"match or null" query in elasticsearch - java

Suppose I have an index with all movies released from 2010 to 2019;
How can I translate this query in SQL to ElasticSearch?
Select *
From movies
Where
releaseDate between '2018-01-01' and '2019-01-01' and
gender is like 'action' and
(mainActorId = 42 or mainActorId is null)
I want all the action movies from 2018 with an specific main actor or no main actor at all. How would I translate that to an ElasticSearch query?
From what I've read so far in the documentation, I could use something like this:
{
"size":0,
"query":{
"bool":{
"must":[
{
"range":{
"releaseDate":{
"from":"2018-01-01T00:00:01.531Z",
"to":"2019-01-01T00:00:01.531Z",
"include_lower":true,
"include_upper":true,
"boost":1.0
}
}
},
{
"terms":{
"gender":[
"action"
],
"boost":1.0
}
},
{
"terms":{
"mainActorId":[
42,
56
],
"boost":1.0
}
}
],
"must_not":[
{
"exists":{
"field":"mainActorId",
"boost":1.0
}
}
],
"adjust_pure_negative":true,
"boost":1.0
}
}
}
But that's not giving me any hits, even though there are action movies released in 2018 with the main actor I want (or no main actors at all). If I take away the "must_not exist" clause, the query works fine and gives me the action movies with the main actors I want, but I also want action movies with no main actors...

Great start so far!! You're almost there, see the query below, it should do just what you expect:
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"range": {
"releaseDate": {
"from": "2018-01-01T00:00:01.531Z",
"to": "2019-01-01T00:00:01.531Z",
"include_lower": true,
"include_upper": true,
"boost": 1
}
}
},
{
"terms": {
"gender": [
"action"
],
"boost": 1
}
}
],
"should": [
{
"bool": {
"must_not": [
{
"exists": {
"field": "mainActorId"
}
}
]
}
},
{
"terms": {
"mainActorId": [
42,
56
]
}
}
],
"minimum_should_match": 1,
"adjust_pure_negative": true,
"boost": 1
}
}
}

Related

ElastiSearch combine result set of two queries

I have below data structure in ElastiSearch.
[{
"name": "Kapil",
"age": 32,
"hobbies": ["Cricket", "Football", "Swimming"]
},
{
"name": "John",
"age": 33,
"hobbies": ["Baseball", "Football", "Swimming"]
},
{
"name": "Vick",
"age": 30,
"hobbies": ["Baseball", "Karate", "Swimming"]
}]
I want to get all records from the data in following order:
Get all users which has Football as hobby and sort them by age desc.
Get all other users sort by age desc.
So expecting result in following order John, Kapil and Vick.
I used below query to get result for #1.
{
"size": 500,
"query": {
"bool": {
"must": [
{
"match_phrase": {
"hobbies.keyword": "Football"
}
}
]
}
},
"sort": [
{
"age": {
"order": "desc"
}
}
]
}
and used below for point #2
{
"size": 500,
"query": {
"bool": {
"must_not": [
{
"match_phrase": {
"hobbies.keyword": "Football"
}
}
]
}
},
"sort": [
{
"age": {
"order": "desc"
}
}
]
}
With above, I am not able to maintain the paging logic. It also requires me to execute both queries separately. Can Someone please help how to achieve this?
Try this out:
{
"query": {
"bool": {
"must": {
"match_all": {} <-- retrieve all docs
},
"should": { <-- give higher score to docs that match this clause
"match_phrase": {
"hobbies.keyword": "Football"
}
}
}
},
"sort": [
"_score", <-- sort by doc score first
{
"age": { <-- when score is equal then sort by age
"order": "desc"
}
}
]
}

ElasticSearch sorting isn't sorting by field

I'm trying to perform a field sort on the specified field but to no avail. The query keeps returning the same position when I run the script.
Here is the ElasticSearch script:
{
"from": 0,
"size": 10,
"timeout": "60s",
"query": {
"bool": {
"must": [
{
"bool": {
"must": [
{
"query_string": {
"query": "random",
"fields": [],
"type": "best_fields",
"default_operator": "or",
"max_determinized_states": 10000,
"enable_position_increments": true,
"fuzziness": "AUTO",
"fuzzy_prefix_length": 0,
"fuzzy_max_expansions": 50,
"phrase_slop": 0,
"escape": false,
"auto_generate_synonyms_phrase_query": true,
"fuzzy_transpositions": true,
"boost": 1
}
},
{
"nested": {
"query": {
"bool": {
"must": [
{
"match": {
"reviews.source": {
"query": "TEST",
"operator": "AND",
"prefix_length": 0,
"max_expansions": 50,
"fuzzy_transpositions": true,
"lenient": false,
"zero_terms_query": "NONE",
"auto_generate_synonyms_phrase_query": true,
"boost": 1
}
}
}
],
"adjust_pure_negative": true,
"boost": 1
}
},
"path": "reviews",
"ignore_unmapped": false,
"score_mode": "avg",
"boost": 1,
"inner_hits": {
"name": "reviews",
"ignore_unmapped": false,
"from": 0,
"size": 3,
"version": false,
"seq_no_primary_term": false,
"explain": false,
"track_scores": false
}
}
}
],
"adjust_pure_negative": true,
"boost": 1
}
}
],
"should": [
{
"match": {
"dataset": {
"query": "QUERY_TEST",
"operator": "OR",
"prefix_length": 0,
"max_expansions": 50,
"fuzzy_transpositions": true,
"lenient": false,
"zero_terms_query": "NONE",
"auto_generate_synonyms_phrase_query": true,
"boost": 1
}
}
}
],
"adjust_pure_negative": true,
"minimum_should_match": "1",
"boost": 1
}
},
"sort": [
{
"_score": {
"order": "desc"
}
},
{
"reviews.openedAt": {
"order": "desc",
"nested": {
"path": "reviews"
}
}
}
]
}
The mapping I'm currently using:
"reviews": {
"type": "nested",
"properties": {
"id": {
"type": "keyword",
"copy_to": "fulltext"
},
"updatedAt": {
"type": "date",
"format": "strict_date_time",
"index": false
},
"openedAt": {
"type": "date",
"format": "strict_date_time"
}
I'm trying to sort the records based on a specific date in the reviews section. If a user inputs ASC, the returning values (reviews) should be in ascending order based on the openedAt date. I believe the sorting function isn't necessarily hitting the appropriate path. What should the sorting function look like?
I have a Java API that I created that calls the request and creates its own set of records:
public SearchResponse(SearchResponse response, SearchRequest searchRequest) {
this.facets = new ArrayList<>();
if (searchRequest == null || searchRequest.getRestricted().isEmpty()) {
this.records =
Stream.of(response.getHits().getHits()).map(SearchHit::getSourceAsMap).collect(Collectors.toList());
} else {
this.records = processRestrictedResults(response, searchRequest);
}
if (response.getAggregations() != null) {
for (Map.Entry<String, Aggregation> entry : response.getAggregations().getAsMap().entrySet()) {
this.facets.add(Facet.create(entry));
}
}
this.totalRecords = getTotalMatched(response);
}
To answer the original question, the top-level hits are indeed being sorted by the latest reviews.openedAt in the descending order — one of the reviews from doc#2 has the value 2021-04-06T08:13:53.552Z which is greater than the only reviews.openedAt from doc#1 (2021-03-30T08:13:53.552Z), thus #2 comes before #1.
What you're missing, though, is sorted inner_hits, as I explained here and here.
In your particular use case this would mean:
{
"from": 0,
"size": 10,
"timeout": "60s",
"query": {
"bool": {
"must": [
... // your original queries
{
"nested": {
"path": "reviews", <-- we need to enforce the nested context
"query": {
"match_all": {} <-- this could've been `"exists": { "field": "reviews.openedAt" }` too
},
"inner_hits": {
"sort": {
"reviews.openedAt": { <-- sorting the inner hits under the nested context
"order": "desc"
}
}
}
}
}
]
}
},
"sort": [
{
"_score": {
"order": "desc"
}
},
{
"reviews.openedAt": { <-- sorting the top-level hits, as you previously were
"order": "desc",
"nested": {
"path": "reviews"
}
}
}
]
}
When you run the above query, each top-level hit will include an inner_hits attribute containing the sorted reviews which you can then post-process in your java backend.

Average of difference between the dates

A snippet of my elasticsearch data is like below. Status field is nested.
status: [
{
"updated_at": "2020-08-04 17:18:41",
"created_at": "2020-08-04 17:18:39",
"sub_stage": "Stage1"
},
{
"updated_at": "2020-08-04 17:21:15",
"created_at": "2020-08-04 17:18:41",
"sub_stage": "Stage2"
},
{
"updated_at": "2020-08-04 17:21:15",
"created_at": "2020-08-04 17:21:07",
"sub_stage": "Stage3"
}
]
After aggregating based on some field, I have for each bucket some documents and every document will have status field. Now, what I want is to find the average of time difference between stage1 and stage3.
For ex: Suppose for id = 1 bucket consists of 100 documents. Then for each document I have to find the time difference between stage 1 and stage 3. Then, finally take the average of it.
I am able to perform till aggregation but stuck at finding average.
With some effort, I am using below script but have no idea whether it is correct :
Map findEvent(List events, String type) {
return events.find(it -> it.sub_stage == type);
}
return ChronoUnit.DAYS.between(Instant.parse(findEvent(params._source.events, 'Stage1').timestamp), Instant.parse(findEvent(params._source.events, 'Stage3').timestamp););
Is there any way I can perform this in Java with this script or any other script ?
Roughly, Query looks like:
{
"from": 0,
"size": 0,
"query": {
"bool": {
"must": [
{
"nested": {
"query": {
"bool": {
"should": [
{
"match": {
"status.sub_stage": {
"query": "Stage1",
"operator": "OR",
"prefix_length": 0,
"max_expansions": 50,
"fuzzy_transpositions": true,
"lenient": false,
"zero_terms_query": "NONE",
"auto_generate_synonyms_phrase_query": true,
"boost": 1.0
}
}
}
],
"adjust_pure_negative": true,
"minimum_should_match": "1",
"boost": 1.0
}
},
"path": "status",
"ignore_unmapped": false,
"score_mode": "none",
"boost": 1.0
}
}
],
"adjust_pure_negative": true,
"minimum_should_match": "1",
"boost": 1.0
}
},
"aggregations": {
"id": {
"terms": {
"field": "id.keyword",
"size": 1000,
"min_doc_count": 1,
"shard_min_doc_count": 0,
"show_term_doc_count_error": false,
"order": [
{
"_count": "desc"
},
{
"_key": "asc"
}
]
},
"aggregations": {
"avg time": {
"avg": {
"script": {
"source": "Map findStage(List events, String type) { return events.find(it -> it.sub_stage == type); } return ChronoUnit.DAYS.between(Instant.parse(findStage(ctx._source.status, 'Stage1').timestamp), Instant.parse(findStage(ctx._source.status, 'Stage3').timestamp));",
"lang": "painless"
}
}
}
}
}
}
}

Mongo db java driver query convert

I have the following data structure
[{
"id": "1c7bbebd-bc3d-4352-9ac0-98c01d13189d",
"version": 0,
"groups": [
{
"internalName": "Admin group",
"fields": [
{
"internalName": "Is verified",
"uiProperties": {
"isShow": true
}
},
{
"internalName": "Hide",
"uiProperties": {
"isHide": false
}
},
...
]
},
...
]
},
{
"id": "2b7bbebd-bc3d-4352-9ac0-98c01d13189d",
"version": 0,
"groups": [
{
"internalName": "User group",
"fields": [
{
"internalName": "Is verified",
"uiProperties": {
"isShow": true
}
},
{
"internalName": "Blocked",
"uiProperties": {
"isBlocked": true
}
},
...
]
},
...
]
},
...
]
Internal names of the fields can be repeated. I want to group by group.field.internalName and cut the array(for pagination) and get the output like:
{
"totalCount": 3,
"items": [
{
"internalName": "Blocked"
},
{
"internalName": "Hide"
},
{
"internalName": "Is verified"
}
]}
I wrote a query that works,
db.layouts.aggregate(
{
$unwind : "$groups"
},
{
$unwind : "$groups.fields"
},
{
$group: {
"_id" : {
"internalName" : "$groups.fields.internalName",
},
"internalName" : {
$first : "$groups.fields.internalName"
}
}
},
{
$group: {
"_id" : null,
"items" : {
$push : "$$ROOT"
},
"totalCount" : {
$sum : 1
}
}
},
{
$project: {
"items" : {
$slice : [ "$items", 0, 20 ]
},
"totalCount": 1
}
})
but I have the problem of translating it to java api. Notice that i need to use mongoTemplate approach. Here is what i have and where i'm struck
final List<AggregationOperation> aggregationOperations = new ArrayList<>();
aggregationOperations.add(unwind("groups"));
aggregationOperations.add(unwind("groups.fields"));
aggregationOperations.add(
group("groups.fields.internalName")
.first("groups.fields.internalName").as("internalName")
);
aggregationOperations.add(
group()
.push("$$ROOT").as("fields")
.sum("1").as("totalCount") // ERROR only string ref can be placed, but i need a number?
);
aggregationOperations.add(
project()
.andInclude("totalCount")
.and("fields").slice(size, page * size)
);
final Aggregation aggregation = newAggregation(aggregationOperations);
mongoTemplate.aggregate(aggregation, LAYOUTS, FieldLites.class).getMappedResults()
With this query i have the problem with sum(), because i can place only a String ref by api(but need a number) and with project operation - got an exception
java.lang.IllegalArgumentException: Invalid reference 'totalCount'!] with root cause
Can you help me with this query translation?
You can use count
group()
.push("$$ROOT").as("fields")
.count().as("totalCount")

How to build an Elasticsearch-Query with startsWith-functionality and special characters

I have JsonObjects that i search with Elasticsearch from a Java Application, using the Java API to build searchQueries. The objects contain a field called "such" that contains a searchString with which the JsonObject should be found, for example a simple searchString would be "STVBBM160A". Besides the usual characters a-Z 0-9 the searchString could also look like the following examples:
"STV-157ABR", "F-G/42-W3" or "DDM000.074.6652"
The search should return results already when only the first characters are put into a searchfield, which it does for a search like "F-G/42"
My Problem: The search sometimes doesn't return results at all, but when the last character is typed it finds the right document.
What i tried: First I wanted to use a WildcardQuery where the query would be "typedStuff*", but the WildcardQuery didn't return any results at all, as soon as I typed anything but * (It used to work for other searchFields with other values)
Now I am using a QueryStringQuery, which also takes the input and puts a * character to the end. By escaping the QueryString, I am able to search for Strings like "F-G/42" and so on, but the search for "DDM000.074.6652" doesn't return any results until elasticsearch has the whole String to search. Also, when i type "STV" all results with "STV-xxxxx" (containing the "-" after STV) are returned, but not the object with "STVBBM160A", again until the whole String is given for the search (without showing any results inbetween as soon as the searchString is "STVB")
This is the query I'm using right now:
{
"size": 1000,
"min_score": 1,
"query": {
"bool": {
"must": [
{
"query_string": {
"query": "MY_DATA_TYPE",
"fields": [
"doc.db_doc_type"
]
}
},
{
"query_string": {
"query": "MY_SPECIFIC_TYPE",
"fields": [
"doc.db_doc_specific"
]
}
}
],
"should": {
"query_string": {
"query": "STV*",
"fields": [
"doc.such"
],
"boost": 3,
"escape": true
}
}
}
}
}
This is the old Query with the WildCardQuery, which doesn't return any results at all unless there is no queryString but *:
{
"size": 50,
"min_score": 1,
"query": {
"bool": {
"must": [
{
"query_string": {
"query": "MY_DATA_TYPE",
"fields": [
"doc.db_doc_type"
]
}
},
{
"query_string": {
"query": "MY_SPECIFIC_TYPE",
"fields": [
"doc.db_doc_specific"
]
}
}
],
"should": {
"wildcard": {
"doc.such": {
"wildcard": "STV*",
"boost": 3
}
}
}
}
}
}
When using a PrefixQuery, the search also doesn't return any results at all (with and without the *):
{
"size": 50,
"min_score": 1,
"query": {
"bool": {
"must": [
{
"query_string": {
"query": "MY_DATA_TYPE",
"fields": [
"doc.db_doc_type"
]
}
},
{
"query_string": {
"query": "MY_SPECIFIC_TYPE",
"fields": [
"doc.db_doc_specific"
]
}
}
],
"should": {
"prefix": {
"doc.such": {
"prefix": "HSTKV*",
"boost": 3
}
}
}
}
}
}
How can this query be changed to achieve the goal of getting all results starting with the specified String, no matter if the field doc.such also contains Numbers or special chars like "_" or "." or "/" ?
Thanks in advance
As soon as you want to query prefixes, suffixes or substring in a serious way, you need to leverage nGrams. In your case, since you're only after prefixes, an edgeNGram tokenizer would be in order. You need to change the settings of your index to be like this one:
PUT your_index
{
"settings": {
"analysis": {
"analyzer": {
"prefix_analyzer": {
"tokenizer": "prefix_tokenizer",
"filter": [
"lowercase"
]
},
"search_prefix_analyzer": {
"tokenizer": "keyword",
"filter": [
"lowercase"
]
}
},
"tokenizer": {
"prefix_tokenizer": {
"type": "edgeNGram",
"min_gram": "1",
"max_gram": "25"
}
}
}
},
"mappings": {
"your_type": {
"properties": {
"doc": {
"properties": {
"such": {
"type": "string",
"fields": {
"starts_with": {
"type": "string",
"analyzer": "prefix_analyzer",
"search_analyzer": "search_prefix_analyzer"
}
}
}
}
}
}
}
}
}
What will happen with this analyzer is that when indexing F-G/42-W3 the following tokens will be indexed: f, f-, f-g, f-g/, f-g/4, f-g/42, f-g/42-, f-g/42-w, f-g/42-w3.
At search time, we'll simply lowercase the user input and the prefix will be matched against the indexed tokens.
Then your query can simply be transformed to a match query:
{
"size": 1000,
"min_score": 1,
"query": {
"bool": {
"must": [
{
"query_string": {
"query": "MY_DATA_TYPE",
"fields": [
"doc.db_doc_type"
]
}
},
{
"query_string": {
"query": "MY_SPECIFIC_TYPE",
"fields": [
"doc.db_doc_specific"
]
}
}
],
"should": {
"match": {
"doc.such": {
"query": "F-G/4"
}
}
}
}
}
}

Categories

Resources