1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
3 # This file is part of Koha.
5 # Copyright 2014 Catalyst IT Ltd.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
23 query objects from user-supplied queries
27 This provides the functions that take a user-supplied search query, and
28 provides something that can be given to elasticsearch to get answers.
32 use Koha::SearchEngine::Elasticsearch::QueryBuilder;
33 my $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
34 my $simple_query = $builder->build_query("hello");
35 # This is currently undocumented because the original code is undocumented
36 my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
42 use base qw(Koha::SearchEngine::Elasticsearch);
45 use List::MoreUtils qw/ each_array /;
54 my $simple_query = $builder->build_query("hello", %options)
56 This will build a query that can be issued to elasticsearch from the provided
57 string input. This expects a lucene style search form (see
58 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
61 It'll make an attempt to respect the various query options.
63 Additional options can be provided with the C<%options> hash.
69 This should be an arrayref of hashrefs, each containing a C<field> and an
70 C<direction> (optional, defaults to C<asc>.) The results will be sorted
71 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
# Unpack the invocant, the lucene-style query string and the named options.
# The visible code reads $options{fields}, $options{sort} and
# $options{expanded_facet}.
78 my ( $self, $query, %options ) = @_;
# System preferences that tune the search.
# NOTE(review): of these four, only $fuzzy_enabled is referenced again in the
# lines visible here -- confirm whether the other three are still needed.
80 my $stemming = C4::Context->preference("QueryStemming") || 0;
81 my $auto_truncation = C4::Context->preference("QueryAutoTruncate") || 0;
82 my $weight_fields = C4::Context->preference("QueryWeightFields") || 0;
83 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
# An undefined query is treated as "match everything".
85 $query = '*' unless defined $query;
91 fuzziness => $fuzzy_enabled ? 'auto' : '0',
92 default_operator => 'AND',
93 default_field => '_all',
94 lenient => JSON::true,
95 fields => $options{fields} || [],
# Translate each { field, direction } sort request into a sort clause on the
# result structure.
99 if ( $options{sort} ) {
100 foreach my $sort ( @{ $options{sort} } ) {
101 my ( $f, $d ) = @$sort{qw/ field direction /};
# Only 'asc' and 'desc' are accepted; a missing (or false) direction
# defaults to 'asc'.
102 die "Invalid sort direction, $d"
103 if $d && ( $d ne 'asc' && $d ne 'desc' );
104 $d = 'asc' unless $d;
# Resolve the name of the field that should actually be sorted on.
106 $f = $self->_sort_field($f);
107 push @{ $res->{sort} }, { $f => { order => $d } };
111 # See _convert_facets in Search.pm for how these get turned into
112 # things that Koha can use.
113 $res->{aggregations} = {
114 author => { terms => { field => "author__facet" } },
115 subject => { terms => { field => "subject__facet" } },
116 itype => { terms => { field => "itype__facet" } },
117 location => { terms => { field => "location__facet" } },
118 'su-geo' => { terms => { field => "su-geo__facet" } },
119 se => { terms => { field => "se__facet" } },
120 ccode => { terms => { field => "ccode__facet" } },
# Library facets are optional: the DisplayLibraryFacets preference selects
# the home branch ('home'), the holding branch ('holding') or both ('both').
# NOTE(review): the 'eq' comparisons below assume the preference is defined;
# an undefined value would emit "uninitialized" warnings -- confirm.
123 my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
124 if ( $display_library_facets eq 'both'
125 or $display_library_facets eq 'home' ) {
126 $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet" } };
128 if ( $display_library_facets eq 'both'
129 or $display_library_facets eq 'holding' ) {
130 $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet" } };
# When one facet is expanded, set its terms size from the FacetMaxCount
# preference. The key is created if it is not already an aggregation.
132 if ( my $ef = $options{expanded_facet} ) {
133 $res->{aggregations}{$ef}{terms}{size} = C4::Context->preference('FacetMaxCount');
138 =head2 build_browse_query
140 my $browse_query = $builder->build_browse_query($field, $query);
142 This performs a "starts with" style query on a particular field. The field
143 to be searched must have been indexed with an appropriate mapping as a
144 "phrase" subfield, which pretty much everything has.
148 # XXX this isn't really a browse query like we want in the end
# Builds a prefix-match ("starts with") query for one field, sorted ascending
# on that field's sort name. Takes the field name and the user's query string.
149 sub build_browse_query {
150 my ( $self, $field, $query ) = @_;
# QueryFuzzy decides whether fuzziness is 'auto' or '0' further down.
152 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
# Without a query there is nothing to prefix-match; return a wildcard instead.
154 return { query => '*' } if !defined $query;
156 # TODO this should come from Koha::SearchEngine::Elasticsearch
157 my %field_whitelist = (
# Any field that is not whitelisted silently falls back to 'title'.
161 $field = 'title' if !exists $field_whitelist{$field};
162 my $sort = $self->_sort_field($field);
165 match_phrase_prefix => {
169 fuzziness => $fuzzy_enabled ? 'auto' : '0',
173 sort => [ { $sort => { order => "asc" } } ],
177 =head2 build_query_compat
180 $error, $query, $simple_query, $query_cgi,
181 $query_desc, $limit, $limit_cgi, $limit_desc,
182 $stopwords_removed, $query_type
184 = $builder->build_query_compat( \@operators, \@operands, \@indexes,
185 \@limits, \@sort_by, $scan, $lang );
187 This handles a search using the same api as L<C4::Search::buildQuery> does.
189 A very simple query will go in with C<$operands> set to ['query'], and
190 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
191 C<$query> set to something that can perform the search, C<$simple_query>
192 set to just the search term, C<$query_cgi> set to something that can
193 reproduce this search, and C<$query_desc> set to something else.
# Compatibility entry point: takes the zebra-style argument lists (operators,
# operands, indexes, limits, sort_by, ...) and returns a ten-element list that
# starts with an undef error slot followed by the elasticsearch query.
197 sub build_query_compat {
198 my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
202 #die Dumper ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan, $lang );
# Convert the zebra-style sort and index names, and rewrite the special-case
# limits into plain field:value strings.
203 my @sort_params = $self->_convert_sort_fields(@$sort_by);
204 my @index_params = $self->_convert_index_fields(@$indexes);
205 my $limits = $self->_fix_limit_special_cases($orig_limits);
# NOTE(review): $params is not declared in the lines visible here; presumably
# it is the last element of the argument list above -- confirm.
206 if ( $params->{suppress} ) { push @$limits, "suppress:0"; }
208 # Merge the indexes in with the search terms and the operands so that
209 # each search thing is a handy unit.
# NOTE(review): this unshift modifies the caller's @$operators array in place.
210 unshift @$operators, undef; # The first one can't have an op
# NOTE(review): _convert_index_fields drops entries it cannot convert, so
# @index_params may be shorter than @$operands and the positions may no longer
# line up -- confirm.
212 my $ea = each_array( @$operands, @$operators, @index_params );
213 while ( my ( $oand, $otor, $index ) = $ea->() ) {
# Skip empty operands.
214 next if ( !defined($oand) || $oand eq '' );
215 push @search_params, {
216 operand => $self->_clean_search_term($oand), # the search terms
217 operator => defined($otor) ? uc $otor : undef, # AND and so on
218 $index ? %$index : (),
222 # We build a string query from limits and the queries. An alternative
223 # would be to pass them separately into build_query and let it build
224 # them into a structured ES query itself. Maybe later, though that'd be
# The '|| ()' idiom drops an empty half so that join does not emit it.
226 my $query_str = join( ' AND ',
227 join( ' ', $self->_create_query_string(@search_params) ) || (),
228 $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
# When requested, search the weighted fields, each written as "name^weight".
231 if ( defined($params->{weighted_fields}) && $params->{weighted_fields} ) {
232 push @fields, sprintf("%s^%s", $_->name, $_->weight) for Koha::SearchFields->weighted_fields;
235 # If there's no query on the left, let's remove the junk left behind
236 $query_str =~ s/^ AND //;
# Hand the assembled string and options to build_query.
238 $options{fields} = \@fields;
239 $options{sort} = \@sort_params;
240 $options{expanded_facet} = $params->{expanded_facet};
241 my $query = $self->build_query( $query_str, %options );
244 # We roughly emulate the CGI parameters of the zebra query builder
# Only the first operand is put into the 'q=' parameter.
246 $query_cgi = 'q=' . uri_escape_utf8( $operands->[0] ) if @$operands;
# A "simple query" exists only when exactly one operand was supplied.
248 $simple_query = $operands->[0] if @$operands == 1;
249 my $query_desc = $simple_query;
250 my $limit = $self->_join_queries( $self->_convert_index_strings(@$limits));
# The CGI form of the limits is built from the caller's original limits.
251 my $limit_cgi = ( $orig_limits and @$orig_limits )
252 ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
255 $limit_desc = "$limit" if $limit;
257 undef, $query, $simple_query, $query_cgi, $query_desc,
258 $limit, $limit_cgi, $limit_desc, undef, undef
262 =head2 build_authorities_query
264 my $query = $builder->build_authorities_query(\%search);
266 This takes a nice description of an authority search and turns it into a black-box
267 query that can then be passed to the appropriate searcher.
269 The search description is a hashref that looks something like:
274 where => 'Heading', # search the main entry
275 operator => 'exact', # require an exact match
276 value => 'frogs', # the search string
279 where => '', # search all entries
280 operator => '', # default keyword, right truncation
288 authtypecode => 'TOPIC_TERM',
# Turns a hashref describing an authority search into a query structure.
# $search->{searches} is a list of { where, operator, value } hashrefs and
# $search->{sort} is an optional hashref keyed by field name.
293 sub build_authorities_query {
294 my ( $self, $search ) = @_;
296 # Start by making the query parts
299 foreach my $s ( @{ $search->{searches} } ) {
300 my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
# An empty 'where' means "search everything".
# NOTE(review): the 'eq' tests on $wh and $op assume both are defined; an
# undefined value would emit "uninitialized" warnings -- confirm callers.
301 $wh = '_all' if $wh eq '';
302 if ( $op eq 'is' || $op eq '=' || $op eq 'exact' ) {
304 # look for something that matches a term completely
305 # note, '=' is about numerical vals. May need special handling.
306 # Also, we lowercase our search because the ES
307 # index lowercases its values, and term searches don't get the
308 # search analyzer applied to them.
309 push @query_parts, { match_phrase => {"$wh.phrase" => lc $val} };
311 elsif ( $op eq 'start' ) {
312 # startswith search, uses lowercase untokenized version of heading
313 push @query_parts, { match_phrase_prefix => {"$wh.phrase" => lc $val} };
316 # regular wordlist stuff
317 # push @query_parts, { match => {$wh => { query => $val, operator => 'and' }} };
# Each whitespace-separated word becomes its own lowercased wildcard clause,
# with '*' on both sides of the word.
318 my @values = split(' ',$val);
319 foreach my $v (@values) {
320 push @query_parts, { wildcard => { "$wh.phrase" => "*" . lc $v . "*" } };
325 # Merge the query parts appropriately
326 # 'should' behaves like 'or'
327 # 'must' behaves like 'and'
328 # Zebra results seem to match must so using that here
329 my $query = { query =>
331 { must => \@query_parts }
# Rewrite the sort keys through _sort_field.
# NOTE(review): this replaces $search->{sort} in the caller's hashref.
336 if ( exists $search->{sort} ) {
337 foreach my $k ( keys %{ $search->{sort} } ) {
338 my $f = $self->_sort_field($k);
339 $s{$f} = $search->{sort}{$k};
341 $search->{sort} = \%s;
345 $query->{sort} = [ $search->{sort} ] if exists $search->{sort};
351 =head2 build_authorities_query_compat
354 $builder->build_authorities_query_compat( \@marclist, \@and_or,
355 \@excluding, \@operator, \@value, $authtypecode, $orderby );
357 This builds a query for searching for authorities, in the style of
358 L<C4::AuthoritiesMarc::SearchAuthorities>.
366 An arrayref containing where the particular term should be searched for.
367 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
368 thesaurus. If left blank, any field is used.
372 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
380 What form of search to do. Options are: is (phrase, no truncation, whole field
381 must match), = (number exact match), exact (phrase, no truncation, whole field
382 must match), start (phrase prefix match). If left blank, then word list, right truncated, anywhere is used.
386 The actual user-provided string value to search for.
390 The authority type code to search within. If blank, then all will be searched.
394 The order to sort the results by. Options are Relevance, HeadingAsc,
395 HeadingDsc, AuthidAsc, AuthidDsc.
399 marclist, operator, and value must be the same length, and the values at
400 index /i/ all relate to each other.
402 This returns a query, which is a black box object that can be passed to the
403 appropriate search object.
# Package-level lookup from the legacy "marclist" names to the index names
# used as the 'where' of an authority search. build_authorities_query_compat
# rejects any name that has no key here.
407 our $koha_to_index_name = {
408 mainmainentry => 'Heading-Main',
409 mainentry => 'Heading',
411 'match-heading' => 'Match-heading',
412 'see-from' => 'Match-heading-see-from',
413 thesaurus => 'Subject-heading-thesaurus',
# Adapts the old positional authority-search arguments into the hash form
# taken by build_authorities_query, then delegates to it.
# Throws Koha::Exceptions::WrongParameter for an unknown marclist entry.
418 sub build_authorities_query_compat {
419 my ( $self, $marclist, $and_or, $excluding, $operator, $value,
420 $authtypecode, $orderby )
423 # This turns the old-style many-options argument form into a more
424 # extensible hash form that is understood by L<build_authorities_query>.
427 # Make sure everything exists
428 foreach my $m (@$marclist) {
429 Koha::Exceptions::WrongParameter->throw("Invalid marclist field provided: $m")
430 unless exists $koha_to_index_name->{$m};
# Build one search entry per non-empty value. The marclist name and the
# operator are taken from the same index position as the value.
# NOTE(review): the truthiness test also skips a value of '0' -- confirm that
# is acceptable.
432 for ( my $i = 0 ; $i < @$value ; $i++ ) {
433 next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
436 where => $koha_to_index_name->{$marclist->[$i]},
437 operator => $operator->[$i],
438 value => $value->[$i],
# The sort field comes from the prefix of $orderby: 'Heading...' sorts on
# 'Heading' and 'Auth...' sorts on 'Local-number'.
# NOTE(review): the pattern matches assume $orderby is defined -- confirm.
444 ( $orderby =~ /^Heading/ ) ? 'Heading'
445 : ( $orderby =~ /^Auth/ ) ? 'Local-number'
# Any $orderby that does not end in 'Asc' sorts descending.
448 my $sort_order = ( $orderby =~ /Asc$/ ) ? 'asc' : 'desc';
449 %sort = ( $sort_field => $sort_order, );
452 searches => \@searches,
453 authtypecode => $authtypecode,
# Only pass a sort on when one was worked out above.
455 $search{sort} = \%sort if %sort;
456 my $query = $self->build_authorities_query( \%search );
460 =head2 _convert_sort_fields
462 my @sort_params = _convert_sort_fields(@sort_by)
464 Converts the zebra-style sort index information into elasticsearch-style.
466 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
467 something that can be sent to L<build_query>.
# Converts zebra-style sort strings of the form "<field>_<order>" into
# hashes with 'field' and 'direction' keys. Entries whose field does not
# convert to a true value are dropped.
471 sub _convert_sort_fields {
472 my ( $self, @sort_by ) = @_;
474 # Turn the sorting into something we care about.
475 my %sort_field_convert = (
476 acqdate => 'acqdate',
478 call_number => 'callnum',
479 popularity => 'issues',
480 relevance => undef, # default
482 pubdate => 'pubdate',
# Several spellings of each direction are accepted.
484 my %sort_order_convert =
485 ( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
487 # Convert the fields and orders, drop anything we don't know about.
# The first (.+) is greedy, so the split happens at the LAST underscore:
# "call_number_asc" gives field "call_number" and order "asc".
# An unrecognised order yields an undef direction.
488 grep { $_->{field} } map {
489 my ( $f, $d ) = /(.+)_(.+)/;
491 field => $sort_field_convert{$f},
492 direction => $sort_order_convert{$d}
497 =head2 _convert_index_fields
499 my @index_params = $self->_convert_index_fields(@indexes);
501 Converts zebra-style search index notation into elasticsearch-style.
503 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
504 and it returns something that can be sent to L<build_query>.
506 B<TODO>: this will pull from the elasticsearch mappings table to figure out
# Package-level table mapping zebra index names to the field names used in
# the generated queries. It is read by _convert_index_fields and
# _convert_index_strings_freeform.
511 our %index_field_convert = (
517 'se' => 'title-series',
518 'callnum' => 'callnum',
521 'branch' => 'homebranch',
525 'hi' => 'Host-Item-Number',
# Converts zebra-style index specifiers ("field" or "field,type") into
# hashes with 'field' and 'type' keys. Specifiers whose field has no
# conversion are dropped from the result.
528 sub _convert_index_fields {
529 my ( $self, @indexes ) = @_;
# Maps the zebra type suffix to a type name. A missing suffix uses the
# '__default' entry (undef), and an unknown suffix also yields undef.
531 my %index_type_convert =
532 ( __default => undef, phr => 'phrase', rtrn => 'right-truncate' );
534 # Convert according to our table, drop anything that doesn't convert.
535 # If a field starts with mc- we save it as it's used (and removed) later
536 # when joining things, to indicate we make it an 'OR' join.
537 # (Sorry, this got a bit ugly after special cases were found.)
538 grep { $_->{field} } map {
# Split "field,type" on the comma; $t is undef when there is no comma.
539 my ( $f, $t ) = split /,/;
546 field => $index_field_convert{$f},
547 type => $index_type_convert{ $t // '__default' }
# Put the saved 'mc-' marker back in front of the converted field name.
549 $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
554 =head2 _convert_index_strings
556 my @searches = $self->_convert_index_strings(@searches);
558 Similar to L<_convert_index_fields>, this takes strings of the form
559 B<field:search term> and rewrites the field from zebra-style to
560 elasticsearch-style. Anything it doesn't understand is returned verbatim.
# Rewrites "field:term" strings so that the field uses the converted name and
# the term is adjusted for the index type (see _modify_string_by_type).
564 sub _convert_index_strings {
565 my ( $self, @searches ) = @_;
567 foreach my $s (@searches) {
# Split at the first colon: the field may contain word characters, commas
# and hyphens, and the term is everything after the colon.
569 my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
# Branch for strings that are not of the "field:term" form.
570 unless ( defined($field) && defined($term) ) {
# Only the first converted entry is used.
574 my ($conv) = $self->_convert_index_fields($field);
# Branch for fields that have no conversion.
575 unless ( defined($conv) ) {
579 push @res, $conv->{field} . ":"
580 . $self->_modify_string_by_type( %$conv, operand => $term );
585 =head2 _convert_index_strings_freeform
587 my $search = $self->_convert_index_strings_freeform($search);
589 This is similar to L<_convert_index_strings>, however it'll search out the
590 things to change within the string. So it can handle strings such as
591 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
593 If there is something of the form "su,complete-subfield" or something, the
594 second part is stripped off as we can't yet handle that. Making it work
595 will have to wait for a real query parser.
# Rewrites every zebra-style "index:" prefix found anywhere inside $search
# to its converted field name, using the %index_field_convert table.
599 sub _convert_index_strings_freeform {
600 my ( $self, $search ) = @_;
# An optional ",qualifier" after the index name is matched and discarded.
# NOTE(review): $zeb is interpolated into the pattern without \Q...\E. The
# keys visible in this listing are plain words, but confirm that no key
# contains regex metacharacters.
601 while ( my ( $zeb, $es ) = each %index_field_convert ) {
602 $search =~ s/\b$zeb(?:,[\w\-]*)?:/$es:/g;
607 =head2 _modify_string_by_type
609 my $str = $self->_modify_string_by_type(%index_field);
611 If you have a search term (operand) and a type (phrase, right-truncated), this
612 will convert the string to have the function in lucene search terms, e.g.
613 wrapping quotes around it.
# Adjusts a search term according to its index type: 'right-truncate' appends
# a '*' and 'phrase' wraps the term in double quotes. Takes a hash with
# 'operand' (the term) and an optional 'type'.
617 sub _modify_string_by_type {
618 my ( $self, %idx ) = @_;
# A missing type is treated as the empty string, so neither rule below fires.
620 my $type = $idx{type} || '';
621 my $str = $idx{operand};
# Any false operand (undef, '' and also '0') is returned unchanged.
622 return $str unless $str; # Empty or undef, we can't use it.
624 $str .= '*' if $type eq 'right-truncate';
625 $str = '"' . $str . '"' if $type eq 'phrase';
631 my $query_str = $self->_join_queries(@query_parts);
633 This takes a list of query parts, that might be search terms on their own, or
634 booleaned together, or specifying fields, or whatever, wraps them in
635 parentheses, and ANDs them all together. Suitable for feeding to the ES
638 Note: doesn't AND them together if they specify an index that starts with "mc"
639 as that was a special case in the original code for dealing with multiple
640 choice options (you can't search for something that has an itype of A and
641 an itype of B otherwise.)
# Joins query parts into one string. Undefined and empty parts are ignored.
# Parts prefixed with 'mc-' are collected separately, have the prefix
# stripped, and are joined with OR inside one parenthesised group. The
# remaining parts are each parenthesised and joined with AND.
646 my ( $self, @parts ) = @_;
648 my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
650 map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
# Nothing usable was supplied.
651 return () unless @norm_parts + @mc_parts;
# A single part is returned as it is, without parentheses.
652 return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
654 @mc_parts ? '(' . ( join ' OR ', map { "($_)" } @mc_parts ) . ')' : ();
656 # Handy trick: $x || () inside a join means that if $x ends up as an
657 # empty string, it gets replaced with (), which makes join ignore it.
658 # (bad effect: this'll also happen to '0', this hopefully doesn't matter
661 join( ' AND ', map { "($_)" } @norm_parts ) || (),
667 my @phrased_queries = $self->_make_phrases(@query_parts);
669 This takes the supplied queries and forces them to be phrases by wrapping
670 quotes around them. It understands field prefixes, e.g. 'subject:', and puts
671 the quotes after the prefix (around the search term only) if one is there.
# Returns a modified copy of each part (the /r flag leaves the input alone):
# everything after a leading "field:" prefix is wrapped in double quotes.
# NOTE(review): the pattern requires a colon, so a part without a "field:"
# prefix is returned unchanged and unquoted -- confirm that is intended.
676 my ( $self, @parts ) = @_;
677 map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
680 =head2 _create_query_string
682 my @query_strings = $self->_create_query_string(@queries);
684 Given a list of hashrefs, it will turn them into a lucene-style query string.
685 The hash should contain field, type (both for the indexes), operator, and
# Turns each query hashref into a lucene-style fragment of the form
# "OPERATOR (field:operand)".
690 sub _create_query_string {
691 my ( $self, @queries ) = @_;
# The operator (with a trailing space) and the "field:" prefix are each
# left out when the hash has no true value for them.
694 my $otor = $_->{operator} ? $_->{operator} . ' ' : '';
695 my $field = $_->{field} ? $_->{field} . ':' : '';
# Apply the quoting or truncation that the index type calls for.
697 my $oand = $self->_modify_string_by_type(%$_);
698 "$otor($field$oand)";
702 =head2 _clean_search_term
704 my $term = $self->_clean_search_term($term);
706 This cleans a search term by removing any funny characters that may upset
707 ES and give us an error. It also calls L<_convert_index_strings_freeform>
708 to ensure those parts are correct.
# Normalises one user-supplied search term before it goes into a query string.
712 sub _clean_search_term {
713 my ( $self, $term ) = @_;
# QueryAutoTruncate decides whether '*' is appended to the terms below.
715 my $auto_truncation = C4::Context->preference("QueryAutoTruncate") || 0;
717 # Some hardcoded searches (like with authorities) produce things like
718 # 'an=123', when it ought to be 'an:123' for our purposes.
# Rewrite any zebra-style "index:" prefixes inside the term.
720 $term = $self->_convert_index_strings_freeform($term);
722 $term = $self->_truncate_terms($term) if ($auto_truncation);
726 =head2 _fix_limit_special_cases
728 my $limits = $self->_fix_limit_special_cases($limits);
730 This converts any special cases that the limit specifications have into things
731 that are more readily processable by the rest of the code.
733 The argument should be an arrayref, and it'll return an arrayref.
# Rewrites the special-case limit strings into plain "field:value" limits.
# Takes an arrayref of limit strings.
737 sub _fix_limit_special_cases {
738 my ( $self, $limits ) = @_;
741 foreach my $l (@$limits) {
743 # This is set up by opac-search.pl
# Year range "ge=START and ... le=END" becomes "copydate:[START TO END]".
# A range that does not match the full pattern is skipped.
744 if ( $l =~ /^yr,st-numeric,ge=/ ) {
745 my ( $start, $end ) =
746 ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
747 next unless defined($start) && defined($end);
748 push @new_lim, "copydate:[$start TO $end]";
# A single year becomes "copydate:YEAR".
750 elsif ( $l =~ /^yr,st-numeric=/ ) {
751 my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
752 next unless defined($date);
753 push @new_lim, "copydate:$date";
# The "available" limit becomes "onloan:0".
755 elsif ( $l =~ /^available$/ ) {
756 push @new_lim, 'onloan:0';
767 my $field = $self->_sort_field($field);
769 Given a field name, this works out what the actual name of the field to sort
770 on should be. A '__sort' suffix is added for fields with a sort version, and
771 for text fields either '.phrase' (for sortable versions) or '.raw' is appended
772 to avoid sorting on a tokenized value.
# Decide which suffix the field name needs, based on its mapping type and on
# whether it has a sort version.
779 my $mappings = $self->get_elasticsearch_mappings();
# True when the mapping declares the field's type as 'text'.
# NOTE(review): this lookup autovivifies an empty entry for $f under
# {data}{properties} when none exists -- confirm that is harmless for the
# structure returned by get_elasticsearch_mappings.
780 my $textField = defined $mappings->{data}{properties}{$f}{type} && $mappings->{data}{properties}{$f}{type} eq 'text';
# This branch is taken when sort_fields() has no entry for the field, or has
# a true one.
781 if (!defined $self->sort_fields()->{$f} || $self->sort_fields()->{$f}) {
783 # We need to add '.phrase' to text fields, otherwise it'll sort
784 # based on the tokenised form.
785 $f .= '.phrase' if $textField;
787 # We need to add '.raw' to text fields without a sort field,
788 # otherwise it'll sort based on the tokenised form.
789 $f .= '.raw' if $textField;
794 =head2 _truncate_terms
796 my $query = $self->_truncate_terms($query);
798 Given a string query, this function appends the '*' wildcard to all terms except
799 boolean operators (and, or, not), double quoted strings and terms that already end in '*'.
# Appends '*' to each bare word of the query string and returns the words
# joined with single spaces.
803 sub _truncate_terms {
804 my ( $self, $query ) = @_;
# The capturing group in the split pattern keeps the separators (quoted
# phrases, with an optional field prefix, and whitespace runs) in the list.
806 # '"donald duck" title:"the mouse" and peter' gets split into
807 # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'peter']
808 my @tokens = split /((?:[\w\-.]+:)?"[^"]+"|\s+)/, $query;
810 # Filter out empty tokens
811 my @words = grep { $_ !~ /^\s*$/ } @tokens;
813 # Append '*' to words if needed, ie. if it's not surrounded by quotes, not
814 # terminated by '*' and not a keyword
# The keywords and/or/not are compared case-insensitively.
817 (/"$/ or /\*$/ or grep {lc($w) eq $_} qw/and or not/) ? $_ : "$_*";
820 return join ' ', @terms;