Koha/SearchEngine/Elasticsearch/QueryBuilder.pm

   1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
   2
   3 # This file is part of Koha.
   4 #
   5 # Copyright 2014 Catalyst IT Ltd.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it
   8 # under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  19
  20 =head1 NAME
  21
  22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
  23 query objects from user-supplied queries
  24
  25 =head1 DESCRIPTION
  26
  27 This provides the functions that take a user-supplied search query, and
  28 provides something that can be given to elasticsearch to get answers.
  29
  30 =head1 SYNOPSIS
  31
  32     use Koha::SearchEngine::Elasticsearch::QueryBuilder;
    $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
  34     my $simple_query = $builder->build_query("hello");
  35     # This is currently undocumented because the original code is undocumented
  36     my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
  37
  38 =head1 METHODS
  39
  40 =cut
  41
  42 use base qw(Koha::SearchEngine::Elasticsearch);
  43 use Carp;
  44 use JSON;
  45 use List::MoreUtils qw/ each_array /;
  46 use Modern::Perl;
  47 use URI::Escape;
  48
  49 use C4::Context;
  50 use Koha::Exceptions;
  51 use Koha::Caches;
  52
  53 =head2 build_query
  54
  55     my $simple_query = $builder->build_query("hello", %options)
  56
  57 This will build a query that can be issued to elasticsearch from the provided
  58 string input. This expects a lucene style search form (see
  59 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
  60 for details.)
  61
  62 It'll make an attempt to respect the various query options.
  63
  64 Additional options can be provided with the C<%options> hash.
  65
  66 =over 4
  67
  68 =item sort
  69
  70 This should be an arrayref of hashrefs, each containing a C<field> and an
  71 C<direction> (optional, defaults to C<asc>.) The results will be sorted
  72 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
  73
  74 =back
  75
  76 =cut
  77
  78 sub build_query {
  79     my ( $self, $query, %options ) = @_;
  80
  81     my $stemming         = C4::Context->preference("QueryStemming")        || 0;
  82     my $auto_truncation  = C4::Context->preference("QueryAutoTruncate")    || 0;
  83     my $fuzzy_enabled    = C4::Context->preference("QueryFuzzy")           || 0;
  84
  85     $query = '*' unless defined $query;
  86
  87     my $res;
  88     my $fields = $self->_search_fields({
  89         is_opac => $options{is_opac},
  90         weighted_fields => $options{weighted_fields},
  91     });
  92     if ($options{whole_record}) {
  93         push @$fields, 'marc_data_array.*';
  94     }
  95     $res->{query} = {
  96         query_string => {
  97             query            => $query,
  98             fuzziness        => $fuzzy_enabled ? 'auto' : '0',
  99             default_operator => 'AND',
 100             fields           => $fields,
 101             lenient          => JSON::true,
 102             analyze_wildcard => JSON::true,
 103         }
 104     };
 105
 106     if ( $options{sort} ) {
 107         foreach my $sort ( @{ $options{sort} } ) {
 108             my ( $f, $d ) = @$sort{qw/ field direction /};
 109             die "Invalid sort direction, $d"
 110               if $d && ( $d ne 'asc' && $d ne 'desc' );
 111             $d = 'asc' unless $d;
 112
 113             $f = $self->_sort_field($f);
 114             push @{ $res->{sort} }, { $f => { order => $d } };
 115         }
 116     }
 117
 118     # See _convert_facets in Search.pm for how these get turned into
 119     # things that Koha can use.
 120     my $size = C4::Context->preference('FacetMaxCount');
 121     $res->{aggregations} = {
 122         author         => { terms => { field => "author__facet" , size => $size } },
 123         subject        => { terms => { field => "subject__facet", size => $size } },
 124         itype          => { terms => { field => "itype__facet", size => $size} },
 125         location       => { terms => { field => "location__facet", size => $size } },
 126         'su-geo'       => { terms => { field => "su-geo__facet", size => $size} },
 127         'title-series' => { terms => { field => "title-series__facet", size => $size } },
 128         ccode          => { terms => { field => "ccode__facet", size => $size } },
 129         ln             => { terms => { field => "ln__facet", size => $size } },
 130     };
 131
 132     my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
 133     if (   $display_library_facets eq 'both'
 134         or $display_library_facets eq 'home' ) {
 135         $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet", size => $size } };
 136     }
 137     if (   $display_library_facets eq 'both'
 138         or $display_library_facets eq 'holding' ) {
 139         $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet", size => $size } };
 140     }
 141     return $res;
 142 }
 143
 144 =head2 build_query_compat
 145
 146     my (
 147         $error,             $query, $simple_query, $query_cgi,
 148         $query_desc,        $limit, $limit_cgi,    $limit_desc,
 149         $stopwords_removed, $query_type
 150       )
 151       = $builder->build_query_compat( \@operators, \@operands, \@indexes,
 152         \@limits, \@sort_by, $scan, $lang, $params );
 153
 154 This handles a search using the same api as L<C4::Search::buildQuery> does.
 155
 156 A very simple query will go in with C<$operands> set to ['query'], and
 157 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
 158 C<$query> set to something that can perform the search, C<$simple_query>
 159 set to just the search term, C<$query_cgi> set to something that can
reproduce this search, and C<$query_desc> set to a description of the query
(the search term itself when only a single operand was given).
 161
 162 =cut
 163
 164 sub build_query_compat {
 165     my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
 166         $lang, $params )
 167       = @_;
 168
 169     my $query;
 170     my $query_str = '';
 171     my $search_param_query_str = '';
 172     my $limits = ();
 173     if ( $scan ) {
 174         ($query, $query_str) = $self->_build_scan_query( $operands, $indexes );
 175         $search_param_query_str = $query_str;
 176     } else {
 177         my @sort_params  = $self->_convert_sort_fields(@$sort_by);
 178         my @index_params = $self->_convert_index_fields(@$indexes);
 179         $limits       = $self->_fix_limit_special_cases($orig_limits);
 180         if ( $params->{suppress} ) { push @$limits, "suppress:false"; }
 181         # Merge the indexes in with the search terms and the operands so that
 182         # each search thing is a handy unit.
 183         unshift @$operators, undef;    # The first one can't have an op
 184         my @search_params;
 185         my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
 186         my $ea = each_array( @$operands, @$operators, @index_params );
 187         while ( my ( $oand, $otor, $index ) = $ea->() ) {
 188             next if ( !defined($oand) || $oand eq '' );
 189             $oand = $self->_clean_search_term($oand);
 190             $oand = $self->_truncate_terms($oand) if ($truncate);
 191             push @search_params, {
 192                 operand => $oand,      # the search terms
 193                 operator => defined($otor) ? uc $otor : undef,    # AND and so on
 194                 $index ? %$index : (),
 195             };
 196         }
 197
 198         # We build a string query from limits and the queries. An alternative
 199         # would be to pass them separately into build_query and let it build
 200         # them into a structured ES query itself. Maybe later, though that'd be
 201         # more robust.
 202         $search_param_query_str = join( ' ', $self->_create_query_string(@search_params) );
 203         $query_str = join( ' AND ',
 204             $search_param_query_str || (),
 205             $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
 206
 207         # If there's no query on the left, let's remove the junk left behind
 208         $query_str =~ s/^ AND //;
 209         my %options;
 210         $options{sort} = \@sort_params;
 211         $options{is_opac} = $params->{is_opac};
 212         $options{weighted_fields} = $params->{weighted_fields};
 213         $options{whole_record} = $params->{whole_record};
 214         $query = $self->build_query( $query_str, %options );
 215     }
 216
 217     # We roughly emulate the CGI parameters of the zebra query builder
 218     my $query_cgi = '';
 219     shift @$operators; # Shift out the one we unshifted before
 220     my $ea = each_array( @$operands, @$operators, @$indexes );
 221     while ( my ( $oand, $otor, $index ) = $ea->() ) {
 222         $query_cgi .= '&' if $query_cgi;
 223         $query_cgi .= 'idx=' . uri_escape_utf8( $index // '') . '&q=' . uri_escape_utf8( $oand );
 224         $query_cgi .= '&op=' . uri_escape_utf8( $otor ) if $otor;
 225     }
 226     $query_cgi .= '&scan=1' if ( $scan );
 227
 228     my $simple_query;
 229     $simple_query = $operands->[0] if @$operands == 1;
 230     my $query_desc;
 231     if ( $simple_query ) {
 232         $query_desc = $simple_query;
 233     } else {
 234         $query_desc = $search_param_query_str;
 235     }
 236     my $limit     = $self->_join_queries( $self->_convert_index_strings(@$limits));
 237     my $limit_cgi = ( $orig_limits and @$orig_limits )
 238       ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
 239       : '';
 240     my $limit_desc;
 241     $limit_desc = "$limit" if $limit;
 242
 243     return (
 244         undef,  $query,     $simple_query, $query_cgi, $query_desc,
 245         $limit, $limit_cgi, $limit_desc,   undef,      undef
 246     );
 247 }
 248
 249 =head2 build_authorities_query
 250
 251     my $query = $builder->build_authorities_query(\%search);
 252
 253 This takes a nice description of an authority search and turns it into a black-box
 254 query that can then be passed to the appropriate searcher.
 255
 256 The search description is a hashref that looks something like:
 257
 258     {
 259         searches => [
 260             {
 261                 where    => 'Heading',    # search the main entry
 262                 operator => 'exact',        # require an exact match
 263                 value    => 'frogs',        # the search string
 264             },
 265             {
 266                 where    => '',             # search all entries
 267                 operator => '',             # default keyword, right truncation
 268                 value    => 'pond',
 269             },
 270         ],
 271         sort => {
 272             field => 'Heading',
 273             order => 'desc',
 274         },
 275         authtypecode => 'TOPIC_TERM',
 276     }
 277
 278 =cut
 279
 280 sub build_authorities_query {
 281     my ( $self, $search ) = @_;
 282
 283     # Start by making the query parts
 284     my @query_parts;
 285
 286     foreach my $s ( @{ $search->{searches} } ) {
 287         my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
 288         if ( defined $op && ($op eq 'is' || $op eq '=' || $op eq 'exact') ) {
 289             if ($wh) {
 290                 # Match the whole field, case insensitive, UTF normalized.
 291                 push @query_parts, { term => { "$wh.ci_raw" => $val } };
 292             }
 293             else {
 294                 # Match the whole field for all searchable fields, case insensitive,
 295                 # UTF normalized.
 296                 # Given that field data is "The quick brown fox"
 297                 # "The quick brown fox" and "the quick brown fox" will match
 298                 # but not "quick brown fox".
 299                 push @query_parts, {
 300                     multi_match => {
 301                         query => $val,
 302                         fields => $self->_search_fields({ subfield => 'ci_raw' }),
 303                     }
 304                 };
 305             }
 306         }
 307         elsif ( defined $op && $op eq 'start') {
 308             # Match the prefix within a field for all searchable fields.
 309             # Given that field data is "The quick brown fox"
 310             # "The quick bro" will match, but not "quick bro"
 311
 312             # Does not seems to be a multi prefix query
 313             # so we need to create one
 314             if ($wh) {
 315                 # Match prefix of the field.
 316                 push @query_parts, { prefix => {"$wh.ci_raw" => $val} };
 317             }
 318             else {
 319                 my @prefix_queries;
 320                 foreach my $field (@{$self->_search_fields()}) {
 321                     push @prefix_queries, {
 322                         prefix => { "$field.ci_raw" => $val }
 323                     };
 324                 }
 325                 push @query_parts, {
 326                     'bool' => {
 327                         'should' => \@prefix_queries,
 328                         'minimum_should_match' => 1
 329                     }
 330                 };
 331             }
 332         }
 333         else {
 334             # Query all searchable fields.
 335             # Given that field data is "The quick brown fox"
 336             # a search containing any of the words will match, regardless
 337             # of order.
 338
 339             my @tokens = $self->_split_query( $val );
 340             foreach my $token ( @tokens ) {
 341                 $token = $self->_truncate_terms(
 342                     $self->_clean_search_term( $token )
 343                 );
 344             }
 345             my $query = $self->_join_queries( @tokens );
 346
 347             if ($wh) {
 348                 push @query_parts, { query_string => {
 349                     default_field => $wh,
 350                     analyze_wildcard => JSON::true,
 351                     query => $query
 352                 } };
 353             }
 354             else {
 355                 push @query_parts, {
 356                     query_string => {
 357                         analyze_wildcard => JSON::true,
 358                         query => $query,
 359                         fields => $self->_search_fields(),
 360                     }
 361                 };
 362             }
 363         }
 364     }
 365
 366     # Merge the query parts appropriately
 367     # 'should' behaves like 'or'
 368     # 'must' behaves like 'and'
 369     # Zebra behaviour seem to match must so using that here
 370     my $elastic_query = {};
 371     $elastic_query->{bool}->{must} = \@query_parts;
 372
 373     # Filter by authtypecode if set
 374     if ($search->{authtypecode}) {
 375         $elastic_query->{bool}->{filter} = {
 376             term => {
 377                 "authtype.raw" => $search->{authtypecode}
 378             }
 379         };
 380     }
 381
 382     my $query = {
 383         query => $elastic_query
 384     };
 385
 386     # Add the sort stuff
 387     $query->{sort} = [ $search->{sort} ] if exists $search->{sort};
 388
 389     return $query;
 390 }
 391
 392 =head2 build_authorities_query_compat
 393
 394     my ($query) =
 395       $builder->build_authorities_query_compat( \@marclist, \@and_or,
 396         \@excluding, \@operator, \@value, $authtypecode, $orderby );
 397
 398 This builds a query for searching for authorities, in the style of
 399 L<C4::AuthoritiesMarc::SearchAuthorities>.
 400
 401 Arguments:
 402
 403 =over 4
 404
 405 =item marclist
 406
 407 An arrayref containing where the particular term should be searched for.
 408 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
 409 thesaurus. If left blank, any field is used.
 410
 411 =item and_or
 412
 413 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
 414
 415 =item excluding
 416
 417 Also ignored.
 418
 419 =item operator
 420
 421 What form of search to do. Options are: is (phrase, no truncation, whole field
 422 must match), = (number exact match), exact (phrase, no truncation, whole field
 423 must match). If left blank, then word list, right truncated, anywhere is used.
 424
 425 =item value
 426
 427 The actual user-provided string value to search for.
 428
 429 =item authtypecode
 430
 431 The authority type code to search within. If blank, then all will be searched.
 432
 433 =item orderby
 434
 435 The order to sort the results by. Options are Relevance, HeadingAsc,
 436 HeadingDsc, AuthidAsc, AuthidDsc.
 437
 438 =back
 439
 440 marclist, operator, and value must be the same length, and the values at
 441 index /i/ all relate to each other.
 442
 443 This returns a query, which is a black box object that can be passed to the
 444 appropriate search object.
 445
 446 =cut
 447
 448 our $koha_to_index_name = {
 449     mainmainentry   => 'heading-main',
 450     mainentry       => 'heading',
 451     match           => 'match',
 452     'match-heading' => 'match-heading',
 453     'see-from'      => 'match-heading-see-from',
 454     thesaurus       => 'subject-heading-thesaurus',
 455     any             => '',
 456     all             => ''
 457 };
 458
 459 sub build_authorities_query_compat {
 460     my ( $self, $marclist, $and_or, $excluding, $operator, $value,
 461         $authtypecode, $orderby )
 462       = @_;
 463
 464     # This turns the old-style many-options argument form into a more
 465     # extensible hash form that is understood by L<build_authorities_query>.
 466     my @searches;
 467     my $mappings = $self->get_elasticsearch_mappings();
 468
 469     # Convert to lower case
 470     $marclist = [map(lc, @{$marclist})];
 471     $orderby  = lc $orderby;
 472
 473     my @indexes;
 474     # Make sure everything exists
 475     foreach my $m (@$marclist) {
 476
 477         $m = exists $koha_to_index_name->{$m} ? $koha_to_index_name->{$m} : $m;
 478         push @indexes, $m;
 479         warn "Unknown search field $m in marclist" unless (defined $mappings->{data}->{properties}->{$m} || $m eq '');
 480     }
 481     for ( my $i = 0 ; $i < @$value ; $i++ ) {
 482         next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
 483         push @searches,
 484           {
 485             where    => $indexes[$i],
 486             operator => $operator->[$i],
 487             value    => $value->[$i],
 488           };
 489     }
 490
 491     my %sort;
 492     my $sort_field =
 493         ( $orderby =~ /^heading/ ) ? 'heading__sort'
 494       : ( $orderby =~ /^auth/ )    ? 'local-number__sort'
 495       :                              undef;
 496     if ($sort_field) {
 497         my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
 498         %sort = ( $sort_field => $sort_order, );
 499     }
 500     my %search = (
 501         searches     => \@searches,
 502         authtypecode => $authtypecode,
 503     );
 504     $search{sort} = \%sort if %sort;
 505     my $query = $self->build_authorities_query( \%search );
 506     return $query;
 507 }
 508
 509 =head2 _build_scan_query
 510
 511     my ($query, $query_str) = $builder->_build_scan_query(\@operands, \@indexes)
 512
 513 This will build an aggregation scan query that can be issued to elasticsearch from
 514 the provided string input.
 515
 516 =cut
 517
 518 our %scan_field_convert = (
 519     'ti' => 'title',
 520     'au' => 'author',
 521     'su' => 'subject',
 522     'se' => 'title-series',
 523     'pb' => 'publisher',
 524 );
 525
 526 sub _build_scan_query {
 527     my ( $self, $operands, $indexes ) = @_;
 528
 529     my $term = scalar( @$operands ) == 0 ? '' : $operands->[0];
 530     my $index = scalar( @$indexes ) == 0 ? 'subject' : $indexes->[0];
 531
 532     my ( $f, $d ) = split( /,/, $index);
 533     $index = $scan_field_convert{$f} || $f;
 534
 535     my $res;
 536     $res->{query} = {
 537         query_string => {
 538             query => '*'
 539         }
 540     };
 541     $res->{aggregations} = {
 542         $index => {
 543             terms => {
 544                 field => $index . '__facet',
 545                 order => { '_term' => 'asc' },
 546                 include => $self->_create_regex_filter($self->_clean_search_term($term)) . '.*'
 547             }
 548         }
 549     };
 550     return ($res, $term);
 551 }
 552
 553 =head2 _create_regex_filter
 554
 555     my $filter = $builder->_create_regex_filter('term')
 556
 557 This will create a regex filter that can be used with an aggregation query.
 558
 559 =cut
 560
 561 sub _create_regex_filter {
 562     my ($self, $term) = @_;
 563
 564     my $result = '';
 565     foreach my $c (split(//, quotemeta($term))) {
 566         my $lc = lc($c);
 567         my $uc = uc($c);
 568         $result .= $lc ne $uc ? '[' . $lc . $uc . ']' : $c;
 569     }
 570     return $result;
 571 }
 572
 573 =head2 _convert_sort_fields
 574
 575     my @sort_params = _convert_sort_fields(@sort_by)
 576
 577 Converts the zebra-style sort index information into elasticsearch-style.
 578
 579 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
 580 something that can be sent to L<build_query>.
 581
 582 =cut
 583
 584 sub _convert_sort_fields {
 585     my ( $self, @sort_by ) = @_;
 586
 587     # Turn the sorting into something we care about.
 588     my %sort_field_convert = (
 589         acqdate     => 'date-of-acquisition',
 590         author      => 'author',
 591         call_number => 'local-classification',
 592         popularity  => 'issues',
 593         relevance   => undef,       # default
 594         title       => 'title',
 595         pubdate     => 'date-of-publication',
 596     );
 597     my %sort_order_convert =
 598       ( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
 599
 600     # Convert the fields and orders, drop anything we don't know about.
 601     grep { $_->{field} } map {
 602         my ( $f, $d ) = /(.+)_(.+)/;
 603         {
 604             field     => $sort_field_convert{$f},
 605             direction => $sort_order_convert{$d}
 606         }
 607     } @sort_by;
 608 }
 609
 610 =head2 _convert_index_fields
 611
 612     my @index_params = $self->_convert_index_fields(@indexes);
 613
 614 Converts zebra-style search index notation into elasticsearch-style.
 615
 616 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
 617 and it returns something that can be sent to L<build_query>.
 618
 619 B<TODO>: this will pull from the elasticsearch mappings table to figure out
 620 types.
 621
 622 =cut
 623
# Alias table for search index names: each key is a lower-case index name or
# abbreviation and each value is the field name it is rewritten to.
# It is consulted by _convert_index_fields and
# _convert_index_strings_freeform; names without an entry are used as they
# are. An empty value ('kw') means there is no specific field:
# _convert_index_fields returns undef for such an index.
our %index_field_convert = (
    'kw' => '',
    'ab' => 'abstract',
    'au' => 'author',
    'lcn' => 'local-classification',
    'callnum' => 'local-classification',
    'record-type' => 'rtype',
    'mc-rtype' => 'rtype',
    'mus' => 'rtype',
    'lc-card' => 'lc-card-number',
    'sn' => 'local-number',
    'biblionumber' => 'local-number',
    'yr' => 'date-of-publication',
    'pubdate' => 'date-of-publication',
    'acqdate' => 'date-of-acquisition',
    'date/time-last-modified' => 'date-time-last-modified',
    'dtlm' => 'date-time-last-modified',
    'diss' => 'dissertation-information',
    'nb' => 'isbn',
    'ns' => 'issn',
    'music-number' => 'identifier-publisher-for-music',
    'number-music-publisher' => 'identifier-publisher-for-music',
    'music' => 'identifier-publisher-for-music',
    'ident' => 'identifier-standard',
    'cpn' => 'corporate-name',
    'cfn' => 'conference-name',
    'pn' => 'personal-name',
    'pb' => 'publisher',
    'pv' => 'provider',
    'nt' => 'note',
    'notes' => 'note',
    'rcn' => 'record-control-number',
    'su' => 'subject',
    'su-to' => 'subject',
    #'su-geo' => 'subject',
    'su-ut' => 'subject',
    'ti' => 'title',
    'se' => 'title-series',
    'ut' => 'title-uniform',
    'an' => 'koha-auth-number',
    'authority-number' => 'koha-auth-number',
    'at' => 'authtype',
    'he' => 'heading',
    'rank' => 'relevance',
    'phr' => 'st-phrase',
    'wrdl' => 'st-word-list',
    'rt' => 'right-truncation',
    'rtrn' => 'right-truncation',
    'ltrn' => 'left-truncation',
    'rltrn' => 'left-and-right',
    'mc-itemtype' => 'itemtype',
    'mc-ccode' => 'ccode',
    'branch' => 'homebranch',
    'mc-loc' => 'location',
    'loc' => 'location',
    'stocknumber' => 'number-local-acquisition',
    'inv' => 'number-local-acquisition',
    'bc' => 'barcode',
    'mc-itype' => 'itype',
    'aub' => 'author-personal-bibliography',
    'auo' => 'author-in-order',
    'ff8-22' => 'ta',
    'aud' => 'ta',
    'audience' => 'ta',
    'frequency-code' => 'ff8-18',
    'illustration-code' => 'ff8-18-21',
    'regularity-code' => 'ff8-19',
    'type-of-serial' => 'ff8-21',
    'format' => 'ff8-23',
    'conference-code' => 'ff8-29',
    'festschrift-indicator' => 'ff8-30',
    'index-indicator' => 'ff8-31',
    'fiction' => 'lf',
    'fic' => 'lf',
    'literature-code' => 'lf',
    'biography' => 'bio',
    'ff8-34' => 'bio',
    'biography-code' => 'bio',
    'l-format' => 'ff7-01-02',
    'lex' => 'lexile-number',
    'hi' => 'host-item-number',
    'itu' => 'index-term-uncontrolled',
    'itg' => 'index-term-genre',
);
# Regex source for a single field name: word characters and hyphens.
my $field_name_pattern = '[\w\-]+';
# Regex source for zero or more ".name" suffixes following a field name.
my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
 710
 711 sub _convert_index_fields {
 712     my ( $self, @indexes ) = @_;
 713
 714     my %index_type_convert =
 715       ( __default => undef, phr => 'phrase', rtrn => 'right-truncate', 'st-year' => 'st-year' );
 716
 717     # Convert according to our table, drop anything that doesn't convert.
 718     # If a field starts with mc- we save it as it's used (and removed) later
 719     # when joining things, to indicate we make it an 'OR' join.
 720     # (Sorry, this got a bit ugly after special cases were found.)
 721     map {
 722         # Lower case all field names
 723         my ( $f, $t ) = map(lc, split /,/);
 724         my $mc = '';
 725         if ($f =~ /^mc-/) {
 726             $mc = 'mc-';
 727             $f =~ s/^mc-//;
 728         }
 729         my $r = {
 730             field => exists $index_field_convert{$f} ? $index_field_convert{$f} : $f,
 731             type  => $index_type_convert{ $t // '__default' }
 732         };
 733         $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
 734         $r->{field} ? $r : undef;
 735     } @indexes;
 736 }
 737
 738 =head2 _convert_index_strings
 739
 740     my @searches = $self->_convert_index_strings(@searches);
 741
 742 Similar to L<_convert_index_fields>, this takes strings of the form
 743 B<field:search term> and rewrites the field from zebra-style to
 744 elasticsearch-style. Anything it doesn't understand is returned verbatim.
 745
 746 =cut
 747
 748 sub _convert_index_strings {
 749     my ( $self, @searches ) = @_;
 750     my @res;
 751     foreach my $s (@searches) {
 752         next if $s eq '';
 753         my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
 754         unless ( defined($field) && defined($term) ) {
 755             push @res, $s;
 756             next;
 757         }
 758         my ($conv) = $self->_convert_index_fields($field);
 759         unless ( defined($conv) ) {
 760             push @res, $s;
 761             next;
 762         }
 763         push @res, ($conv->{field} ? $conv->{field} . ':' : '')
 764             . $self->_modify_string_by_type( %$conv, operand => $term );
 765     }
 766     return @res;
 767 }
 768
 769 =head2 _convert_index_strings_freeform
 770
 771     my $search = $self->_convert_index_strings_freeform($search);
 772
 773 This is similar to L<_convert_index_strings>, however it'll search out the
 774 things to change within the string. So it can handle strings such as
 775 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
 776
 777 If there is something of the form "su,complete-subfield" or something, the
 778 second part is stripped off as we can't yet handle that. Making it work
 779 will have to wait for a real query parser.
 780
 781 =cut
 782
# Rewrite every "field:" prefix found anywhere inside a free-form query
# string and return the modified string.
sub _convert_index_strings_freeform {
    my ( $self, $search ) = @_;
    # @TODO: Currently this will also alter fields contained within quotes:
    # `searching for "stuff sn:123"` for example will become
    # `searching for "stuff local-number:123"`
    #
    # Fixing this is tricky, one possibility:
    # https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
    # Still not perfect, and will not handle escaped quotes within quotes and assumes balanced quotes.
    #
    # Another, not so elegant, solution could be to replace all quoted content with placeholders, and put
    # them back when processing is done.

    # Lower case field names. A ",qualifier" directly after the field name
    # is matched but not put back, so it is removed from the string.
    $search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;
    # Resolve possible field aliases; names without an alias are kept, and
    # any ".subfield" suffixes are carried over unchanged.
    $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/oge;
    return $search;
}
 802
 803 =head2 _modify_string_by_type
 804
 805     my $str = $self->_modify_string_by_type(%index_field);
 806
 807 If you have a search term (operand) and a type (phrase, right-truncated), this
 808 will convert the string to have the function in lucene search terms, e.g.
 809 wrapping quotes around it.
 810
 811 =cut
 812
 813 sub _modify_string_by_type {
 814     my ( $self, %idx ) = @_;
 815
 816     my $type = $idx{type} || '';
 817     my $str = $idx{operand};
 818     return $str unless $str;    # Empty or undef, we can't use it.
 819
 820     $str .= '*' if $type eq 'right-truncate';
 821     $str = '"' . $str . '"' if $type eq 'phrase' && $str !~ /^".*"$/;
 822     if ($type eq 'st-year') {
 823         if ($str =~ /^(.*)-(.*)$/) {
 824             my $from = $1 || '*';
 825             my $until = $2 || '*';
 826             $str = "[$from TO $until]";
 827         }
 828     }
 829     return $str;
 830 }
 831
 832 =head2 _join_queries
 833
 834     my $query_str = $self->_join_queries(@query_parts);
 835
 836 This takes a list of query parts, that might be search terms on their own, or
 837 booleaned together, or specifying fields, or whatever, wraps them in
 838 parentheses, and ANDs them all together. Suitable for feeding to the ES
 839 query string query.
 840
Note: doesn't AND them together if they specify an index that starts with "mc"
as that was a special case in the original code for dealing with multiple
choice options (you can't search for something that has an itype of A and
an itype of B otherwise.)
 845
 846 =cut
 847
 848 sub _join_queries {
 849     my ( $self, @parts ) = @_;
 850
 851     my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
 852     my @mc_parts =
 853       map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
 854     return () unless @norm_parts + @mc_parts;
 855     return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
 856
 857     # Group limits by field, so they can be OR'ed together
 858     my %mc_limits;
 859     foreach my $mc_part (@mc_parts) {
 860         my ($field, $value) = split /:/, $mc_part, 2;
 861         $mc_limits{$field} //= [];
 862         push @{ $mc_limits{$field} }, $value;
 863     }
 864
 865     @mc_parts = map {
 866         sprintf('%s:(%s)', $_, join (' OR ', @{ $mc_limits{$_} }));
 867     } sort keys %mc_limits;
 868
 869     @norm_parts = map { "($_)" } @norm_parts;
 870
 871     return join( ' AND ', @norm_parts, @mc_parts);
 872 }
 873
 874 =head2 _make_phrases
 875
 876     my @phrased_queries = $self->_make_phrases(@query_parts);
 877
This takes the supplied queries and forces them to be phrases by wrapping
quotes around them. It understands field prefixes, e.g. 'subject:', and puts
the quotes around the part that follows the prefix. Parts without a field
prefix are returned unchanged.
 881
 882 =cut
 883
 884 sub _make_phrases {
 885     my ( $self, @parts ) = @_;
 886     map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
 887 }
 888
 889 =head2 _create_query_string
 890
 891     my @query_strings = $self->_create_query_string(@queries);
 892
 893 Given a list of hashrefs, it will turn them into a lucene-style query string.
 894 The hash should contain field, type (both for the indexes), operator, and
 895 operand.
 896
 897 =cut
 898
 899 sub _create_query_string {
 900     my ( $self, @queries ) = @_;
 901
 902     map {
 903         my $otor  = $_->{operator} ? $_->{operator} . ' ' : '';
 904         my $field = $_->{field}    ? $_->{field} . ':'    : '';
 905
 906         my $oand = $self->_modify_string_by_type(%$_);
 907         $oand = "($oand)" if $field && scalar(split(/\s+/, $oand)) > 1 && (!defined $_->{type} || $_->{type} ne 'st-year');
 908         "$otor($field$oand)";
 909     } @queries;
 910 }
 911
 912 =head2 _clean_search_term
 913
 914     my $term = $self->_clean_search_term($term);
 915
 916 This cleans a search term by removing any funny characters that may upset
 917 ES and give us an error. It also calls L<_convert_index_strings_freeform>
 918 to ensure those parts are correct.
 919
 920 =cut
 921
 922 sub _clean_search_term {
 923     my ( $self, $term ) = @_;
 924
 925     # Lookahead for checking if we are inside quotes
 926     my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';
 927
 928     # Some hardcoded searches (like with authorities) produce things like
 929     # 'an=123', when it ought to be 'an:123' for our purposes.
 930     $term =~ s/=/:/g;
 931
 932     $term = $self->_convert_index_strings_freeform($term);
 933     $term =~ s/[{}]/"/g;
 934
 935     # Remove unbalanced quotes
 936     my $unquoted = $term;
 937     my $count = ($unquoted =~ tr/"/ /);
 938     if ($count % 2 == 1) {
 939         $term = $unquoted;
 940     }
 941
 942     # Remove unquoted colons that have whitespace on either side of them
 943     $term =~ s/(\:[:\s]+|[:\s]+:)$lookahead//g;
 944
 945     $term = $self->_query_regex_escape_process($term);
 946
 947     return $term;
 948 }
 949
 950 =head2 _query_regex_escape_process
 951
 952     my $query = $self->_query_regex_escape_process($query);
 953
 954 Processes query in accordance with current "QueryRegexEscapeOptions" system preference setting.
 955
 956 =cut
 957
 958 sub _query_regex_escape_process {
 959     my ($self, $query) = @_;
 960     my $regex_escape_options = C4::Context->preference("QueryRegexEscapeOptions");
 961     if ($regex_escape_options ne 'dont_escape') {
 962         if ($regex_escape_options eq 'escape') {
 963             # Will escape unescaped slashes (/) while preserving
 964             # unescaped slashes within quotes
 965             # @TODO: assumes quotes are always balanced and will
 966             # not handle escaped qoutes properly, should perhaps be
 967             # replaced with a more general parser solution
 968             # so that this function is ever only provided with unqouted
 969             # query parts
 970             $query =~ s@(?:(?<!\\)((?:[\\]{2})*)(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@\\$1@g;
 971         }
 972         elsif($regex_escape_options eq 'unescape_escaped') {
 973             # Will unescape escaped slashes (\/) and escape
 974             # unescaped slashes (/) while preserving slashes within quotes
 975             # The same limitatations as above apply for handling of quotes
 976             $query =~ s@(?:(?<!\\)(?:((?:[\\]{2})*[\\])|((?:[\\]{2})*))(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@($1 ? substr($1, 0, -1) : ($2 . "\\"))@ge;
 977         }
 978     }
 979     return $query;
 980 }
 981
 982 =head2 _fix_limit_special_cases
 983
 984     my $limits = $self->_fix_limit_special_cases($limits);
 985
 986 This converts any special cases that the limit specifications have into things
 987 that are more readily processable by the rest of the code.
 988
 989 The argument should be an arrayref, and it'll return an arrayref.
 990
 991 =cut
 992
 993 sub _fix_limit_special_cases {
 994     my ( $self, $limits ) = @_;
 995
 996     my @new_lim;
 997     foreach my $l (@$limits) {
 998
 999         # This is set up by opac-search.pl
1000         if ( $l =~ /^yr,st-numeric,ge=/ ) {
1001             my ( $start, $end ) =
1002               ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
1003             next unless defined($start) && defined($end);
1004             push @new_lim, "copydate:[$start TO $end]";
1005         }
1006         elsif ( $l =~ /^yr,st-numeric=/ ) {
1007             my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
1008             next unless defined($date);
1009             $date = $self->_modify_string_by_type(type => 'st-year', operand => $date);
1010             push @new_lim, "copydate:$date";
1011         }
1012         elsif ( $l =~ /^available$/ ) {
1013             push @new_lim, 'onloan:false';
1014         }
1015         else {
1016             my ( $field, $term ) = $l =~ /^\s*([\w,-]*?):(.*)/;
1017             $field =~ s/,phr$//; #We are quoting all the limits as phrase, this prevents from quoting again later
1018             if ( defined($field) && defined($term) ) {
1019                 push @new_lim, "$field:(\"$term\")";
1020             }
1021             else {
1022                 push @new_lim, $l;
1023             }
1024         }
1025     }
1026     return \@new_lim;
1027 }
1028
1029 =head2 _sort_field
1030
1031     my $field = $self->_sort_field($field);
1032
Given a field name, this works out what the actual name of the field to sort
on should be. A '__sort' suffix is added unless the field is known to have no
sort version; in that case '.raw' is appended to text fields to avoid sorting
on a tokenized value, and other fields are returned unchanged.
1037
1038 =cut
1039
sub _sort_field {
    my ( $self, $f ) = @_;

    my $field_type = $self->get_elasticsearch_mappings()->{data}{properties}{$f}{type};
    my $is_text    = defined $field_type && $field_type eq 'text';
    my $sortable   = $self->sort_fields()->{$f};

    # Unknown fields and fields flagged as sortable use the sort version.
    return $f . '__sort' if !defined $sortable || $sortable;

    # Text fields without a sort version need '.raw', otherwise the sort
    # would be based on the tokenised form.
    return $is_text ? $f . '.raw' : $f;
}
1054
1055 =head2 _truncate_terms
1056
1057     my $query = $self->_truncate_terms($query);
1058
Given a string query this function appends the '*' wildcard to all terms
except boolean operators (and, or, not) and terms that end in a non-word
character, such as double quoted strings.
1061
1062 =cut
1063
sub _truncate_terms {
    my ( $self, $query ) = @_;

    # Boolean keywords are never truncated (matched case-insensitively).
    my %is_operator = map { $_ => 1 } qw/and or not/;

    my @terms;
    foreach my $token ( $self->_split_query($query) ) {

        # Skip empty and whitespace-only tokens.
        next if $token =~ /^\s*$/;

        # Tokens ending in a non-word character (e.g. a closing quote)
        # are kept as they are, like the keywords.
        my $keep_as_is = $token =~ /\W$/ || $is_operator{ lc $token };
        push @terms, $keep_as_is ? $token : "$token*";
    }

    return join ' ', @terms;
}
1080
1081 =head2 _split_query
1082
1083     my @token = $self->_split_query($query_str);
1084
1085 Given a string query this function splits it to tokens taking into account
1086 any field prefixes and quoted strings.
1087
1088 =cut
1089
# Separator used by _split_query: either a double quoted string, optionally
# preceded by a field prefix, or a run of whitespace. The capturing group
# makes split return the separators as well.
my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;

sub _split_query {
    my ( $self, $query ) = @_;

    # '"donald duck" title:"the mouse" and peter' is first split into
    # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'peter']
    # and the empty and whitespace-only pieces are then filtered out.
    return grep { /\S/ } split $tokenize_split_re, $query;
}
1104
=head2 _search_fields

    my $weighted_fields = $self->_search_fields({
1107         is_opac => 0,
1108         weighted_fields => 1,
1109         subfield => 'raw'
1110     });
1111
1112 Generate a list of searchable fields to be used for Elasticsearch queries
1113 applied to multiple fields.
1114
1115 Returns an arrayref of field names for either OPAC or Staff client, with
1116 possible weights and subfield appended to each field name depending on the
1117 options provided.
1118
1119 =over 4
1120
1121 =item C<$params>
1122
1123 Hashref with options. The parameter C<is_opac> indicates whether the searchable
1124 fields for OPAC or Staff client should be retrieved. If C<weighted_fields> is set
1125 fields weights will be applied on returned fields. C<subfield> can be used to
1126 provide a subfield that will be appended to fields as "C<field_name>.C<subfield>".
1127
1128 =back
1129
1130 =cut
1131
sub _search_fields {
    my ($self, $params) = @_;

    # Defaults apply only when no params hashref is passed at all.
    $params //= {
        is_opac => 0,
        weighted_fields => 0,
        whole_record => 0,
        # This is a hack for authorities build_authorities_query
        # can hopefully be removed in the future
        subfield => undef,
    };

    # The field list is cached per interface (OPAC/staff client) and index.
    # Each cached entry is [name] or [name, weight].
    my $cache = Koha::Caches->get_instance();
    my $cache_key = 'elasticsearch_search_fields' . ($params->{is_opac} ? '_opac' : '_staff_client') . "_" . $self->index;
    my $search_fields = $cache->get_from_cache($cache_key, { unsafe => 1 });
    if (!$search_fields) {
        # The reason we don't use Koha::SearchFields->search here is we don't
        # want or need resultset wrapped as Koha::SearchField object.
        # It does not make any sense in this context and would cause
        # unnecessary overhead since we are only querying for data
        # Also would not work, or produce strange results, with the "columns"
        # option.
        my $schema = Koha::Database->schema;
        my $result = $schema->resultset('SearchField')->search(
            {
                $params->{is_opac} ? (
                    'opac' => 1,
                ) : (
                    'staff_client' => 1
                ),
                'type' => { '!=' => 'boolean' },
                'search_marc_map.index_name' => $self->index,
                'search_marc_map.marc_type' => C4::Context->preference('marcflavour'),
                'search_marc_to_fields.search' => 1,
            },
            {
                columns => [qw/name weight/],
                collapse => 1,
                join => {search_marc_to_fields => 'search_marc_map'},
            }
        );
        my @search_fields;
        while (my $search_field = $result->next) {
            # The weight is only stored when it has a true value
            push @search_fields, [
                lc $search_field->name,
                $search_field->weight ? $search_field->weight : ()
            ];
        }
        $search_fields = \@search_fields;
        $cache->set_in_cache($cache_key, $search_fields);
    }
    if ($params->{subfield}) {
        my $subfield = $params->{subfield};
        $search_fields = [
            map {
                # Copy values to avoid mutating cached
                # data (since unsafe is used).
                # The weight is carried over only when the entry has one:
                # copying it unconditionally would add an undef element,
                # which the join below would turn into a dangling "^"
                # ("field.subfield^") for fields without a weight.
                my ($field, @weight) = @{$_};
                ["${field}.${subfield}", @weight];
            } @{$search_fields}
        ];
    }
    if ($params->{weighted_fields}) {
        # "name^weight" for weighted fields, plain "name" otherwise
        return [map { join('^', @{$_}) } @{$search_fields}];
    }
    else {
        # Exclude weight from field
        return [map { $_->[0] } @{$search_fields}];
    }
}
1200
1201 1;