Koha/SearchEngine/Elasticsearch/QueryBuilder.pm

   1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
   2
   3 # This file is part of Koha.
   4 #
   5 # Copyright 2014 Catalyst IT Ltd.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it
   8 # under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  19
  20 =head1 NAME
  21
  22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
  23 query objects from user-supplied queries
  24
  25 =head1 DESCRIPTION
  26
  27 This provides the functions that take a user-supplied search query, and
  28 provides something that can be given to elasticsearch to get answers.
  29
  30 =head1 SYNOPSIS
  31
  32     use Koha::SearchEngine::Elasticsearch::QueryBuilder;
   33     my $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
  34     my $simple_query = $builder->build_query("hello");
  35     # This is currently undocumented because the original code is undocumented
  36     my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
  37
  38 =head1 METHODS
  39
  40 =cut
  41
  42 use base qw(Koha::SearchEngine::Elasticsearch);
  43 use Carp;
  44 use JSON;
  45 use List::MoreUtils qw/ each_array /;
  46 use Modern::Perl;
  47 use URI::Escape;
  48
  49 use C4::Context;
  50 use Koha::Exceptions;
  51
  52 =head2 build_query
  53
  54     my $simple_query = $builder->build_query("hello", %options)
  55
  56 This will build a query that can be issued to elasticsearch from the provided
  57 string input. This expects a lucene style search form (see
  58 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
  59 for details.)
  60
  61 It'll make an attempt to respect the various query options.
  62
  63 Additional options can be provided with the C<%options> hash.
  64
  65 =over 4
  66
  67 =item sort
  68
  69 This should be an arrayref of hashrefs, each containing a C<field> and an
  70 C<direction> (optional, defaults to C<asc>.) The results will be sorted
  71 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
  72
  73 =back
  74
  75 =cut
  76
  77 sub build_query {
  78     my ( $self, $query, %options ) = @_;
  79
  80     my $stemming         = C4::Context->preference("QueryStemming")        || 0;
  81     my $auto_truncation  = C4::Context->preference("QueryAutoTruncate")    || 0;
  82     my $weight_fields    = C4::Context->preference("QueryWeightFields")    || 0;
  83     my $fuzzy_enabled    = C4::Context->preference("QueryFuzzy")           || 0;
  84
  85     $query = '*' unless defined $query;
  86
  87     my $res;
  88     $res->{query} = {
  89         query_string => {
  90             query            => $query,
  91             fuzziness        => $fuzzy_enabled ? 'auto' : '0',
  92             default_operator => 'AND',
  93             default_field    => '_all',
  94             lenient          => JSON::true,
  95             fields           => $options{fields} || [],
  96         }
  97     };
  98
  99     if ( $options{sort} ) {
 100         foreach my $sort ( @{ $options{sort} } ) {
 101             my ( $f, $d ) = @$sort{qw/ field direction /};
 102             die "Invalid sort direction, $d"
 103               if $d && ( $d ne 'asc' && $d ne 'desc' );
 104             $d = 'asc' unless $d;
 105
 106             $f = $self->_sort_field($f);
 107             push @{ $res->{sort} }, { $f => { order => $d } };
 108         }
 109     }
 110
 111     # See _convert_facets in Search.pm for how these get turned into
 112     # things that Koha can use.
 113     $res->{aggregations} = {
 114         author         => { terms => { field => "author__facet" } },
 115         subject        => { terms => { field => "subject__facet" } },
 116         itype          => { terms => { field => "itype__facet" } },
 117         location       => { terms => { field => "location__facet" } },
 118         'su-geo'       => { terms => { field => "su-geo__facet" } },
 119         'title-series' => { terms => { field => "title-series__facet" } },
 120         ccode          => { terms => { field => "ccode__facet" } },
 121         ln             => { terms => { field => "ln__facet" } },
 122     };
 123
 124     my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
 125     if (   $display_library_facets eq 'both'
 126         or $display_library_facets eq 'home' ) {
 127         $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet" } };
 128     }
 129     if (   $display_library_facets eq 'both'
 130         or $display_library_facets eq 'holding' ) {
 131         $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet" } };
 132     }
 133     if ( my $ef = $options{expanded_facet} ) {
 134         $res->{aggregations}{$ef}{terms}{size} = C4::Context->preference('FacetMaxCount');
 135     };
 136     return $res;
 137 }
 138
 139 =head2 build_browse_query
 140
 141     my $browse_query = $builder->build_browse_query($field, $query);
 142
 143 This performs a "starts with" style query on a particular field. The field
 144 to be searched must have been indexed with an appropriate mapping as a
 145 "phrase" subfield, which pretty much everything has.
 146
 147 =cut
 148
 149 # XXX this isn't really a browse query like we want in the end
 150 sub build_browse_query {
 151     my ( $self, $field, $query ) = @_;
 152
 153     my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
 154
 155     return { query => '*' } if !defined $query;
 156
 157     # TODO this should come from Koha::SearchEngine::Elasticsearch
 158     my %field_whitelist = (
 159         title  => 1,
 160         author => 1,
 161     );
 162     $field = 'title' if !exists $field_whitelist{$field};
 163     my $sort = $self->_sort_field($field);
 164     my $res = {
 165         query => {
 166             match_phrase_prefix => {
 167                 "$field.phrase" => {
 168                     query     => $query,
 169                     operator  => 'or',
 170                     fuzziness => $fuzzy_enabled ? 'auto' : '0',
 171                 }
 172             }
 173         },
 174         sort => [ { $sort => { order => "asc" } } ],
 175     };
 176 }
 177
 178 =head2 build_query_compat
 179
 180     my (
 181         $error,             $query, $simple_query, $query_cgi,
 182         $query_desc,        $limit, $limit_cgi,    $limit_desc,
 183         $stopwords_removed, $query_type
 184       )
 185       = $builder->build_query_compat( \@operators, \@operands, \@indexes,
 186         \@limits, \@sort_by, $scan, $lang );
 187
 188 This handles a search using the same api as L<C4::Search::buildQuery> does.
 189
 190 A very simple query will go in with C<$operands> set to ['query'], and
 191 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
 192 C<$query> set to something that can perform the search, C<$simple_query>
 193 set to just the search term, C<$query_cgi> set to something that can
  194 reproduce this search, and C<$query_desc> set to the search term for display.
 195
 196 =cut
 197
 198 sub build_query_compat {
 199     my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
 200         $lang, $params )
 201       = @_;
 202
 203 #die Dumper ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan, $lang );
 204     my @sort_params  = $self->_convert_sort_fields(@$sort_by);
 205     my @index_params = $self->_convert_index_fields(@$indexes);
 206     my $limits       = $self->_fix_limit_special_cases($orig_limits);
 207     if ( $params->{suppress} ) { push @$limits, "suppress:0"; }
 208
 209     # Merge the indexes in with the search terms and the operands so that
 210     # each search thing is a handy unit.
 211     unshift @$operators, undef;    # The first one can't have an op
 212     my @search_params;
 213     my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
 214     my $ea = each_array( @$operands, @$operators, @index_params );
 215     while ( my ( $oand, $otor, $index ) = $ea->() ) {
 216         next if ( !defined($oand) || $oand eq '' );
 217         $oand = $self->_clean_search_term($oand);
 218         $oand = $self->_truncate_terms($oand) if ($truncate);
 219         push @search_params, {
 220             operand => $oand,      # the search terms
 221             operator => defined($otor) ? uc $otor : undef,    # AND and so on
 222             $index ? %$index : (),
 223         };
 224     }
 225
 226     # We build a string query from limits and the queries. An alternative
 227     # would be to pass them separately into build_query and let it build
 228     # them into a structured ES query itself. Maybe later, though that'd be
 229     # more robust.
 230     my $search_param_query_str = join( ' ', $self->_create_query_string(@search_params) );
 231     my $query_str = join( ' AND ',
 232         $search_param_query_str || (),
 233         $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
 234
 235     my @fields = '_all';
 236     if ( defined($params->{weighted_fields}) && $params->{weighted_fields} ) {
 237         push @fields, sprintf("%s^%s", $_->name, $_->weight) for Koha::SearchFields->weighted_fields;
 238     }
 239
 240     # If there's no query on the left, let's remove the junk left behind
 241     $query_str =~ s/^ AND //;
 242     my %options;
 243     $options{fields} = \@fields;
 244     $options{sort} = \@sort_params;
 245     $options{expanded_facet} = $params->{expanded_facet};
 246     my $query = $self->build_query( $query_str, %options );
 247
 248     # We roughly emulate the CGI parameters of the zebra query builder
 249     my $query_cgi = '';
 250     shift @$operators; # Shift out the one we unshifted before
 251     $ea = each_array( @$operands, @$operators, @$indexes );
 252     while ( my ( $oand, $otor, $index ) = $ea->() ) {
 253         $query_cgi .= '&' if $query_cgi;
 254         $query_cgi .= 'idx=' . uri_escape_utf8( $index // '') . '&q=' . uri_escape_utf8( $oand );
 255         $query_cgi .= '&op=' . uri_escape_utf8( $otor ) if $otor;
 256     }
 257     $query_cgi .= '&scan=1' if ( $scan );
 258
 259     my $simple_query;
 260     $simple_query = $operands->[0] if @$operands == 1;
 261     my $query_desc;
 262     if ( $simple_query ) {
 263         $query_desc = $simple_query;
 264     } else {
 265         $query_desc = $search_param_query_str;
 266     }
 267     my $limit     = $self->_join_queries( $self->_convert_index_strings(@$limits));
 268     my $limit_cgi = ( $orig_limits and @$orig_limits )
 269       ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
 270       : '';
 271     my $limit_desc;
 272     $limit_desc = "$limit" if $limit;
 273
 274     return (
 275         undef,  $query,     $simple_query, $query_cgi, $query_desc,
 276         $limit, $limit_cgi, $limit_desc,   undef,      undef
 277     );
 278 }
 279
 280 =head2 build_authorities_query
 281
 282     my $query = $builder->build_authorities_query(\%search);
 283
 284 This takes a nice description of an authority search and turns it into a black-box
 285 query that can then be passed to the appropriate searcher.
 286
 287 The search description is a hashref that looks something like:
 288
 289     {
 290         searches => [
 291             {
 292                 where    => 'Heading',    # search the main entry
 293                 operator => 'exact',        # require an exact match
 294                 value    => 'frogs',        # the search string
 295             },
 296             {
 297                 where    => '',             # search all entries
 298                 operator => '',             # default keyword, right truncation
 299                 value    => 'pond',
 300             },
 301         ],
 302         sort => {
 303             field => 'Heading',
 304             order => 'desc',
 305         },
 306         authtypecode => 'TOPIC_TERM',
 307     }
 308
 309 =cut
 310
 311 sub build_authorities_query {
 312     my ( $self, $search ) = @_;
 313
 314     # Start by making the query parts
 315     my @query_parts;
 316
 317     foreach my $s ( @{ $search->{searches} } ) {
 318         my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
 319         $wh = '_all' if $wh eq '';
 320         if ( $op eq 'is' || $op eq '='  || $op eq 'exact' ) {
 321
 322             # look for something that matches a term completely
 323             # note, '=' is about numerical vals. May need special handling.
 324             # Also, we lowercase our search because the ES
 325             # index lowercases its values, and term searches don't get the
 326             # search analyzer applied to them.
 327             push @query_parts, { match_phrase => {"$wh.phrase" => lc $val} };
 328         }
 329         elsif ( $op eq 'start' ) {
 330             # startswith search, uses lowercase untokenized version of heading
 331             push @query_parts, { match_phrase_prefix => {"$wh.phrase" => lc $val} };
 332         }
 333         else {
 334             # regular wordlist stuff
 335             my @tokens = $self->_split_query( $val );
 336             foreach my $token ( @tokens ) {
 337                 $token = $self->_truncate_terms(
 338                     $self->_clean_search_term( $token )
 339                 );
 340             }
 341             my $query = $self->_join_queries( @tokens );
 342             push @query_parts, { query_string => { default_field => $wh, query => $query } };
 343         }
 344     }
 345
 346     # Merge the query parts appropriately
 347     # 'should' behaves like 'or'
 348     # 'must' behaves like 'and'
 349     # Zebra results seem to match must so using that here
 350     my $query = { query =>
 351                  { bool =>
 352                      { must => \@query_parts  }
 353                  }
 354              };
 355
 356     my %s;
 357     if ( exists $search->{sort} ) {
 358         foreach my $k ( keys %{ $search->{sort} } ) {
 359             my $f = $self->_sort_field($k);
 360             $s{$f} = $search->{sort}{$k};
 361         }
 362         $search->{sort} = \%s;
 363     }
 364
 365     # add the sort stuff
 366     $query->{sort} = [ $search->{sort} ]  if exists $search->{sort};
 367
 368     return $query;
 369 }
 370
 371
 372 =head2 build_authorities_query_compat
 373
 374     my ($query) =
 375       $builder->build_authorities_query_compat( \@marclist, \@and_or,
 376         \@excluding, \@operator, \@value, $authtypecode, $orderby );
 377
 378 This builds a query for searching for authorities, in the style of
 379 L<C4::AuthoritiesMarc::SearchAuthorities>.
 380
 381 Arguments:
 382
 383 =over 4
 384
 385 =item marclist
 386
 387 An arrayref containing where the particular term should be searched for.
 388 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
 389 thesaurus. If left blank, any field is used.
 390
 391 =item and_or
 392
 393 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
 394
 395 =item excluding
 396
 397 Also ignored.
 398
 399 =item operator
 400
 401 What form of search to do. Options are: is (phrase, no truncation, whole field
 402 must match), = (number exact match), exact (phrase, no truncation, whole field
 403 must match). If left blank, then word list, right truncated, anywhere is used.
 404
 405 =item value
 406
 407 The actual user-provided string value to search for.
 408
 409 =item authtypecode
 410
 411 The authority type code to search within. If blank, then all will be searched.
 412
 413 =item orderby
 414
 415 The order to sort the results by. Options are Relevance, HeadingAsc,
 416 HeadingDsc, AuthidAsc, AuthidDsc.
 417
 418 =back
 419
 420 marclist, operator, and value must be the same length, and the values at
 421 index /i/ all relate to each other.
 422
 423 This returns a query, which is a black box object that can be passed to the
 424 appropriate search object.
 425
 426 =cut
 427
 428 our $koha_to_index_name = {
 429     mainmainentry   => 'heading-main',
 430     mainentry       => 'heading',
 431     match           => 'match',
 432     'match-heading' => 'match-heading',
 433     'see-from'      => 'match-heading-see-from',
 434     thesaurus       => 'subject-heading-thesaurus',
 435     any             => '',
 436     all             => ''
 437 };
 438
 439 sub build_authorities_query_compat {
 440     my ( $self, $marclist, $and_or, $excluding, $operator, $value,
 441         $authtypecode, $orderby )
 442       = @_;
 443
 444     # This turns the old-style many-options argument form into a more
 445     # extensible hash form that is understood by L<build_authorities_query>.
 446     my @searches;
 447
 448     # Convert to lower case
 449     $marclist = [map(lc, @{$marclist})];
 450     $orderby  = lc $orderby;
 451
 452     # Make sure everything exists
 453     foreach my $m (@$marclist) {
 454         Koha::Exceptions::WrongParameter->throw("Invalid marclist field provided: $m")
 455             unless exists $koha_to_index_name->{$m};
 456     }
 457     for ( my $i = 0 ; $i < @$value ; $i++ ) {
 458         next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
 459         push @searches,
 460           {
 461             where    => $koha_to_index_name->{$marclist->[$i]},
 462             operator => $operator->[$i],
 463             value    => $value->[$i],
 464           };
 465     }
 466
 467     my %sort;
 468     my $sort_field =
 469         ( $orderby =~ /^heading/ ) ? 'heading'
 470       : ( $orderby =~ /^auth/ )    ? 'local-number'
 471       :                              undef;
 472     if ($sort_field) {
 473         my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
 474         %sort = ( $sort_field => $sort_order, );
 475     }
 476     my %search = (
 477         searches     => \@searches,
 478         authtypecode => $authtypecode,
 479     );
 480     $search{sort} = \%sort if %sort;
 481     my $query = $self->build_authorities_query( \%search );
 482     return $query;
 483 }
 484
 485 =head2 _convert_sort_fields
 486
 487     my @sort_params = _convert_sort_fields(@sort_by)
 488
 489 Converts the zebra-style sort index information into elasticsearch-style.
 490
 491 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
 492 something that can be sent to L<build_query>.
 493
 494 =cut
 495
 496 sub _convert_sort_fields {
 497     my ( $self, @sort_by ) = @_;
 498
 499     # Turn the sorting into something we care about.
 500     my %sort_field_convert = (
 501         acqdate     => 'date-of-acquisition',
 502         author      => 'author',
 503         call_number => 'local-classification',
 504         popularity  => 'issues',
 505         relevance   => undef,       # default
 506         title       => 'title',
 507         pubdate     => 'date-of-publication',
 508     );
 509     my %sort_order_convert =
 510       ( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
 511
 512     # Convert the fields and orders, drop anything we don't know about.
 513     grep { $_->{field} } map {
 514         my ( $f, $d ) = /(.+)_(.+)/;
 515         {
 516             field     => $sort_field_convert{$f},
 517             direction => $sort_order_convert{$d}
 518         }
 519     } @sort_by;
 520 }
 521
 522 =head2 _convert_index_fields
 523
 524     my @index_params = $self->_convert_index_fields(@indexes);
 525
 526 Converts zebra-style search index notation into elasticsearch-style.
 527
 528 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
 529 and it returns something that can be sent to L<build_query>.
 530
 531 B<TODO>: this will pull from the elasticsearch mappings table to figure out
 532 types.
 533
 534 =cut
 535
 536 our %index_field_convert = (
 537     'kw' => '_all',
 538     'ab' => 'abstract',
 539     'au' => 'author',
 540     'lcn' => 'local-classification',
 541     'callnum' => 'local-classification',
 542     'record-type' => 'rtype',
 543     'mc-rtype' => 'rtype',
 544     'mus' => 'rtype',
 545     'lc-card' => 'lc-card-number',
 546     'sn' => 'local-number',
 547     'yr' => 'date-of-publication',
 548     'pubdate' => 'date-of-publication',
 549     'acqdate' => 'date-of-acquisition',
 550     'date/time-last-modified' => 'date-time-last-modified',
 551     'dtlm' => 'date/time-last-modified',
 552     'diss' => 'dissertation-information',
 553     'nb' => 'isbn',
 554     'ns' => 'issn',
 555     'music-number' => 'identifier-publisher-for-music',
 556     'number-music-publisher' => 'identifier-publisher-for-music',
 557     'music' => 'identifier-publisher-for-music',
 558     'ident' => 'identifier-standard',
 559     'cpn' => 'corporate-name',
 560     'cfn' => 'conference-name',
 561     'pn' => 'personal-name',
 562     'pb' => 'publisher',
 563     'pv' => 'provider',
 564     'nt' => 'note',
 565     'notes' => 'note',
 566     'rcn' => 'record-control-number',
 567     'su' => 'subject',
 568     'su-to' => 'subject',
 569     #'su-geo' => 'subject',
 570     'su-ut' => 'subject',
 571     'ti' => 'title',
 572     'se' => 'title-series',
 573     'ut' => 'title-uniform',
 574     'an' => 'koha-auth-number',
 575     'authority-number' => 'koha-auth-number',
 576     'at' => 'authtype',
 577     'he' => 'heading',
 578     'rank' => 'relevance',
 579     'phr' => 'st-phrase',
 580     'wrdl' => 'st-word-list',
 581     'rt' => 'right-truncation',
 582     'rtrn' => 'right-truncation',
 583     'ltrn' => 'left-truncation',
 584     'rltrn' => 'left-and-right',
 585     'mc-itemtype' => 'itemtype',
 586     'mc-ccode' => 'ccode',
 587     'branch' => 'homebranch',
 588     'mc-loc' => 'location',
 589     'stocknumber' => 'number-local-acquisition',
 590     'inv' => 'number-local-acquisition',
 591     'bc' => 'barcode',
 592     'mc-itype' => 'itype',
 593     'aub' => 'author-personal-bibliography',
 594     'auo' => 'author-in-order',
 595     'ff8-22' => 'ta',
 596     'aud' => 'ta',
 597     'audience' => 'ta',
 598     'frequency-code' => 'ff8-18',
 599     'illustration-code' => 'ff8-18-21',
 600     'regularity-code' => 'ff8-19',
 601     'type-of-serial' => 'ff8-21',
 602     'format' => 'ff8-23',
 603     'conference-code' => 'ff8-29',
 604     'festschrift-indicator' => 'ff8-30',
 605     'index-indicator' => 'ff8-31',
 606     'fiction' => 'lf',
 607     'fic' => 'lf',
 608     'literature-code' => 'lf',
 609     'biography' => 'bio',
 610     'ff8-34' => 'bio',
 611     'biography-code' => 'bio',
 612     'l-format' => 'ff7-01-02',
 613     'lex' => 'lexile-number',
 614     'hi' => 'host-item-number',
 615     'itu' => 'index-term-uncontrolled',
 616     'itg' => 'index-term-genre',
 617 );
 618 my $field_name_pattern = '[\w\-]+';
 619 my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
 620
 621 sub _convert_index_fields {
 622     my ( $self, @indexes ) = @_;
 623
 624     my %index_type_convert =
 625       ( __default => undef, phr => 'phrase', rtrn => 'right-truncate' );
 626
 627     # Convert according to our table, drop anything that doesn't convert.
 628     # If a field starts with mc- we save it as it's used (and removed) later
 629     # when joining things, to indicate we make it an 'OR' join.
 630     # (Sorry, this got a bit ugly after special cases were found.)
 631     grep { $_->{field} } map {
 632         # Lower case all field names
 633         my ( $f, $t ) = map(lc, split /,/);
 634         my $mc = '';
 635         if ($f =~ /^mc-/) {
 636             $mc = 'mc-';
 637             $f =~ s/^mc-//;
 638         }
 639         my $r = {
 640             field => exists $index_field_convert{$f} ? $index_field_convert{$f} : $f,
 641             type  => $index_type_convert{ $t // '__default' }
 642         };
 643         $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
 644         $r;
 645     } @indexes;
 646 }
 647
 648 =head2 _convert_index_strings
 649
 650     my @searches = $self->_convert_index_strings(@searches);
 651
 652 Similar to L<_convert_index_fields>, this takes strings of the form
 653 B<field:search term> and rewrites the field from zebra-style to
 654 elasticsearch-style. Anything it doesn't understand is returned verbatim.
 655
 656 =cut
 657
 658 sub _convert_index_strings {
 659     my ( $self, @searches ) = @_;
 660     my @res;
 661     foreach my $s (@searches) {
 662         next if $s eq '';
 663         my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
 664         unless ( defined($field) && defined($term) ) {
 665             push @res, $s;
 666             next;
 667         }
 668         my ($conv) = $self->_convert_index_fields($field);
 669         unless ( defined($conv) ) {
 670             push @res, $s;
 671             next;
 672         }
 673         push @res, $conv->{field} . ":"
 674           . $self->_modify_string_by_type( %$conv, operand => $term );
 675     }
 676     return @res;
 677 }
 678
 679 =head2 _convert_index_strings_freeform
 680
 681     my $search = $self->_convert_index_strings_freeform($search);
 682
 683 This is similar to L<_convert_index_strings>, however it'll search out the
 684 things to change within the string. So it can handle strings such as
 685 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
 686
 687 If there is something of the form "su,complete-subfield" or something, the
 688 second part is stripped off as we can't yet handle that. Making it work
 689 will have to wait for a real query parser.
 690
 691 =cut
 692
 693 sub _convert_index_strings_freeform {
 694     my ( $self, $search ) = @_;
 695     # @TODO: Currenty will alter also fields contained within quotes:
 696     # `searching for "stuff cn:123"` for example will become
 697     # `searching for "stuff local-number:123"
 698     #
 699     # Fixing this is tricky, one possibility:
 700     # https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
 701     # Still not perfect, and will not handle escaped quotes within quotes and assumes balanced quotes.
 702     #
 703     # Another, not so elegant, solution could be to replace all quoted content with placeholders, and put
 704     # them back when processing is done.
 705
 706     # Lower case field names
 707     $search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;
 708     # Resolve possible field aliases
 709     $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/oge;
 710     return $search;
 711 }
 712
 713 =head2 _modify_string_by_type
 714
 715     my $str = $self->_modify_string_by_type(%index_field);
 716
 717 If you have a search term (operand) and a type (phrase, right-truncated), this
 718 will convert the string to have the function in lucene search terms, e.g.
 719 wrapping quotes around it.
 720
 721 =cut
 722
 723 sub _modify_string_by_type {
 724     my ( $self, %idx ) = @_;
 725
 726     my $type = $idx{type} || '';
 727     my $str = $idx{operand};
 728     return $str unless $str;    # Empty or undef, we can't use it.
 729
 730     $str .= '*' if $type eq 'right-truncate';
 731     $str = '"' . $str . '"' if $type eq 'phrase';
 732     return $str;
 733 }
 734
 735 =head2 _join_queries
 736
 737     my $query_str = $self->_join_queries(@query_parts);
 738
 739 This takes a list of query parts, that might be search terms on their own, or
 740 booleaned together, or specifying fields, or whatever, wraps them in
 741 parentheses, and ANDs them all together. Suitable for feeding to the ES
 742 query string query.
 743
 744 Note: doesn't AND them together if they specify an index that starts with "mc"
 745 as that was a special case in the original code for dealing with multiple
  746 choice options (you can't search for something that has an itype of A and
  747 an itype of B otherwise.)
 748
 749 =cut
 750
 751 sub _join_queries {
 752     my ( $self, @parts ) = @_;
 753
 754     my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
 755     my @mc_parts =
 756       map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
 757     return () unless @norm_parts + @mc_parts;
 758     return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
 759     my $grouped_mc =
 760       @mc_parts ? '(' . ( join ' OR ', map { "($_)" } @mc_parts ) . ')' : ();
 761
 762     # Handy trick: $x || () inside a join means that if $x ends up as an
 763     # empty string, it gets replaced with (), which makes join ignore it.
 764     # (bad effect: this'll also happen to '0', this hopefully doesn't matter
 765     # in this case.)
 766     join( ' AND ',
 767         join( ' AND ', map { "($_)" } @norm_parts ) || (),
 768         $grouped_mc || () );
 769 }
 770
 771 =head2 _make_phrases
 772
 773     my @phrased_queries = $self->_make_phrases(@query_parts);
 774
 775 This takes the supplied queries and forces them to be phrases by wrapping
 776 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
 777 the quotes outside of them if they're there.
 778
 779 =cut
 780
 781 sub _make_phrases {
 782     my ( $self, @parts ) = @_;
 783     map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
 784 }
 785
 786 =head2 _create_query_string
 787
 788     my @query_strings = $self->_create_query_string(@queries);
 789
 790 Given a list of hashrefs, it will turn them into a lucene-style query string.
 791 The hash should contain field, type (both for the indexes), operator, and
 792 operand.
 793
 794 =cut
 795
 796 sub _create_query_string {
 797     my ( $self, @queries ) = @_;
 798
 799     map {
 800         my $otor  = $_->{operator} ? $_->{operator} . ' ' : '';
 801         my $field = $_->{field}    ? $_->{field} . ':'    : '';
 802
 803         my $oand = $self->_modify_string_by_type(%$_);
 804         $oand = "($oand)" if $field && scalar(split(/\s+/, $oand)) > 1;
 805         "$otor($field$oand)";
 806     } @queries;
 807 }
 808
 809 =head2 _clean_search_term
 810
 811     my $term = $self->_clean_search_term($term);
 812
 813 This cleans a search term by removing any funny characters that may upset
 814 ES and give us an error. It also calls L<_convert_index_strings_freeform>
 815 to ensure those parts are correct.
 816
 817 =cut
 818
 819 sub _clean_search_term {
 820     my ( $self, $term ) = @_;
 821
 822     # Lookahead for checking if we are inside quotes
 823     my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';
 824
 825     # Some hardcoded searches (like with authorities) produce things like
 826     # 'an=123', when it ought to be 'an:123' for our purposes.
 827     $term =~ s/=/:/g;
 828
 829     $term = $self->_convert_index_strings_freeform($term);
 830     $term =~ s/[{}]/"/g;
 831
 832     # Remove unbalanced quotes
 833     my $unquoted = $term;
 834     my $count = ($unquoted =~ tr/"/ /);
 835     if ($count % 2 == 1) {
 836         $term = $unquoted;
 837     }
 838
 839     # Remove unquoted colons that have whitespace on either side of them
 840     $term =~ s/(\:[:\s]+|[:\s]+:)$lookahead//g;
 841
 842     return $term;
 843 }
 844
 845 =head2 _fix_limit_special_cases
 846
 847     my $limits = $self->_fix_limit_special_cases($limits);
 848
 849 This converts any special cases that the limit specifications have into things
 850 that are more readily processable by the rest of the code.
 851
 852 The argument should be an arrayref, and it'll return an arrayref.
 853
 854 =cut
 855
 856 sub _fix_limit_special_cases {
 857     my ( $self, $limits ) = @_;
 858
 859     my @new_lim;
 860     foreach my $l (@$limits) {
 861
 862         # This is set up by opac-search.pl
 863         if ( $l =~ /^yr,st-numeric,ge=/ ) {
 864             my ( $start, $end ) =
 865               ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
 866             next unless defined($start) && defined($end);
 867             push @new_lim, "copydate:[$start TO $end]";
 868         }
 869         elsif ( $l =~ /^yr,st-numeric=/ ) {
 870             my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
 871             next unless defined($date);
 872             push @new_lim, "copydate:$date";
 873         }
 874         elsif ( $l =~ /^available$/ ) {
 875             push @new_lim, 'onloan:0';
 876         }
 877         else {
 878             push @new_lim, $l;
 879         }
 880     }
 881     return \@new_lim;
 882 }
 883
 884 =head2 _sort_field
 885
 886     my $field = $self->_sort_field($field);
 887
 888 Given a field name, this works out what the actual name of the field to sort
 889 on should be. A '__sort' suffix is added for fields with a sort version, and
 890 for text fields either '.phrase' (for sortable versions) or '.raw' is appended
 891 to avoid sorting on a tokenized value.
 892
 893 =cut
 894
 895 sub _sort_field {
 896     my ($self, $f) = @_;
 897
 898     my $mappings = $self->get_elasticsearch_mappings();
 899     my $textField = defined $mappings->{data}{properties}{$f}{type} && $mappings->{data}{properties}{$f}{type} eq 'text';
 900     if (!defined $self->sort_fields()->{$f} || $self->sort_fields()->{$f}) {
 901         $f .= '__sort';
 902         # We need to add '.phrase' to text fields, otherwise it'll sort
 903         # based on the tokenised form.
 904         $f .= '.phrase' if $textField;
 905     } else {
 906         # We need to add '.raw' to text fields without a sort field,
 907         # otherwise it'll sort based on the tokenised form.
 908         $f .= '.raw' if $textField;
 909     }
 910     return $f;
 911 }
 912
 913 =head2 _truncate_terms
 914
 915     my $query = $self->_truncate_terms($query);
 916
 917 Given a string query this function appends '*' wildcard  to all terms except
 918 operands and double quoted strings.
 919
 920 =cut
 921
 922 sub _truncate_terms {
 923     my ( $self, $query ) = @_;
 924
 925     my @tokens = $self->_split_query( $query );
 926
 927     # Filter out empty tokens
 928     my @words = grep { $_ !~ /^\s*$/ } @tokens;
 929
 930     # Append '*' to words if needed, ie. if it ends in a word character and is not a keyword
 931     my @terms = map {
 932         my $w = $_;
 933         (/\W$/ or grep {lc($w) eq $_} qw/and or not/) ? $_ : "$_*";
 934     } @words;
 935
 936     return join ' ', @terms;
 937 }
 938
 939 =head2 _split_query
 940
 941     my @token = $self->_split_query($query_str);
 942
 943 Given a string query this function splits it to tokens taking into account
 944 any field prefixes and quoted strings.
 945
 946 =cut
 947
 948 my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;
 949
 950 sub _split_query {
 951     my ( $self, $query ) = @_;
 952
 953     # '"donald duck" title:"the mouse" and peter" get split into
 954     # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'pete']
 955     my @tokens = split $tokenize_split_re, $query;
 956
 957     # Filter out empty values
 958     @tokens = grep( /\S/, @tokens );
 959
 960     return @tokens;
 961 }
 962
 963 1;