Bug 19365: Fix several issues with the Elasticsearch code

author Ere Maijala <ere.maijala@helsinki.fi>

Fri, 14 Sep 2018 03:40:24 +0000 (06:40 +0300)

committer Nick Clemens <nick@bywatersolutions.com>

Mon, 26 Nov 2018 16:11:01 +0000 (16:11 +0000)
author Ere Maijala <ere.maijala@helsinki.fi>
Fri, 14 Sep 2018 03:40:24 +0000 (06:40 +0300)
committer Nick Clemens <nick@bywatersolutions.com>
Mon, 26 Nov 2018 16:11:01 +0000 (16:11 +0000)
diff --git a/C4/Heading.pm b/C4/Heading.pm

index e323b5b..193be16 100644 (file)
--- a/C4/Heading.pm
+++ b/C4/Heading.pm
@@ -200,12 +200,24 @@ sub _search {
     #        push @operator, 'is';
     #        push @value, $self->{'thesaurus'};
     #    }
-    require C4::AuthoritiesMarc;
-    return C4::AuthoritiesMarc::SearchAuthorities(
+
+    require Koha::SearchEngine::QueryBuilder;
+    require Koha::SearchEngine::Search;
+
+    # Use state variables to avoid recreating the objects every time.
+    # With Elasticsearch this also avoids creating a massive amount of
+    # ES connectors that would eventually run out of file descriptors.
+    state $builder = Koha::SearchEngine::QueryBuilder->new(
+        { index => $Koha::SearchEngine::AUTHORITIES_INDEX } );
+    state $searcher = Koha::SearchEngine::Search->new(
+        {index => $Koha::SearchEngine::AUTHORITIES_INDEX} );
+
+    my $search_query = $builder->build_authorities_query_compat(
         \@marclist, \@and_or, \@excluding, \@operator,
-        \@value,    0,        20,          $self->{'auth_type'},
-        'AuthidAsc',         $skipmetadata
+        \@value,    $self->{'auth_type'},
+        'AuthidAsc'
     );
+    return $searcher->search_auth_compat( $search_query, 0, 20, $skipmetadata );
 }
 
 =head1 INTERNAL FUNCTIONS
diff --git a/C4/Matcher.pm b/C4/Matcher.pm

index d19dd64..dc84804 100644 (file)
--- a/C4/Matcher.pm
+++ b/C4/Matcher.pm
@@ -664,7 +664,10 @@ sub get_matches {
                     #NOTE: double-quote the values so you don't get a "Embedded truncation not supported" error when a term has a ? in it.
             }
 
-            my $searcher = Koha::SearchEngine::Search->new({index => $Koha::SearchEngine::BIBLIOS_INDEX});
+            # Use state variables to avoid recreating the objects every time.
+            # With Elasticsearch this also avoids creating a massive amount of
+            # ES connectors that would eventually run out of file descriptors.
+            state $searcher = Koha::SearchEngine::Search->new({index => $Koha::SearchEngine::BIBLIOS_INDEX});
             ( $error, $searchresults, $total_hits ) =
               $searcher->simple_search_compat( $query, 0, $max_matches, undef, skip_normalize => 1 );
 
@@ -703,8 +706,11 @@ sub get_matches {
                 push @operator, 'exact';
                 push @value,    $key;
             }
-            my $builder  = Koha::SearchEngine::QueryBuilder->new({index => $Koha::SearchEngine::AUTHORITIES_INDEX});
-            my $searcher = Koha::SearchEngine::Search->new({index => $Koha::SearchEngine::AUTHORITIES_INDEX});
+            # Use state variables to avoid recreating the objects every time.
+            # With Elasticsearch this also avoids creating a massive amount of
+            # ES connectors that would eventually run out of file descriptors.
+            state $builder  = Koha::SearchEngine::QueryBuilder->new({index => $Koha::SearchEngine::AUTHORITIES_INDEX});
+            state $searcher = Koha::SearchEngine::Search->new({index => $Koha::SearchEngine::AUTHORITIES_INDEX});
             my $search_query = $builder->build_authorities_query_compat(
                 \@marclist, \@and_or, \@excluding, \@operator,
                 \@value, undef, 'AuthidAsc'
diff --git a/Koha/SearchEngine/Elasticsearch/Indexer.pm b/Koha/SearchEngine/Elasticsearch/Indexer.pm

index 4774d78..703f39f 100644 (file)
--- a/Koha/SearchEngine/Elasticsearch/Indexer.pm
+++ b/Koha/SearchEngine/Elasticsearch/Indexer.pm
@@ -398,7 +398,8 @@ sub _sanitise_records {
         # tears in rain...
         if ( $rec ) {
             $rec->delete_fields($rec->field('999'));
-            $rec->append_fields(MARC::Field->new('999','','','c' => $bibnum, 'd' => $bibnum));
+            # Make sure biblionumber is a string. Elasticsearch would consider int and string different IDs.
+            $rec->append_fields(MARC::Field->new('999','','','c' => "" . $bibnum, 'd' => "" . $bibnum));
         }
     }
 }
diff --git a/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm b/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm

index 57836e9..254fba2 100644 (file)
--- a/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm
+++ b/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm
@@ -103,10 +103,8 @@ sub build_query {
               if $d && ( $d ne 'asc' && $d ne 'desc' );
             $d = 'asc' unless $d;
 
-            # TODO account for fields that don't have a 'phrase' type
-
             $f = $self->_sort_field($f);
-            push @{ $res->{sort} }, { "$f.phrase" => { order => $d } };
+            push @{ $res->{sort} }, { $f => { order => $d } };
         }
     }
 
@@ -172,7 +170,7 @@ sub build_browse_query {
                 }
             }
         },
-        sort => [ { "$sort.phrase" => { order => "asc" } } ],
+        sort => [ { $sort => { order => "asc" } } ],
     };
 }
 
@@ -301,22 +299,18 @@ sub build_authorities_query {
     foreach my $s ( @{ $search->{searches} } ) {
         my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
         $wh = '_all' if $wh eq '';
-        if ( $op eq 'is' || $op eq '=' ) {
+        if ( $op eq 'is' || $op eq '='  || $op eq 'exact' ) {
 
             # look for something that matches a term completely
             # note, '=' is about numerical vals. May need special handling.
             # Also, we lowercase our search because the ES
             # index lowercases its values, and term searches don't get the
             # search analyzer applied to them.
-            push @query_parts, { term => {"$wh.phrase" => lc $val} };
-        }
-        elsif ( $op eq 'exact' ) {
-            # left and right truncation, otherwise an exact phrase
             push @query_parts, { match_phrase => {"$wh.phrase" => lc $val} };
         }
         elsif ( $op eq 'start' ) {
             # startswith search, uses lowercase untokenized version of heading
-            push @query_parts, { prefix => {"$wh.lc_raw" => lc $val} };
+            push @query_parts, { match_phrase_prefix => {"$wh.phrase" => lc $val} };
         }
         else {
             # regular wordlist stuff
@@ -332,19 +326,17 @@ sub build_authorities_query {
     # 'should' behaves like 'or'
     # 'must' behaves like 'and'
     # Zebra results seem to match must so using that here
-    my $query = { query=>
+    my $query = { query =>
                  { bool =>
                      { must => \@query_parts  }
                  }
              };
 
-    # We need to add '.phrase' to all the sort headings otherwise it'll sort
-    # based on the tokenised form.
     my %s;
     if ( exists $search->{sort} ) {
         foreach my $k ( keys %{ $search->{sort} } ) {
             my $f = $self->_sort_field($k);
-            $s{"$f.phrase"} = $search->{sort}{$k};
+            $s{$f} = $search->{sort}{$k};
         }
         $search->{sort} = \%s;
     }
@@ -385,9 +377,9 @@ Also ignored.
 
 =item operator
 
-What form of search to do. Options are: is (phrase, no trunction, whole field
-must match), = (number exact match), exact (phrase, but with left and right
-truncation). If left blank, then word list, right truncted, anywhere is used.
+What form of search to do. Options are: is (phrase, no truncation, whole field
+must match), = (number exact match), exact (phrase, no truncation, whole field
+must match). If left blank, then word list, right truncated, anywhere is used.
 
 =item value
 
@@ -419,7 +411,8 @@ our $koha_to_index_name = {
     'match-heading' => 'Match-heading',
     'see-from'      => 'Match-heading-see-from',
     thesaurus       => 'Subject-heading-thesaurus',
-    all              => ''
+    any             => '',
+    all             => ''
 };
 
 sub build_authorities_query_compat {
@@ -448,8 +441,8 @@ sub build_authorities_query_compat {
 
     my %sort;
     my $sort_field =
-        ( $orderby =~ /^Heading/ ) ? 'Heading__sort'
-      : ( $orderby =~ /^Auth/ )    ? 'Local-Number'
+        ( $orderby =~ /^Heading/ ) ? 'Heading'
+      : ( $orderby =~ /^Auth/ )    ? 'Local-number'
       :                              undef;
     if ($sort_field) {
         my $sort_order = ( $orderby =~ /Asc$/ ) ? 'asc' : 'desc';
@@ -773,16 +766,27 @@ sub _fix_limit_special_cases {
 
     my $field = $self->_sort_field($field);
 
-Given a field name, this works out what the actual name of the version to sort
-on should be. Often it's the same, sometimes it involves sticking "__sort" on
-the end. Maybe it'll be something else in the future, who knows?
+Given a field name, this works out what the actual name of the field to sort
+on should be. A '__sort' suffix is added for fields with a sort version, and
+for text fields either '.phrase' (for sortable versions) or '.raw' is appended
+to avoid sorting on a tokenized value.
 
 =cut
 
 sub _sort_field {
     my ($self, $f) = @_;
-    if ($self->sort_fields()->{$f}) {
+
+    my $mappings = $self->get_elasticsearch_mappings();
+    my $textField = defined $mappings->{data}{properties}{$f}{type} && $mappings->{data}{properties}{$f}{type} eq 'text';
+    if (!defined $self->sort_fields()->{$f} || $self->sort_fields()->{$f}) {
         $f .= '__sort';
+        # We need to add '.phrase' to text fields, otherwise it'll sort
+        # based on the tokenised form.
+        $f .= '.phrase' if $textField;
+    } else {
+        # We need to add '.raw' to text fields without a sort field,
+        # otherwise it'll sort based on the tokenised form.
+        $f .= '.raw' if $textField;
     }
     return $f;
 }
diff --git a/Koha/SearchEngine/Elasticsearch/Search.pm b/Koha/SearchEngine/Elasticsearch/Search.pm

index 6917f19..b7f139f 100644 (file)
--- a/Koha/SearchEngine/Elasticsearch/Search.pm
+++ b/Koha/SearchEngine/Elasticsearch/Search.pm
@@ -84,23 +84,21 @@ sub search {
     my ($self, $query, $page, $count, %options) = @_;
 
     my $params = $self->get_elasticsearch_params();
-    my %paging;
     # 20 is the default number of results per page
-    $paging{limit} = $count || 20;
-    # ES/Catmandu doesn't want pages, it wants a record to start from.
+    $query->{size} = $count || 20;
+    # ES doesn't want pages, it wants a record to start from.
     if (exists $options{offset}) {
-        $paging{start} = $options{offset};
+        $query->{from} = $options{offset};
     } else {
         $page = (!defined($page) || ($page <= 0)) ? 0 : $page - 1;
-        $paging{start} = $page * $paging{limit};
+        $query->{from} = $page * $query->{size};
     }
-    $self->store(
-        Catmandu::Store::ElasticSearch->new(
-            %$params,
-        )
-    ) unless $self->store;
+    my $elasticsearch = $self->get_elasticsearch();
     my $results = eval {
-        $self->store->bag->search( %$query, %paging );
+        $elasticsearch->search(
+            index => $params->{index_name},
+            body => $query
+        );
     };
     if ($@) {
         die $self->process_error($@);
@@ -163,13 +161,15 @@ sub search_compat {
     # opac-search expects results to be put in the
     # right place in the array, according to $offset
     my $index = $offset;
-    $results->each(sub {
-        $records[$index++] = $self->decode_record_from_result(@_);
-    });
+    my $hits = $results->{'hits'};
+    foreach my $es_record (@{$hits->{'hits'}}) {
+        $records[$index++] = $self->decode_record_from_result($es_record->{'_source'});
+    }
+
     # consumers of this expect a name-spaced result, we provide the default
     # configuration.
     my %result;
-    $result{biblioserver}{hits} = $results->total;
+    $result{biblioserver}{hits} = $hits->{'total'};
     $result{biblioserver}{RECORDS} = \@records;
     return (undef, \%result, $self->_convert_facets($results->{aggregations}, $expanded_facet));
 }
@@ -177,7 +177,7 @@ sub search_compat {
 =head2 search_auth_compat
 
     my ( $results, $total ) =
-      $searcher->search_auth_compat( $query, $page, $count, %options );
+      $searcher->search_auth_compat( $query, $offset, $count, $skipmetadata, %options );
 
 This has a similar calling convention to L<search>, however it returns its
 results in a form the same as L<C4::AuthoritiesMarc::SearchAuthorities>.
@@ -185,32 +185,39 @@ results in a form the same as L<C4::AuthoritiesMarc::SearchAuthorities>.
 =cut
 
 sub search_auth_compat {
-    my $self = shift;
+    my ($self, $query, $offset, $count, $skipmetadata, %options) = @_;
 
-    # TODO handle paging
+    if ( !defined $offset or $offset <= 0 ) {
+        $offset = 1;
+    }
+    # Uh, authority search uses 1-based offset..
+    $options{offset} = $offset - 1;
     my $database = Koha::Database->new();
     my $schema   = $database->schema();
-    my $res      = $self->search(@_);
+    my $res      = $self->search($query, undef, $count, %options);
+
     my $bib_searcher = Koha::SearchEngine::Elasticsearch::Search->new({index => 'biblios'});
     my @records;
-    $res->each(
-        sub {
-            my %result;
+    my $hits = $res->{'hits'};
+    foreach my $es_record (@{$hits->{'hits'}}) {
+        my $record = $es_record->{'_source'};
+        my %result;
 
-            # I wonder if these should be real values defined in the mapping
-            # rather than hard-coded conversions.
-            my $record    = $_[0];
-            # Handle legacy nested arrays indexed with splitting enabled.
-            my $authid = $record->{ 'Local-number' }[0];
-            $authid = @$authid[0] if (ref $authid eq 'ARRAY');
+        # I wonder if these should be real values defined in the mapping
+        # rather than hard-coded conversions.
+        #my $record    = $_[0];
+        # Handle legacy nested arrays indexed with splitting enabled.
+        my $authid = $record->{ 'Local-number' }[0];
+        $authid = @$authid[0] if (ref $authid eq 'ARRAY');
 
-            $result{authid} = $authid;
+        $result{authid} = $authid;
 
+        if (!defined $skipmetadata || !$skipmetadata) {
             # TODO put all this info into the record at index time so we
             # don't have to go and sort it all out now.
             my $authtypecode = $record->{authtype};
             my $rs           = $schema->resultset('AuthType')
-              ->search( { authtypecode => $authtypecode } );
+            ->search( { authtypecode => $authtypecode } );
 
             # FIXME there's an assumption here that we will get a result.
             # the original code also makes an assumption that some provided
@@ -219,7 +226,7 @@ sub search_auth_compat {
             # it's not reproduced here yet.
             my $authtype           = $rs->single;
             my $auth_tag_to_report = $authtype ? $authtype->auth_tag_to_report : "";
-            my $marc               = $self->decode_record_from_result(@_);
+            my $marc               = $self->decode_record_from_result($record);
             my $mainentry          = $marc->field($auth_tag_to_report);
             my $reported_tag;
             if ($mainentry) {
@@ -233,13 +240,13 @@ sub search_auth_compat {
 
             # Reimplementing BuildSummary is out of scope because it'll be hard
             $result{summary} =
-              C4::AuthoritiesMarc::BuildSummary( $marc, $result{authid},
+            C4::AuthoritiesMarc::BuildSummary( $marc, $result{authid},
                 $authtypecode );
             $result{used} = $self->count_auth_use($bib_searcher, $authid);
-            push @records, \%result;
         }
-    );
-    return ( \@records, $res->total );
+        push @records, \%result;
+    }
+    return ( \@records, $hits->{'total'} );
 }
 
 =head2 count_auth_use
diff --git a/Koha/SearchEngine/Zebra/Search.pm b/Koha/SearchEngine/Zebra/Search.pm

index 576b9f5..850469e 100644 (file)
--- a/Koha/SearchEngine/Zebra/Search.pm
+++ b/Koha/SearchEngine/Zebra/Search.pm
@@ -100,12 +100,12 @@ This passes the search query on to C4::AuthoritiesMarc::SearchAuthorities
 =cut
 
 sub search_auth_compat {
-    my ( $self, $q, $startfrom, $resperpage ) = @_;
+    my ( $self, $q, $startfrom, $resperpage, $skipmetadata ) = @_;
 
     my @params = (
         @{$q}{ 'marclist', 'and_or', 'excluding', 'operator', 'value' },
         $startfrom - 1,
-        $resperpage, @{$q}{ 'authtypecode', 'orderby' }
+        $resperpage, @{$q}{ 'authtypecode', 'orderby' }, $skipmetadata
     );
     C4::AuthoritiesMarc::SearchAuthorities(@params);
 }
diff --git a/misc/link_bibs_to_authorities.pl b/misc/link_bibs_to_authorities.pl

index b139a76..2711d16 100755 (executable)
--- a/misc/link_bibs_to_authorities.pl
+++ b/misc/link_bibs_to_authorities.pl
@@ -195,9 +195,10 @@ sub process_bib {
         return;
     }
 
+    my $frameworkcode = GetFrameworkCode($biblionumber);
+
     my ( $headings_changed, $results ) =
-      LinkBibHeadingsToAuthorities( $linker, $bib,
-        GetFrameworkCode($biblionumber) );
+      LinkBibHeadingsToAuthorities( $linker, $bib, $frameworkcode );
     foreach my $key ( keys %{ $results->{'unlinked'} } ) {
         $unlinked_headings{$key} += $results->{'unlinked'}->{$key};
     }
@@ -211,11 +212,15 @@ sub process_bib {
     if ($headings_changed) {
         if ($verbose) {
             my $title = substr( $bib->title, 0, 20 );
-            print
-"Bib $biblionumber ($title): $headings_changed headings changed\n";
+            printf(
+                "Bib %12d (%-20s): %3d headings changed\n",
+                $biblionumber,
+                $title,
+                $headings_changed
+            );
         }
         if ( not $test_only ) {
-            ModBiblio( $bib, $biblionumber, GetFrameworkCode($biblionumber) );
+            ModBiblio( $bib, $biblionumber, $frameworkcode );
             $num_bibs_modified++;
         }
     }
author	Ere Maijala <ere.maijala@helsinki.fi>
	Fri, 14 Sep 2018 03:40:24 +0000 (06:40 +0300)
committer	Nick Clemens <nick@bywatersolutions.com>
	Mon, 26 Nov 2018 16:11:01 +0000 (16:11 +0000)
C4/Heading.pm		patch \| blob \| history
C4/Matcher.pm		patch \| blob \| history
Koha/SearchEngine/Elasticsearch/Indexer.pm		patch \| blob \| history
Koha/SearchEngine/Elasticsearch/QueryBuilder.pm		patch \| blob \| history
Koha/SearchEngine/Elasticsearch/Search.pm		patch \| blob \| history
Koha/SearchEngine/Zebra/Search.pm		patch \| blob \| history
misc/link_bibs_to_authorities.pl		patch \| blob \| history