Also optimize it so it's actually usable.
Test plan:
1. To test it properly you need biblio and authority data. You might get away with enabling AutoCreateAuthorities and BiblioAddsAuthorities so that authorities are created in the process. Another option would be to import authorities first e.g. from LoC.
2. Make sure the authority index has been properly created with "misc/search_tools/rebuild_elastic_search.pl -a -d"
3. Run "misc/link_bibs_to_authorities.pl -v -l" twice and observe the results.
Sponsored-by: National Library of Finland
Signed-off-by: Brendan Gallagher <brendan@bywatersolutions.com>
Signed-off-by: Martin Renvoize <martin.renvoize@ptfs-europe.com>
Signed-off-by: Nick Clemens <nick@bywatersolutions.com>
# push @operator, 'is';
# push @value, $self->{'thesaurus'};
# }
- require C4::AuthoritiesMarc;
- return C4::AuthoritiesMarc::SearchAuthorities(
+
+ require Koha::SearchEngine::QueryBuilder;
+ require Koha::SearchEngine::Search;
+
+ # Use state variables to avoid recreating the objects every time.
+ # With Elasticsearch this also avoids creating a massive amount of
+ # ES connectors that would eventually run out of file descriptors.
+ state $builder = Koha::SearchEngine::QueryBuilder->new(
+ { index => $Koha::SearchEngine::AUTHORITIES_INDEX } );
+ state $searcher = Koha::SearchEngine::Search->new(
+ {index => $Koha::SearchEngine::AUTHORITIES_INDEX} );
+
+ my $search_query = $builder->build_authorities_query_compat(
\@marclist, \@and_or, \@excluding, \@operator,
- \@value, 0, 20, $self->{'auth_type'},
- 'AuthidAsc', $skipmetadata
+ \@value, $self->{'auth_type'},
+ 'AuthidAsc'
);
+ return $searcher->search_auth_compat( $search_query, 0, 20, $skipmetadata );
}
=head1 INTERNAL FUNCTIONS
#NOTE: double-quote the values so you don't get a "Embedded truncation not supported" error when a term has a ? in it.
}
- my $searcher = Koha::SearchEngine::Search->new({index => $Koha::SearchEngine::BIBLIOS_INDEX});
+ # Use state variables to avoid recreating the objects every time.
+ # With Elasticsearch this also avoids creating a massive amount of
+ # ES connectors that would eventually run out of file descriptors.
+ state $searcher = Koha::SearchEngine::Search->new({index => $Koha::SearchEngine::BIBLIOS_INDEX});
( $error, $searchresults, $total_hits ) =
$searcher->simple_search_compat( $query, 0, $max_matches, undef, skip_normalize => 1 );
push @operator, 'exact';
push @value, $key;
}
- my $builder = Koha::SearchEngine::QueryBuilder->new({index => $Koha::SearchEngine::AUTHORITIES_INDEX});
- my $searcher = Koha::SearchEngine::Search->new({index => $Koha::SearchEngine::AUTHORITIES_INDEX});
+ # Use state variables to avoid recreating the objects every time.
+ # With Elasticsearch this also avoids creating a massive amount of
+ # ES connectors that would eventually run out of file descriptors.
+ state $builder = Koha::SearchEngine::QueryBuilder->new({index => $Koha::SearchEngine::AUTHORITIES_INDEX});
+ state $searcher = Koha::SearchEngine::Search->new({index => $Koha::SearchEngine::AUTHORITIES_INDEX});
my $search_query = $builder->build_authorities_query_compat(
\@marclist, \@and_or, \@excluding, \@operator,
\@value, undef, 'AuthidAsc'
# tears in rain...
if ( $rec ) {
$rec->delete_fields($rec->field('999'));
- $rec->append_fields(MARC::Field->new('999','','','c' => $bibnum, 'd' => $bibnum));
+ # Make sure biblionumber is a string. Elasticsearch would consider int and string different IDs.
+ $rec->append_fields(MARC::Field->new('999','','','c' => "" . $bibnum, 'd' => "" . $bibnum));
}
}
}
if $d && ( $d ne 'asc' && $d ne 'desc' );
$d = 'asc' unless $d;
- # TODO account for fields that don't have a 'phrase' type
-
$f = $self->_sort_field($f);
- push @{ $res->{sort} }, { "$f.phrase" => { order => $d } };
+ push @{ $res->{sort} }, { $f => { order => $d } };
}
}
}
}
},
- sort => [ { "$sort.phrase" => { order => "asc" } } ],
+ sort => [ { $sort => { order => "asc" } } ],
};
}
foreach my $s ( @{ $search->{searches} } ) {
my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
$wh = '_all' if $wh eq '';
- if ( $op eq 'is' || $op eq '=' ) {
+ if ( $op eq 'is' || $op eq '=' || $op eq 'exact' ) {
# look for something that matches a term completely
# note, '=' is about numerical vals. May need special handling.
# Also, we lowercase our search because the ES
# index lowercases its values, and term searches don't get the
# search analyzer applied to them.
- push @query_parts, { term => {"$wh.phrase" => lc $val} };
- }
- elsif ( $op eq 'exact' ) {
- # left and right truncation, otherwise an exact phrase
push @query_parts, { match_phrase => {"$wh.phrase" => lc $val} };
}
elsif ( $op eq 'start' ) {
# startswith search, uses lowercase untokenized version of heading
- push @query_parts, { prefix => {"$wh.lc_raw" => lc $val} };
+ push @query_parts, { match_phrase_prefix => {"$wh.phrase" => lc $val} };
}
else {
# regular wordlist stuff
# 'should' behaves like 'or'
# 'must' behaves like 'and'
# Zebra results seem to match must so using that here
- my $query = { query=>
+ my $query = { query =>
{ bool =>
{ must => \@query_parts }
}
};
- # We need to add '.phrase' to all the sort headings otherwise it'll sort
- # based on the tokenised form.
my %s;
if ( exists $search->{sort} ) {
foreach my $k ( keys %{ $search->{sort} } ) {
my $f = $self->_sort_field($k);
- $s{"$f.phrase"} = $search->{sort}{$k};
+ $s{$f} = $search->{sort}{$k};
}
$search->{sort} = \%s;
}
=item operator
-What form of search to do. Options are: is (phrase, no trunction, whole field
-must match), = (number exact match), exact (phrase, but with left and right
-truncation). If left blank, then word list, right truncted, anywhere is used.
+What form of search to do. Options are: is (phrase, no truncation, whole field
+must match), = (number exact match), exact (phrase, no truncation, whole field
+must match). If left blank, then word list, right truncated, anywhere is used.
=item value
'match-heading' => 'Match-heading',
'see-from' => 'Match-heading-see-from',
thesaurus => 'Subject-heading-thesaurus',
- all => ''
+ any => '',
+ all => ''
};
sub build_authorities_query_compat {
my %sort;
my $sort_field =
- ( $orderby =~ /^Heading/ ) ? 'Heading__sort'
- : ( $orderby =~ /^Auth/ ) ? 'Local-Number'
+ ( $orderby =~ /^Heading/ ) ? 'Heading'
+ : ( $orderby =~ /^Auth/ ) ? 'Local-number'
: undef;
if ($sort_field) {
my $sort_order = ( $orderby =~ /Asc$/ ) ? 'asc' : 'desc';
my $field = $self->_sort_field($field);
-Given a field name, this works out what the actual name of the version to sort
-on should be. Often it's the same, sometimes it involves sticking "__sort" on
-the end. Maybe it'll be something else in the future, who knows?
+Given a field name, this works out what the actual name of the field to sort
+on should be. A '__sort' suffix is added for fields with a sort version, and
+for text fields either '.phrase' (for sortable versions) or '.raw' is appended
+to avoid sorting on a tokenized value.
=cut
sub _sort_field {
my ($self, $f) = @_;
- if ($self->sort_fields()->{$f}) {
+
+ my $mappings = $self->get_elasticsearch_mappings();
+ my $textField = defined $mappings->{data}{properties}{$f}{type} && $mappings->{data}{properties}{$f}{type} eq 'text';
+ if (!defined $self->sort_fields()->{$f} || $self->sort_fields()->{$f}) {
$f .= '__sort';
+ # We need to add '.phrase' to text fields, otherwise it'll sort
+ # based on the tokenised form.
+ $f .= '.phrase' if $textField;
+ } else {
+ # We need to add '.raw' to text fields without a sort field,
+ # otherwise it'll sort based on the tokenised form.
+ $f .= '.raw' if $textField;
}
return $f;
}
my ($self, $query, $page, $count, %options) = @_;
my $params = $self->get_elasticsearch_params();
- my %paging;
# 20 is the default number of results per page
- $paging{limit} = $count || 20;
- # ES/Catmandu doesn't want pages, it wants a record to start from.
+ $query->{size} = $count || 20;
+ # ES doesn't want pages, it wants a record to start from.
if (exists $options{offset}) {
- $paging{start} = $options{offset};
+ $query->{from} = $options{offset};
} else {
$page = (!defined($page) || ($page <= 0)) ? 0 : $page - 1;
- $paging{start} = $page * $paging{limit};
+ $query->{from} = $page * $query->{size};
}
- $self->store(
- Catmandu::Store::ElasticSearch->new(
- %$params,
- )
- ) unless $self->store;
+ my $elasticsearch = $self->get_elasticsearch();
my $results = eval {
- $self->store->bag->search( %$query, %paging );
+ $elasticsearch->search(
+ index => $params->{index_name},
+ body => $query
+ );
};
if ($@) {
die $self->process_error($@);
# opac-search expects results to be put in the
# right place in the array, according to $offset
my $index = $offset;
- $results->each(sub {
- $records[$index++] = $self->decode_record_from_result(@_);
- });
+ my $hits = $results->{'hits'};
+ foreach my $es_record (@{$hits->{'hits'}}) {
+ $records[$index++] = $self->decode_record_from_result($es_record->{'_source'});
+ }
+
# consumers of this expect a name-spaced result, we provide the default
# configuration.
my %result;
- $result{biblioserver}{hits} = $results->total;
+ $result{biblioserver}{hits} = $hits->{'total'};
$result{biblioserver}{RECORDS} = \@records;
return (undef, \%result, $self->_convert_facets($results->{aggregations}, $expanded_facet));
}
=head2 search_auth_compat
my ( $results, $total ) =
- $searcher->search_auth_compat( $query, $page, $count, %options );
+ $searcher->search_auth_compat( $query, $offset, $count, $skipmetadata, %options );
This has a similar calling convention to L<search>, however it returns its
results in a form the same as L<C4::AuthoritiesMarc::SearchAuthorities>.
=cut
sub search_auth_compat {
- my $self = shift;
+ my ($self, $query, $offset, $count, $skipmetadata, %options) = @_;
- # TODO handle paging
+ if ( !defined $offset or $offset <= 0 ) {
+ $offset = 1;
+ }
+ # Uh, authority search uses 1-based offset..
+ $options{offset} = $offset - 1;
my $database = Koha::Database->new();
my $schema = $database->schema();
- my $res = $self->search(@_);
+ my $res = $self->search($query, undef, $count, %options);
+
my $bib_searcher = Koha::SearchEngine::Elasticsearch::Search->new({index => 'biblios'});
my @records;
- $res->each(
- sub {
- my %result;
+ my $hits = $res->{'hits'};
+ foreach my $es_record (@{$hits->{'hits'}}) {
+ my $record = $es_record->{'_source'};
+ my %result;
- # I wonder if these should be real values defined in the mapping
- # rather than hard-coded conversions.
- my $record = $_[0];
- # Handle legacy nested arrays indexed with splitting enabled.
- my $authid = $record->{ 'Local-number' }[0];
- $authid = @$authid[0] if (ref $authid eq 'ARRAY');
+ # I wonder if these should be real values defined in the mapping
+ # rather than hard-coded conversions.
+ #my $record = $_[0];
+ # Handle legacy nested arrays indexed with splitting enabled.
+ my $authid = $record->{ 'Local-number' }[0];
+ $authid = @$authid[0] if (ref $authid eq 'ARRAY');
- $result{authid} = $authid;
+ $result{authid} = $authid;
+ if (!defined $skipmetadata || !$skipmetadata) {
# TODO put all this info into the record at index time so we
# don't have to go and sort it all out now.
my $authtypecode = $record->{authtype};
my $rs = $schema->resultset('AuthType')
- ->search( { authtypecode => $authtypecode } );
+ ->search( { authtypecode => $authtypecode } );
# FIXME there's an assumption here that we will get a result.
# the original code also makes an assumption that some provided
# it's not reproduced here yet.
my $authtype = $rs->single;
my $auth_tag_to_report = $authtype ? $authtype->auth_tag_to_report : "";
- my $marc = $self->decode_record_from_result(@_);
+ my $marc = $self->decode_record_from_result($record);
my $mainentry = $marc->field($auth_tag_to_report);
my $reported_tag;
if ($mainentry) {
# Reimplementing BuildSummary is out of scope because it'll be hard
$result{summary} =
- C4::AuthoritiesMarc::BuildSummary( $marc, $result{authid},
+ C4::AuthoritiesMarc::BuildSummary( $marc, $result{authid},
$authtypecode );
$result{used} = $self->count_auth_use($bib_searcher, $authid);
- push @records, \%result;
}
- );
- return ( \@records, $res->total );
+ push @records, \%result;
+ }
+ return ( \@records, $hits->{'total'} );
}
=head2 count_auth_use
=cut
sub search_auth_compat {
- my ( $self, $q, $startfrom, $resperpage ) = @_;
+ my ( $self, $q, $startfrom, $resperpage, $skipmetadata ) = @_;
my @params = (
@{$q}{ 'marclist', 'and_or', 'excluding', 'operator', 'value' },
$startfrom - 1,
- $resperpage, @{$q}{ 'authtypecode', 'orderby' }
+ $resperpage, @{$q}{ 'authtypecode', 'orderby' }, $skipmetadata
);
C4::AuthoritiesMarc::SearchAuthorities(@params);
}
return;
}
+ my $frameworkcode = GetFrameworkCode($biblionumber);
+
my ( $headings_changed, $results ) =
- LinkBibHeadingsToAuthorities( $linker, $bib,
- GetFrameworkCode($biblionumber) );
+ LinkBibHeadingsToAuthorities( $linker, $bib, $frameworkcode );
foreach my $key ( keys %{ $results->{'unlinked'} } ) {
$unlinked_headings{$key} += $results->{'unlinked'}->{$key};
}
if ($headings_changed) {
if ($verbose) {
my $title = substr( $bib->title, 0, 20 );
- print
-"Bib $biblionumber ($title): $headings_changed headings changed\n";
+ printf(
+ "Bib %12d (%-20s): %3d headings changed\n",
+ $biblionumber,
+ $title,
+ $headings_changed
+ );
}
if ( not $test_only ) {
- ModBiblio( $bib, $biblionumber, GetFrameworkCode($biblionumber) );
+ ModBiblio( $bib, $biblionumber, $frameworkcode );
$num_bibs_modified++;
}
}