return $self;
}
+=head2 get_elasticsearch
+
+ my $elasticsearch_client = $self->get_elasticsearch();
+
+Returns a C<Search::Elasticsearch> client. The client is cached on a C<Koha::SearchEngine::ElasticSearch>
+instance level and will be reused if method is called multiple times.
+
+=cut
+
sub get_elasticsearch {
my $self = shift @_;
unless (defined $self->{elasticsearch}) {
my $settings = $self->get_elasticsearch_settings();
-This provides the settings provided to elasticsearch when an index is created.
-These can do things like define tokenisation methods.
+This provides the settings provided to Elasticsearch when an index is created.
+These can do things like define tokenization methods.
A hashref containing the settings is returned.
my $mappings = $self->get_elasticsearch_mappings();
-This provides the mappings that get passed to elasticsearch when an index is
+This provides the mappings that get passed to Elasticsearch when an index is
created.
=cut
=head2 _get_elasticsearch_mapping
-Get the ES mappings for the given purpose and data type
+Get the Elasticsearch mappings for the given purpose and data type.
$mapping = _get_elasticsearch_mapping('search', 'text');
return $self->_sort_fields_accessor();
}
+=head2 _process_mappings($mappings, $data, $record_document)
+
+Process all C<$mappings> targets operating on a specific MARC field C<$data> applied to C<$record_document>
+Since we group all mappings by MARC field targets C<$mappings> will contain all targets for C<$data>
+and thus we need to fetch the MARC field only once.
+
+=over 4
+
+=item C<$mappings>
+
+Arrayref of mappings containing arrayrefs on the format [C<$taget>, C<$options>] where
+C<$target> is the name of the target field and C<$options> is a hashref containing processing
+directives for this particular mapping.
+
+=item C<$data>
+
+The source data from a MARC record field.
+
+=item C<$record_document>
+
+Hashref representing the Elasticsearch document on which mappings should be applied.
+
+=back
+
+=cut
+
+sub _process_mappings {
+ my ($_self, $mappings, $data, $record_document) = @_;
+ foreach my $mapping (@{$mappings}) {
+ my ($target, $options) = @{$mapping};
+ # Copy (scalar) data since can have multiple targets
+ # with differing options for (possibly) mutating data
+ # so need a different copy for each
+ my $_data = $data;
+ $record_document->{$target} //= [];
+ if (defined $options->{substr}) {
+ my ($start, $length) = @{$options->{substr}};
+ $_data = length($data) > $start ? substr $data, $start, $length : '';
+ }
+ if (defined $options->{value_callbacks}) {
+ $_data = reduce { $b->($a) } ($_data, @{$options->{value_callbacks}});
+ }
+ if (defined $options->{property}) {
+ $_data = {
+ $options->{property} => $_data
+ }
+ }
+ push @{$record_document->{$target}}, $_data;
+ }
+}
+
+=head2 marc_records_to_documents($marc_records)
+
+ my @record_documents = $self->marc_records_to_documents($marc_records);
+
+Using mappings stored in database convert C<$marc_records> to Elasticsearch documents.
+
+Returns array of hash references, representing Elasticsearch documents,
+acceptable as body payload in C<Search::Elasticsearch> requests.
+
+=over 4
+
+=item C<$marc_documents>
+
+Reference to array of C<MARC::Record> objects to be converted to Elasticsearch documents.
+
+=back
+
+=cut
+
sub marc_records_to_documents {
my ($self, $records) = @_;
- my $rules = $self->get_marc_mapping_rules();
+ my $rules = $self->_get_marc_mapping_rules();
my $control_fields_rules = $rules->{control_fields};
my $data_fields_rules = $rules->{data_fields};
my $marcflavour = lc C4::Context->preference('marcflavour');
- my $serialization_format = C4::Context->preference('ElasticsearchMARCSerializationFormat');
my @record_documents;
- sub _process_mappings {
- my ($mappings, $data, $record_document) = @_;
- foreach my $mapping (@{$mappings}) {
- my ($target, $options) = @{$mapping};
- # Copy (scalar) data since can have multiple targets
- # with differing options for (possibly) mutating data
- # so need a different copy for each
- my $_data = $data;
- $record_document->{$target} //= [];
- if (defined $options->{substr}) {
- my ($start, $length) = @{$options->{substr}};
- $_data = length($data) > $start ? substr $data, $start, $length : '';
- }
- if (defined $options->{value_callbacks}) {
- $_data = reduce { $b->($a) } ($_data, @{$options->{value_callbacks}});
- }
- if (defined $options->{property}) {
- $_data = {
- $options->{property} => $_data
- }
- }
- push @{$record_document->{$target}}, $_data;
- }
- }
foreach my $record (@{$records}) {
my $record_document = {};
my $mappings = $rules->{leader};
if ($mappings) {
- _process_mappings($mappings, $record->leader(), $record_document);
+ $self->_process_mappings($mappings, $record->leader(), $record_document);
}
foreach my $field ($record->fields()) {
if($field->is_control_field()) {
my $mappings = $control_fields_rules->{$field->tag()};
if ($mappings) {
- _process_mappings($mappings, $field->data(), $record_document);
+ $self->_process_mappings($mappings, $field->data(), $record_document);
}
}
else {
$mappings = [@{$mappings}, @{$wildcard_mappings}];
}
if (@{$mappings}) {
- _process_mappings($mappings, $data, $record_document);
+ $self->_process_mappings($mappings, $data, $record_document);
}
}
)
);
if ($data) {
- _process_mappings($subfields_join_mappings->{$subfields_group}, $data, $record_document);
+ $self->_process_mappings($subfields_join_mappings->{$subfields_group}, $data, $record_document);
}
}
}
return \@record_documents;
}
-# Provides the rules for marc to Elasticsearch JSON document conversion.
-sub get_marc_mapping_rules {
- my ($self) = @_;
+=head2 _field_mappings($facet, $suggestible, $sort, $target_name, $target_type, $range)
- my $marcflavour = lc C4::Context->preference('marcflavour');
- my @rules;
+Get mappings, an internal data structure later used by L<_process_mappings($mappings, $data, $record_document)>
+to process MARC target data, for a MARC mapping.
+
+The returned C<$mappings> is to to be confused with mappings provided by C<_foreach_mapping>, rather this
+sub accepts properties from a mapping as provided by C<_foreach_mapping> and expands it to this internal
+data stucture. In the caller context (C<_get_marc_mapping_rules>) the returned C<@mappings> is then
+applied to each MARC target (leader, control field data, subfield or joined subfields) and
+integrated into the mapping rules data structure used in C<marc_records_to_documents> to
+transform MARC records into Elasticsearch documents.
+
+=over 4
- sub _field_mappings {
- my ($facet, $suggestible, $sort, $target_name, $target_type, $range) = @_;
- my %mapping_defaults = ();
- my @mappings;
-
- my $substr_args = undef;
- if ($range) {
- # TODO: use value_callback instead?
- my ($start, $end) = map(int, split /-/, $range, 2);
- $substr_args = [$start];
- push @{$substr_args}, (defined $end ? $end - $start + 1 : 1);
+=item C<$facet>
+
+Boolean indicating whether to create a facet field for this mapping.
+
+=item C<$suggestible>
+
+Boolean indicating whether to create a suggestion field for this mapping.
+
+=item C<$sort>
+
+Boolean indicating whether to create a sort field for this mapping.
+
+=item C<$target_name>
+
+Elasticsearch document target field name.
+
+=item C<$target_type>
+
+Elasticsearch document target field type.
+
+=item C<$range>
+
+An optinal range as a string on the format "<START>-<END>" or "<START>",
+where "<START>" and "<END>" are integers specifying a range that will be used
+for extracting a substing from MARC data as Elasticsearch field target value.
+
+The first character position is "1", and the range is inclusive,
+so "1-3" means the first three characters of MARC data.
+
+If only "<START>" is provided only one character as position "<START>" will
+be extracted.
+
+=back
+
+=cut
+
+sub _field_mappings {
+ my ($_self, $facet, $suggestible, $sort, $target_name, $target_type, $range) = @_;
+ my %mapping_defaults = ();
+ my @mappings;
+
+ my $substr_args = undef;
+ if ($range) {
+ # TODO: use value_callback instead?
+ my ($start, $end) = map(int, split /-/, $range, 2);
+ $substr_args = [$start];
+ push @{$substr_args}, (defined $end ? $end - $start + 1 : 1);
+ }
+ my $default_options = {};
+ if ($substr_args) {
+ $default_options->{substr} = $substr_args;
+ }
+
+ # TODO: Should probably have per type value callback/hook
+ # but hard code for now
+ if ($target_type eq 'boolean') {
+ $default_options->{value_callbacks} //= [];
+ push @{$default_options->{value_callbacks}}, sub {
+ my ($value) = @_;
+ # Trim whitespace at both ends
+ $value =~ s/^\s+|\s+$//g;
+ return $value ? 'true' : 'false';
+ };
+ }
+
+ my $mapping = [$target_name, $default_options];
+ push @mappings, $mapping;
+
+ my @suffixes = ();
+ push @suffixes, 'facet' if $facet;
+ push @suffixes, 'suggestion' if $suggestible;
+ push @suffixes, 'sort' if !defined $sort || $sort;
+
+ foreach my $suffix (@suffixes) {
+ my $mapping = ["${target_name}__$suffix"];
+ # TODO: Hack, fix later in less hideous manner
+ if ($suffix eq 'suggestion') {
+ push @{$mapping}, {%{$default_options}, property => 'input'};
}
- my $default_options = {};
- if ($substr_args) {
- $default_options->{substr} = $substr_args;
+ else {
+ push @{$mapping}, $default_options;
}
+ push @mappings, $mapping;
+ }
+ return @mappings;
+};
- # TODO: Should probably have per type value callback/hook
- # but hard code for now
- if ($target_type eq 'boolean') {
- $default_options->{value_callbacks} //= [];
- push @{$default_options->{value_callbacks}}, sub {
- my ($value) = @_;
- # Trim whitespace at both ends
- $value =~ s/^\s+|\s+$//g;
- return $value ? 'true' : 'false';
- };
- }
+=head2 _get_marc_mapping_rules
- my $mapping = [$target_name, $default_options];
- push @mappings, $mapping;
+ my $mapping_rules = $self->_get_marc_mapping_rules()
- my @suffixes = ();
- push @suffixes, 'facet' if $facet;
- push @suffixes, 'suggestion' if $suggestible;
- push @suffixes, 'sort' if !defined $sort || $sort;
+Generates rules from mappings stored in database for MARC records to Elasticsearch JSON document conversion.
+
+Since field retrieval is slow in C<MARC::Records> (all fields are itereted through for
+each call to C<MARC::Record>->field) we create an optimized structure of mapping
+rules keyed by MARC field tags holding all the mapping rules for that particular tag.
+
+We can then iterate through all MARC fields for each record and apply all relevant
+rules once per fields instead of retreiving fields multiple times for each mapping rule
+wich is terribly slow.
+
+=cut
+
+# TODO: This structure can be used for processing multiple MARC::Records so is currently
+# rebuilt for each batch. Since it is cacheable it could also be stored in an in
+# memory cache which it is currently not. The performance gain of caching
+# would probably be marginal, but to do this could be a further improvement.
+
+sub _get_marc_mapping_rules {
+ my ($self) = @_;
+
+ my $marcflavour = lc C4::Context->preference('marcflavour');
+ my @rules;
- foreach my $suffix (@suffixes) {
- my $mapping = ["${target_name}__$suffix"];
- # Hack, fix later in less hideous manner
- if ($suffix eq 'suggestion') {
- push @{$mapping}, {%{$default_options}, property => 'input'};
- }
- else {
- push @{$mapping}, $default_options;
- }
- push @mappings, $mapping;
- }
- return @mappings;
- };
my $field_spec_regexp = qr/^([0-9]{3})([()0-9a-z]+)?(?:_\/(\d+(?:-\d+)?))?$/;
my $leader_regexp = qr/^leader(?:_\/(\d+(?:-\d+)?))?$/;
my $rules = {
};
$self->_foreach_mapping(sub {
- my ( $name, $type, $facet, $suggestible, $sort, $marc_type, $marc_field ) = @_;
+ my ($name, $type, $facet, $suggestible, $sort, $marc_type, $marc_field) = @_;
return if $marc_type ne $marcflavour;
if ($type eq 'sum') {
}
my $range = defined $3 ? $3 : undef;
- my @mappings = _field_mappings($facet, $suggestible, $sort, $name, $type, $range);
+ my @mappings = $self->_field_mappings($facet, $suggestible, $sort, $name, $type, $range);
if ($field_tag < 10) {
$rules->{control_fields}->{$field_tag} //= [];
}
elsif ($marc_field =~ $leader_regexp) {
my $range = defined $1 ? $1 : undef;
- my @mappings = _field_mappings($facet, $suggestible, $sort, $name, $type, $range);
+ my @mappings = $self->_field_mappings($facet, $suggestible, $sort, $name, $type, $range);
push @{$rules->{leader}}, @mappings;
}
else {
- die("Invalid marc field: $marc_field");
+ die("Invalid MARC field: $marc_field");
}
});
return $rules;
$indexer->drop_index();
$indexer->update_index(\@biblionumbers, \@records);
-=head1 FUNCTIONS
-=head2 $indexer->update_index($biblionums, $records);
+=head1 CONSTANTS
-C<$biblionums> is an arrayref containing the biblionumbers for the records.
+=over 4
-C<$records> is an arrayref containing the L<MARC::Record>s themselves.
+=item C<Koha::SearchEngine::Elasticsearch::Indexer::INDEX_STATUS_OK>
-The values in the arrays must match up, and the 999$c value in the MARC record
-will be rewritten using the values in C<$biblionums> to ensure they are correct.
-If C<$biblionums> is C<undef>, this won't happen, but you should be sure that
-999$c is correct on your own then.
+Represents an index state where index is created and in a working state.
-Note that this will modify the original record if C<$biblionums> is supplied.
-If that's a problem, clone them first.
+=item C<Koha::SearchEngine::Elasticsearch::Indexer::INDEX_STATUS_REINDEX_REQUIRED>
+
+Not currently used, but could be useful later, for example if can detect when new field or mapping added.
+
+=item C<Koha::SearchEngine::Elasticsearch::Indexer::INDEX_STATUS_RECREATE_REQUIRED>
+
+Representings an index state where index needs to be recreated and is not in a working state.
+
+=back
=cut
use constant {
INDEX_STATUS_OK => 0,
- INDEX_STATUS_REINDEX_REQUIRED => 1, # Not currently used, but could be useful later, for example if can detect when new field or mapping added
+ INDEX_STATUS_REINDEX_REQUIRED => 1,
INDEX_STATUS_RECREATE_REQUIRED => 2,
};
+=head1 FUNCTIONS
+
+=head2 update_index($biblionums, $records)
+
+ try {
+ $self->update_index($biblionums, $records);
+ } catch {
+ die("Something whent wrong trying to update index:" . $_[0]);
+ }
+
+Converts C<MARC::Records> C<$records> to Elasticsearch documents and performs
+an update request for these records on the Elasticsearch index.
+
+The values in the arrays must match up, and the 999$c value in the MARC record
+will be rewritten using the values in C<$biblionums> to ensure they are correct.
+If C<$biblionums> is C<undef>, this won't happen, so in that case you should make
+sure that 999$c is correct.
+
+Note that this will modify the original record if C<$biblionums> is supplied.
+If that's a problem, clone them first.
+
+=over 4
+
+=item C<$biblionums>
+
+Arrayref of biblio numbers for the C<$records>, the order must be the same as
+and match up with C<$records>.
+
+=item C<$records>
+
+Arrayref of C<MARC::Record>s.
+
+=back
+
+=cut
+
sub update_index {
my ($self, $biblionums, $records) = @_;
- # TODO should have a separate path for dealing with a large number
- # of records at once where we use the bulk update functions in ES.
if ($biblionums) {
$self->_sanitise_records($biblionums, $records);
}
- $self->bulk_index($records);
- return 1;
-}
-
-sub bulk_index {
- my ($self, $records) = @_;
my $conf = $self->get_elasticsearch_params();
my $elasticsearch = $self->get_elasticsearch();
my $documents = $self->marc_records_to_documents($records);
return 1;
}
-sub index_status_ok {
- my ($self, $set) = @_;
- return defined $set ?
- $self->index_status(INDEX_STATUS_OK) :
- $self->index_status == INDEX_STATUS_OK;
+=head2 set_index_status_ok
+
+Convenience method for setting index status to C<INDEX_STATUS_OK>.
+
+=cut
+
+sub set_index_status_ok {
+ my ($self) = @_;
+ $self->index_status(INDEX_STATUS_OK);
}
-sub index_status_reindex_required {
- my ($self, $set) = @_;
- return defined $set ?
- $self->index_status(INDEX_STATUS_REINDEX_REQUIRED) :
- $self->index_status == INDEX_STATUS_REINDEX_REQUIRED;
+=head2 is_index_status_ok
+
+Convenience method for checking if index status is C<INDEX_STATUS_OK>.
+
+=cut
+
+sub is_index_status_ok {
+ my ($self) = @_;
+ return $self->index_status == INDEX_STATUS_OK;
}
-sub index_status_recreate_required {
- my ($self, $set) = @_;
- return defined $set ?
- $self->index_status(INDEX_STATUS_RECREATE_REQUIRED) :
- $self->index_status == INDEX_STATUS_RECREATE_REQUIRED;
+=head2 set_index_status_reindex_required
+
+Convenience method for setting index status to C<INDEX_REINDEX_REQUIRED>.
+
+=cut
+
+sub set_index_status_reindex_required {
+ my ($self) = @_;
+ $self->index_status(INDEX_STATUS_REINDEX_REQUIRED);
}
+=head2 is_index_status_reindex_required
+
+Convenience method for checking if index status is C<INDEX_STATUS_REINDEX_REQUIRED>.
+
+=cut
+
+sub is_index_status_reindex_required {
+ my ($self) = @_;
+ return $self->index_status == INDEX_STATUS_REINDEX_REQUIRED;
+}
+
+=head2 set_index_status_recreate_required
+
+Convenience method for setting index status to C<INDEX_STATUS_RECREATE_REQUIRED>.
+
+=cut
+
+sub set_index_status_recreate_required {
+ my ($self) = @_;
+ $self->index_status(INDEX_STATUS_RECREATE_REQUIRED);
+}
+
+=head2 is_index_status_recreate_required
+
+Convenience method for checking if index status is C<INDEX_STATUS_RECREATE_REQUIRED>.
+
+=cut
+
+sub is_index_status_recreate_required {
+ my ($self) = @_;
+ return $self->index_status == INDEX_STATUS_RECREATE_REQUIRED;
+}
+
+=head2 index_status($status)
+
+Will either set the current index status to C<$status> and return C<$status>,
+or return the current index status if called with no arguments.
+
+=over 4
+
+=item C<$status>
+
+Optional argument. If passed will set current index status to C<$status> if C<$status> is
+a valid status. See L</CONSTANTS>.
+
+=back
+
+=cut
+
sub index_status {
my ($self, $status) = @_;
my $key = 'ElasticsearchIndexStatus_' . $self->index;
}
}
+=head2 update_mappings
+
+Generate Elasticsearch mappings from mappings stored in database and
+perform a request to update Elasticsearch index mappings. Will throw an
+error and set index status to C<INDEX_STATUS_RECREATE_REQUIRED> if update
+failes.
+
+=cut
+
sub update_mappings {
my ($self) = @_;
my $conf = $self->get_elasticsearch_params();
}
);
} catch {
- $self->index_status_recreate_required(1);
+ $self->set_index_status_recreate_required();
my $reason = $_[0]->{vars}->{body}->{error}->{reason};
Koha::Exceptions::Exception->throw(
error => "Unable to update mappings for index \"$conf->{index_name}\". Reason was: \"$reason\". Index needs to be recreated and reindexed",
);
};
}
- $self->index_status_ok(1);
+ $self->set_index_status_ok();
}
-=head2 $indexer->update_index_background($biblionums, $records)
+=head2 update_index_background($biblionums, $records)
-This has exactly the same API as C<update_index_background> however it'll
+This has exactly the same API as C<update_index> however it'll
return immediately. It'll start a background process that does the adding.
If it fails to add to Elasticsearch then it'll add to a queue that will cause
it to be updated by a regular index cron job in the future.
+=cut
+
# TODO implement in the future - I don't know the best way of doing this yet.
# If fork: make sure process group is changed so apache doesn't wait for us.
-=cut
-
sub update_index_background {
my $self = shift;
$self->update_index(@_);
}
-=head2 $indexer->delete_index($biblionums)
+=head2 delete_index($biblionums)
C<$biblionums> is an arrayref of biblionumbers to delete from the index.
$self->store->bag->commit;
}
-=head2 $indexer->delete_index_background($biblionums)
+=head2 delete_index_background($biblionums)
-Identical to L<delete_index>, this will return immediately and start a
-background process to do the actual deleting.
+Identical to L</delete_index($biblionums)>
=cut
-# TODO implement in the future
-
+# TODO: Should be made async
sub delete_index_background {
my $self = shift;
$self->delete_index(@_);
}
-=head2 $indexer->drop_index();
+=head2 drop_index
-Drops the index from the elasticsearch server.
+Drops the index from the Elasticsearch server.
=cut
my $conf = $self->get_elasticsearch_params();
my $elasticsearch = $self->get_elasticsearch();
$elasticsearch->indices->delete(index => $conf->{index_name});
- $self->index_status_recreate_required(1);
+ $self->set_index_status_recreate_required();
}
}
+=head2 create_index
+
+Creates the index (including mappings) on the Elasticsearch server.
+
+=cut
+
sub create_index {
my ($self) = @_;
my $conf = $self->get_elasticsearch_params();
$self->update_mappings();
}
+=head2 index_exists
+
+Checks if index has been created on the Elasticsearch server. Returns C<1> or the
+empty string to indicate whether index exists or not.
+
+=cut
+
sub index_exists {
my ($self) = @_;
my $conf = $self->get_elasticsearch_params();