3 # This inserts records from a Koha database into Elasticsearch
5 # Copyright 2014 Catalyst IT
7 # This file is part of Koha.
9 # Koha is free software; you can redistribute it and/or modify it
10 # under the terms of the GNU General Public License as published by
11 # the Free Software Foundation; either version 3 of the License, or
12 # (at your option) any later version.
14 # Koha is distributed in the hope that it will be useful, but
15 # WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
19 # You should have received a copy of the GNU General Public License
20 # along with Koha; if not, see <http://www.gnu.org/licenses>.
24 rebuild_elasticsearch.pl - inserts records from a Koha database into Elasticsearch
28 B<rebuild_elasticsearch.pl>
29 [B<-c|--commit>=C<count>]
43 Inserts records from a Koha database into Elasticsearch.
49 =item B<-c|--commit>=C<count>
51 Specify how many records will be batched up before they're added to Elasticsearch.
52 Higher values should be faster, but will use more RAM. Default is 5000.
56 Delete the index and recreate it before indexing.
60 Reload mappings from files (specified in koha-conf.xml) before indexing.
63 =item B<-a|--authorities>
65 Index the authorities only. Combining this with B<-b> is the same as
66 specifying neither and so both get indexed.
70 Index the biblios only. Combining this with B<-a> is the same as
71 specifying neither and so both get indexed.
73 =item B<-bn|--bnumber>
75 Only index the supplied biblionumber, mostly for testing purposes. May be
80 Only index the supplied authority id, mostly for testing purposes. May be
83 =item B<-p|--processes>
85 Number of processes to use for indexing. This can be used to do more indexing
86 work in parallel on multicore systems. By default, a single process is used.
90 By default, this program only emits warnings and errors. This makes it talk
91 more. Add more to make it even more wordy, in particular when debugging.
103 =head1 IMPLEMENTATION
111 use Koha::MetadataRecord::Authority;
112 use Koha::BiblioUtils;
113 use Koha::SearchEngine::Elasticsearch;
114 use Koha::SearchEngine::Elasticsearch::Indexer;
# Lexical holders for the command-line switches bound in the option table below.
123 my ($delete, $reset, $help, $man, $processes);
124 my ($index_biblios, $index_authorities);
# These two are bound as array targets below, so each collects a list of ids.
125 my (@biblionumbers,@authids);
127 $|=1; # flushes output
# Option table: spec string => reference to the variable that receives the value.
# NOTE(review): the call that consumes this table is not visible in this view
# (the spec strings look like Getopt::Long syntax - confirm), and neither are
# the declarations of $commit and $verbose.
130 'c|commit=i' => \$commit,
131 'd|delete' => \$delete,
132 'r|reset' => \$reset,
133 'a|authorities' => \$index_authorities,
134 'b|biblios' => \$index_biblios,
135 'bn|bnumber=i' => \@biblionumbers,
136 'ai|authid=i' => \@authids,
137 'p|processes=i' => \$processes,
138 'v|verbose+' => \$verbose,
143 # Default is to do both
144 unless ($index_authorities || $index_biblios) {
145 $index_authorities = $index_biblios = 1;
# Explicit record ids and multi-process mode are mutually exclusive: any true
# value for -p together with -bn or -ai aborts the run.
148 if ($processes && ( @biblionumbers || @authids) ) {
149 die "Argument p|processes cannot be combined with bn|bnumber or ai|authid";
# Help output on request; the second form asks for exit status 0 and verbosity 2.
152 pod2usage(1) if $help;
153 pod2usage( -exitstatus => 0, -verbose => 2 ) if $man;
# Reset the Elasticsearch mappings, then clear the two cached search-field
# entries (staff client and OPAC).
# NOTE(review): the condition guarding this step is not visible in this view;
# presumably it only runs when -r|--reset was given - confirm.
158 Koha::SearchEngine::Elasticsearch->reset_elasticsearch_mappings;
159 my $cache = Koha::Caches->get_instance();
160 $cache->clear_from_cache('elasticsearch_search_fields_staff_client');
161 $cache->clear_from_cache('elasticsearch_search_fields_opac');
# Check every index that is about to be populated; $delete is handed over as
# the "recreate" flag of _verify_index_state.
165 _verify_index_state($Koha::SearchEngine::Elasticsearch::BIBLIOS_INDEX, $delete) if ($index_biblios);
166 _verify_index_state($Koha::SearchEngine::Elasticsearch::AUTHORITIES_INDEX, $delete) if ($index_authorities);
# One slice per process. The //= also stores the default of 1 back into
# $processes when -p was not given.
169 my $slice_count = ( $processes //= 1 );
# Extra named arguments passed to the record iterators further down.
170 my %iterator_options;
172 if ($slice_count > 1) {
173 # Fire up child processes for processing slices from 2 on. This main process will handle slice 1.
# The loop starts at 1 because index 0 is kept by the main process.
175 for (my $proc = 1; $proc < $slice_count; $proc++) {
# NOTE(review): the statement that sets $pid is not visible in this view.
177 die "Failed to fork a child process\n" unless defined $pid;
179 # Child process, give it a slice to process
180 $slice_index = $proc;
184 # Fudge the commit count a bit to spread out the Elasticsearch commits
# NOTE(review): this multiplication can leave $commit non-integer (for example
# 5 becomes 5.5 for slice index 1). _do_reindex counts down from $commit and
# commits only when the counter is exactly zero, so a fractional value would
# skip every intermediate commit - consider truncating the result with int().
185 $commit *= 1 + 0.10 * $slice_index;
# The log line numbers slices from 1; the iterator option carries the
# zero-based index.
186 _log(1, "Processing slice @{[$slice_index + 1]} of $slice_count\n");
187 $iterator_options{slice} = { index => $slice_index, count => $slice_count };
# Bibliographic records: either the explicitly requested biblionumbers or an
# iterator over all biblios.
191 if ($index_biblios) {
192 _log(1, "Indexing biblios\n");
193 if (@biblionumbers) {
# Take the next requested biblionumber off the list; an exhausted list yields
# the empty list.
# NOTE(review): these return statements belong to a callback whose opening
# line is not visible in this view.
195 my $r = shift @biblionumbers;
196 return () unless defined $r;
197 return ($r, Koha::BiblioUtils->get_from_biblionumber($r, item_data => 1 ));
# No explicit ids: iterate over all biblios, passing along the slice options
# when they were set.
200 my $records = Koha::BiblioUtils->get_all_biblios_iterator(%iterator_options);
# NOTE(review): the definition of $next is not visible in this view.
205 _do_reindex($next, $Koha::SearchEngine::Elasticsearch::BIBLIOS_INDEX);
# Authority records: either the explicitly requested authids or an iterator
# over all authorities.
207 if ($index_authorities) {
208 _log(1, "Indexing authorities\n");
# Take the next requested authid off the list; an exhausted list yields the
# empty list.
# NOTE(review): the enclosing callback and the statements that follow the
# lookup are not visible in this view.
211 my $r = shift @authids;
212 return () unless defined $r;
# NOTE(review): $a is also the name of one of sort's comparison variables; a
# different lexical name would avoid the shadowing.
213 my $a = Koha::MetadataRecord::Authority->get_from_authid($r);
# No explicit ids: iterate over all authorities, passing along the slice
# options when they were set.
217 my $records = Koha::MetadataRecord::Authority->get_all_authorities_iterator(%iterator_options);
# NOTE(review): the definition of $next is not visible in this view.
222 _do_reindex($next, $Koha::SearchEngine::Elasticsearch::AUTHORITIES_INDEX);
# Child processes are given a slice index of 1 or more above, so index 0
# identifies the main process.
225 if ($slice_index == 0) {
226 # Main process, wait for children
# One iteration per child process ($processes - 1 of them).
# NOTE(review): the loop body is not visible in this view.
227 for (my $proc = 1; $proc < $processes; $proc++) {
232 =head1 INTERNAL METHODS
234 =head2 _verify_index_state
236 _verify_index_state($Koha::SearchEngine::Elasticsearch::BIBLIOS_INDEX, 1);
238 Checks the index state and recreates it if requested.
# Bring the named index into a usable state before indexing starts.
#   $index_name - name of the index to check
#   $recreate   - recreate flag; the callers in this file pass the -d|--delete switch
242 sub _verify_index_state {
243 my ( $index_name, $recreate ) = @_;
245 _log(1, "Checking state of $index_name index\n");
246 my $indexer = Koha::SearchEngine::Elasticsearch::Indexer->new( { index => $index_name } );
# NOTE(review): the condition that opens this first branch is not visible in
# this view; presumably it tests $recreate - confirm.
# An existing index is dropped first, then a new one is created.
249 _log(1, "Dropping and recreating $index_name index\n");
250 $indexer->drop_index() if $indexer->index_exists();
251 $indexer->create_index();
253 elsif (!$indexer->index_exists) {
254 # Create the index if it does not exist
255 $indexer->create_index();
256 } elsif ($indexer->is_index_status_ok) {
257 # Update mappings unless the index is in some kind of problematic state
258 $indexer->update_mappings();
259 } elsif ($indexer->is_index_status_recreate_required) {
# The visible part of this branch only emits a warning.
260 warn qq/Index "$index_name" has status "recreate required", suggesting it should be recreated/;
266 _do_reindex($callback, $Koha::SearchEngine::Elasticsearch::BIBLIOS_INDEX);
268 Does the actual reindexing. $callback is a function that always returns the next record.
269 For each index we iterate through the records, committing at specified count
# $next is the callback that supplies records; $index_name selects the index
# the Indexer object writes to.
274 my ( $next, $index_name ) = @_;
276 my $indexer = Koha::SearchEngine::Elasticsearch::Indexer->new( { index => $index_name } );
# Countdown to the next batch commit, starting from the current $commit value.
279 my $commit_count = $commit;
# Parallel buffers: ids and their records are handed to update_index together.
280 my ( @id_buffer, @commit_buffer );
# The loop ends on the first false value returned by the callback.
281 while ( my $record = $next->() ) {
# ->authid is consulted only when ->id returns undef.
282 my $id = $record->id // $record->authid;
# From the next statement on, $record names the value of ->record rather than
# the object returned by the callback.
283 my $record = $record->record;
# At verbosity exactly 1, report progress once every 1000 records.
# NOTE(review): $count is declared and updated on lines not visible in this view.
285 if ( $verbose == 1 ) {
286 _log( 1, "$count records processed\n" ) if ( $count % 1000 == 0);
291 push @id_buffer, $id;
292 push @commit_buffer, $record;
# Commit once the countdown reaches exactly zero.
# NOTE(review): this test never fires when $commit_count is fractional or
# starts at zero or below; a "<= 0" comparison would be more robust - confirm
# that $commit is always a positive integer here.
293 if ( !( --$commit_count ) ) {
294 _log( 1, "Committing $commit records...\n" );
295 my $response = $indexer->update_index( \@id_buffer, \@commit_buffer );
296 _handle_response($response);
297 $commit_count = $commit;
# NOTE(review): emptying the two buffers after a commit is not visible in this
# view - confirm it happens before the next batch is collected.
300 _log( 1, "Commit complete\n" );
304 # There are probably uncommitted records
305 _log( 1, "Committing final records...\n" );
306 my $response = $indexer->update_index( \@id_buffer, \@commit_buffer );
307 _handle_response($response);
308 _log( 1, "Total $count records indexed\n" );
315 Checks some basic stuff to ensure that it's sane before we start.
# Abort with an error when the Koha configuration has no 'elasticsearch' entry.
320 # Do we have an elasticsearch block defined?
321 my $conf = C4::Context->config('elasticsearch');
322 die "No 'elasticsearch' block is defined in koha-conf.xml.\n" if ( !$conf );
325 =head2 _handle_response
327 Parses the response returned by update_index and displays errors, depending on the verbosity of the script.
# Report indexing errors found in the response returned by update_index.
331 sub _handle_response {
# NOTE(review): the assignment of $response is not visible in this view;
# presumably it is the first argument - confirm.
# String comparison: the branch runs only when the errors value stringifies to
# 'true'.
333 if( $response->{errors} eq 'true' ){
334 _log( 1, "There were errors during indexing\n" );
# Print one line for every item whose index entry carries an error: record id,
# reason and type, followed by the type and reason of the underlying cause.
# NOTE(review): assumes each error has a caused_by entry; when it is absent the
# concatenation would run into undefined values - confirm against the
# Elasticsearch bulk response format.
336 foreach my $item (@{$response->{items}}){
337 next unless defined $item->{index}->{error};
338 print "Record #" . $item->{index}->{_id} . " " .
339 $item->{index}->{error}->{reason} . " (" . $item->{index}->{error}->{type} . ") : " .
340 $item->{index}->{error}->{caused_by}->{type} . " (" . $item->{index}->{error}->{caused_by}->{reason} . ")\n";
348 _log($level, "Message\n");
350 Output progress information.
352 Will output the message if verbosity level is set to $level or more. Will not
353 include a trailing newline automatically.
# $level is the minimum verbosity at which $msg is printed.
358 my ($level, $msg) = @_;
# Each message is prefixed with the current process id in square brackets
# (presumably so that output from parallel workers can be told apart); no
# newline is appended.
360 print "[$$] $msg" if ($verbose >= $level);