Bug 10955 - Add ability to skip deletions in zebraqueue
author: Kyle M Hall <kyle@bywatersolutions.com>
Thu, 26 Sep 2013 16:47:13 +0000 (12:47 -0400)
committer: Galen Charlton <gmc@esilibrary.com>
Mon, 10 Mar 2014 18:44:10 +0000 (18:44 +0000)
It seems that record deletions can cause severe slowdowns for Koha
installations with very large numbers of records. It would be
helpful to be able to skip record deletions when processing the
zebraqueue with rebuild_zebra.pl, so that the deletions can be
processed less frequently in a separate run.

Test Plan:
1) Disable any zebra indexing cronjobs you may have
2) Delete a record
3) Note that the recordDelete operation in the zebraqueue table has done = 0
4) Run misc/migration_tools/rebuild_zebra.pl -b -z --skip-deletes
5) Note the delete still has done = 0
6) Run misc/migration_tools/rebuild_zebra.pl -b -z
7) Note the delete now has done = 1

Signed-off-by: Jonathan Druart <jonathan.druart@biblibre.com>
Signed-off-by: Katrin Fischer <Katrin.Fischer.83@web.de>
Passes all tests and QA script.
Also tested for authorities, no problems found.

Signed-off-by: Galen Charlton <gmc@esilibrary.com>

RM note: this is at best a work-around, and I will emphasize that
--skip-deletes should be used only when absolutely necessary.

I hope that --skip-deletes can go away at some point soon, but
that may depend on changes to Zebra.

misc/migration_tools/rebuild_zebra.pl

index 83e32c4..b613a77 100755 (executable)
@@ -39,6 +39,7 @@ my $noshadow;
 my $want_help;
 my $as_xml;
 my $process_zebraqueue;
+my $process_zebraqueue_skip_deletes;
 my $do_not_clear_zebraqueue;
 my $length;
 my $where;
@@ -67,6 +68,7 @@ my $result = GetOptions(
     'x'             => \$as_xml,
     'y'             => \$do_not_clear_zebraqueue,
     'z'             => \$process_zebraqueue,
+    'skip-deletes'  => \$process_zebraqueue_skip_deletes,
     'where:s'       => \$where,
     'length:i'      => \$length,
     'offset:i'      => \$offset,
@@ -314,7 +316,7 @@ sub index_records {
     my ($record_type, $directory, $skip_export, $skip_index, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $server_dir) = @_;
 
     my $num_records_exported = 0;
-    my $records_deleted;
+    my $records_deleted = {};
     my $need_reset = check_zebra_dirs($server_dir);
     if ($need_reset) {
         print "$0: found broken zebra server directories: forcing a rebuild\n";
@@ -333,15 +335,20 @@ sub index_records {
         mkdir "$directory" unless (-d $directory);
         mkdir "$directory/$record_type" unless (-d "$directory/$record_type");
         if ($process_zebraqueue) {
-            my $entries = select_zebraqueue_records($record_type, 'deleted');
-            mkdir "$directory/del_$record_type" unless (-d "$directory/del_$record_type");
-            $records_deleted = generate_deleted_marc_records($record_type, $entries, "$directory/del_$record_type", $as_xml);
-            mark_zebraqueue_batch_done($entries);
+            my $entries;
+
+            unless ( $process_zebraqueue_skip_deletes ) {
+                $entries = select_zebraqueue_records($record_type, 'deleted');
+                mkdir "$directory/del_$record_type" unless (-d "$directory/del_$record_type");
+                $records_deleted = generate_deleted_marc_records($record_type, $entries, "$directory/del_$record_type", $as_xml);
+                mark_zebraqueue_batch_done($entries);
+            }
+
             $entries = select_zebraqueue_records($record_type, 'updated');
             mkdir "$directory/upd_$record_type" unless (-d "$directory/upd_$record_type");
-            $num_records_exported = export_marc_records_from_list($record_type,
-                                                                  $entries, "$directory/upd_$record_type", $as_xml, $noxml, $records_deleted);
+            $num_records_exported = export_marc_records_from_list($record_type,$entries, "$directory/upd_$record_type", $as_xml, $noxml, $records_deleted);
             mark_zebraqueue_batch_done($entries);
+
         } else {
             my $sth = select_all_records($record_type);
             $num_records_exported = export_marc_records_from_sth($record_type, $sth, "$directory/$record_type", $as_xml, $noxml, $nosanitize);
@@ -846,6 +853,10 @@ Parameters:
                             table.  Cannot be used with -r
                             or -s.
 
+    --skip-deletes          select only updated records marked
+                            in the zebraqueue table, not deletes.
+                            Only effective with -z.
+
     -r                      clear Zebra index before
                             adding records to index. Implies -w.