Bug 20073: Move Elasticsearch configs to yaml files and improve the default settings.
authorEre Maijala <ere.maijala@helsinki.fi>
Tue, 23 Jan 2018 13:21:31 +0000 (15:21 +0200)
committerJonathan Druart <jonathan.druart@bugs.koha-community.org>
Mon, 23 Apr 2018 17:22:16 +0000 (14:22 -0300)
Improvements:
1) Index settings moved from code to etc/searchengine/elasticsearch/index_config.yaml. An alternative can be specified in koha-conf.xml.
2) Field settings moved from code to etc/searchengine/elasticsearch/field_config.yaml. An alternative can be specified in koha-conf.xml.
3) mappings.yaml has been moved from admin/searchengine/elasticsearch to etc/searchengine/elasticsearch. An alternative can be specified in koha-conf.xml.
4) Default settings have been improved to remove punctuation from phrases used for sorting etc.
5) State variables are used for storing configuration to avoid parsing it multiple times.
6) A possibility to reset the fields too has been added to the reset operation of mappings administration.
7) mappings.yaml has been moved from admin/searchengine/elasticsearch to etc/searchengine/elasticsearch.
8) An stdno field type has been added for standard identifiers.

To test:
1) Run tests in t/Koha/SearchEngine/Elasticsearch.t
2) Clear tables search_fields and search_marc_map
3) Go to admin/searchengine/elasticsearch/mappings.pl?op=reset&i_know_what_i_am_doing=1
4) Verify that admin/searchengine/elasticsearch/mappings.pl displays the mappings properly, including ISBN and other standard number fields.
5) Index some records using the -d parameter with misc/search_tools/rebuild_elastic_search.pl to recreate the index
6) Verify that you can find the records
7) Put <elasticsearch_index_mappings>non_existent</elasticsearch_index_mappings> to koha-conf.xml
8) Verify that admin/searchengine/elasticsearch/mappings.pl?op=reset&i_know_what_i_am_doing=1 fails because it can't find non_existent.
9) Copy etc/searchengine/elasticsearch/mappings.yaml to a new location and make elasticsearch_index_mappings setting in koha-conf.xml point to it.
10) Make a change in the new mappings.yaml.
11) Clear table search_fields (mappings reset doesn't do it yet, see bug 20248)
12) Go to admin/searchengine/elasticsearch/mappings.pl?op=reset&i_know_what_i_am_doing=1
13) Verify that the changes you made are now visible in the mappings UI

Signed-off-by: Nicolas Legrand <nicolas.legrand@bulac.fr>

Signed-off-by: Nick Clemens <nick@bywatersolutions.com>

Signed-off-by: Jonathan Druart <jonathan.druart@bugs.koha-community.org>

Koha/SearchEngine/Elasticsearch.pm
debian/templates/koha-conf-site.xml.in
etc/koha-conf.xml
etc/searchengine/elasticsearch/field_config.yaml [new file with mode: 0644]
etc/searchengine/elasticsearch/index_config.yaml [new file with mode: 0644]
etc/searchengine/elasticsearch/mappings.yaml [moved from admin/searchengine/elasticsearch/mappings.yaml with 99% similarity]
installer/data/mysql/atomicupdate/bug_20073.perl [new file with mode: 0644]
installer/data/mysql/kohastructure.sql
koha-tmpl/intranet-tmpl/prog/en/modules/admin/searchengine/elasticsearch/mappings.tt
t/Koha/SearchEngine/Elasticsearch.t

index fe2e0ea..e90af21 100644 (file)
@@ -34,8 +34,6 @@ use Search::Elasticsearch;
 use Try::Tiny;
 use YAML::Syck;
 
-use Data::Dumper;    # TODO remove
-
 __PACKAGE__->mk_ro_accessors(qw( index ));
 __PACKAGE__->mk_accessors(qw( sort_fields ));
 
@@ -137,24 +135,14 @@ A hashref containing the settings is returned.
 sub get_elasticsearch_settings {
     my ($self) = @_;
 
-    # Ultimately this should come from a file or something, and not be
-    # hardcoded.
-    my $settings = {
-        index => {
-            analysis => {
-                analyzer => {
-                    analyser_phrase => {
-                        tokenizer => 'icu_tokenizer',
-                        filter    => ['icu_folding'],
-                    },
-                    analyser_standard => {
-                        tokenizer => 'icu_tokenizer',
-                        filter    => ['icu_folding'],
-                    },
-                },
-            }
-        }
-    };
+    # Use state to speed up repeated calls
+    state $settings = undef;
+    if (!defined $settings) {
+        my $config_file = C4::Context->config('elasticsearch_index_config');
+        $config_file ||= C4::Context->config('intranetdir') . '/etc/searchengine/elasticsearch/index_config.yaml';
+        $settings = LoadFile( $config_file );
+    }
+
     return $settings;
 }
 
@@ -170,116 +158,97 @@ created.
 sub get_elasticsearch_mappings {
     my ($self) = @_;
 
-    # TODO cache in the object?
-    my $mappings = {
-        data => {
-            _all => {type => "string", analyzer => "analyser_standard"},
-            properties => {
-                record => {
-                    store          => "true",
-                    include_in_all => JSON::false,
-                    type           => "text",
-                },
-            }
-        }
-    };
-    my %sort_fields;
-    my $marcflavour = lc C4::Context->preference('marcflavour');
-    $self->_foreach_mapping(
-        sub {
-            my ( $name, $type, $facet, $suggestible, $sort, $marc_type ) = @_;
-            return if $marc_type ne $marcflavour;
-            # TODO if this gets any sort of complexity to it, it should
-            # be broken out into its own function.
-
-            # TODO be aware of date formats, but this requires pre-parsing
-            # as ES will simply reject anything with an invalid date.
-            my $es_type =
-              $type eq 'boolean'
-              ? 'boolean'
-              : 'text';
-
-            if ($es_type eq 'boolean') {
-                $mappings->{data}{properties}{$name} = _elasticsearch_mapping_for_boolean( $name, $es_type, $facet, $suggestible, $sort, $marc_type );
-                return; #Boolean cannot have facets nor sorting nor suggestions
-            } else {
-                $mappings->{data}{properties}{$name} = _elasticsearch_mapping_for_default( $name, $es_type, $facet, $suggestible, $sort, $marc_type );
-            }
+    # Use state to speed up repeated calls
+    state %all_mappings;
+    state %sort_fields;
+
+    if (!defined $all_mappings{$self->index}) {
+        $sort_fields{$self->index} = {};
+        my $mappings = {
+            data => _get_elasticsearch_mapping('general', '')
+        };
+        my $marcflavour = lc C4::Context->preference('marcflavour');
+        $self->_foreach_mapping(
+            sub {
+                my ( $name, $type, $facet, $suggestible, $sort, $marc_type ) = @_;
+                return if $marc_type ne $marcflavour;
+                # TODO if this gets any sort of complexity to it, it should
+                # be broken out into its own function.
+
+                # TODO be aware of date formats, but this requires pre-parsing
+                # as ES will simply reject anything with an invalid date.
+                my $es_type = 'text';
+                if ($type eq 'boolean') {
+                    $es_type = 'boolean';
+                } elsif ($type eq 'number' || $type eq 'sum') {
+                    $es_type = 'integer';
+                } elsif ($type eq 'isbn' || $type eq 'stdno') {
+                    $es_type = 'stdno';
+                }
 
-            if ($facet) {
-                $mappings->{data}{properties}{ $name . '__facet' } = {
-                    type  => "keyword",
-                };
-            }
-            if ($suggestible) {
-                $mappings->{data}{properties}{ $name . '__suggestion' } = {
-                    type => 'completion',
-                    analyzer => 'simple',
-                    search_analyzer => 'simple',
-                };
-            }
-            # Sort is a bit special as it can be true, false, undef.
-            # We care about "true" or "undef",
-            # "undef" means to do the default thing, which is make it sortable.
-            if ($sort || !defined $sort) {
-                $mappings->{data}{properties}{ $name . '__sort' } = {
-                    search_analyzer => "analyser_phrase",
-                    analyzer  => "analyser_phrase",
-                    type            => "text",
-                    include_in_all  => JSON::false,
-                    fields          => {
-                        phrase => {
-                            type            => "keyword",
-                        },
-                    },
-                };
-                $sort_fields{$name} = 1;
+                $mappings->{data}{properties}{$name} = _get_elasticsearch_mapping('search', $es_type);
+
+                if ($facet) {
+                    $mappings->{data}{properties}{ $name . '__facet' } = _get_elasticsearch_mapping('facet', $es_type);
+                }
+                if ($suggestible) {
+                    $mappings->{data}{properties}{ $name . '__suggestion' } = _get_elasticsearch_mapping('suggestible', $es_type);
+                }
+                # Sort is a bit special as it can be true, false, undef.
+                # We care about "true" or "undef",
+                # "undef" means to do the default thing, which is make it sortable.
+                if (!defined $sort || $sort) {
+                    $mappings->{data}{properties}{ $name . '__sort' } = _get_elasticsearch_mapping('sort', $es_type);
+                    $sort_fields{$self->index}{$name} = 1;
+                }
             }
-        }
-    );
-    $self->sort_fields(\%sort_fields);
-    return $mappings;
+        );
+        $all_mappings{$self->index} = $mappings;
+    }
+    $self->sort_fields(\%{$sort_fields{$self->index}});
+
+    return $all_mappings{$self->index};
 }
 
-=head2 _elasticsearch_mapping_for_*
+=head2 _get_elasticsearch_mapping
 
-Get the ES mappings for the given data type or a special mapping case
+Get the ES mappings for the given purpose and data type
 
-Receives the same parameters from the $self->_foreach_mapping() dispatcher
+$mapping = _get_elasticsearch_mapping('search', 'text');
 
 =cut
 
-sub _elasticsearch_mapping_for_boolean {
-    my ( $name, $type, $facet, $suggestible, $sort, $marc_type ) = @_;
+sub _get_elasticsearch_mapping {
 
-    return {
-        type            => $type,
-        null_value      => 0,
-    };
-}
+    my ( $purpose, $type ) = @_;
 
-sub _elasticsearch_mapping_for_default {
-    my ( $name, $type, $facet, $suggestible, $sort, $marc_type ) = @_;
-
-    return {
-        search_analyzer => "analyser_standard",
-        analyzer        => "analyser_standard",
-        type            => $type,
-        fields          => {
-            phrase => {
-                search_analyzer => "analyser_phrase",
-                analyzer        => "analyser_phrase",
-                type            => "text",
-            },
-            raw => {
-                type    => "keyword",
-            }
-        },
-    };
+    # Use state to speed up repeated calls
+    state $settings = undef;
+    if (!defined $settings) {
+        my $config_file = C4::Context->config('elasticsearch_field_config');
+        $config_file ||= C4::Context->config('intranetdir') . '/etc/searchengine/elasticsearch/field_config.yaml';
+        $settings = LoadFile( $config_file );
+    }
+
+    if (!defined $settings->{$purpose}) {
+        die "Field purpose $purpose not defined in field config";
+    }
+    if ($type eq '') {
+        return $settings->{$purpose};
+    }
+    if (defined $settings->{$purpose}{$type}) {
+        return $settings->{$purpose}{$type};
+    }
+    if (defined $settings->{$purpose}{'default'}) {
+        return $settings->{$purpose}{'default'};
+    }
+    return undef;
 }
 
 sub reset_elasticsearch_mappings {
-    my $mappings_yaml = C4::Context->config('intranetdir') . '/admin/searchengine/elasticsearch/mappings.yaml';
+    my ( $reset_fields ) = @_;
+    my $mappings_yaml = C4::Context->config('elasticsearch_index_mappings');
+    $mappings_yaml ||= C4::Context->config('intranetdir') . '/etc/searchengine/elasticsearch/mappings.yaml';
     my $indexes = LoadFile( $mappings_yaml );
 
     while ( my ( $index_name, $fields ) = each %$indexes ) {
index 21348ae..d6f515f 100644 (file)
@@ -319,10 +319,21 @@ __END_SRU_PUBLICSERVER__
  <plack_max_requests>50</plack_max_requests>
  <plack_workers>2</plack_workers>
 
+ <!-- Elasticsearch Configuration -->
  <elasticsearch>
      <server>localhost:9200</server>
      <index_name>koha___KOHASITE__</index_name>
  </elasticsearch>
+ <!-- Uncomment the following line if you want to override the Elasticsearch default index settings -->
+ <!-- <elasticsearch_index_config>__KOHA_CONF_DIR__/searchengine/elasticsearch/index_config.yaml</elasticsearch_index_config> -->
+ <!-- Uncomment the following line if you want to override the Elasticsearch default field settings -->
+ <!-- <elasticsearch_field_config>__KOHA_CONF_DIR__/searchengine/elasticsearch/field_config.yaml</elasticsearch_field_config> -->
+ <!-- Uncomment the following line if you want to override the Elasticsearch index default settings.
+      Note that any changes made to the mappings file only take effect if you reset the mappings in
+      by visiting /cgi-bin/koha/admin/searchengine/elasticsearch/mappings.pl?op=reset&i_know_what_i_am_doing=1&reset_fields=1.
+      Resetting mappings will override any changes made in the Search engine configuration UI.
+ -->
+ <!-- <elasticsearch_index_mappings>__KOHA_CONF_DIR__/searchengine/elasticsearch/mappings.yaml</elasticsearch_index_mappings> -->
 
  <interlibrary_loans>
      <!-- Path to where Illbackends are located on the system
index 5bcd521..092a26c 100644 (file)
@@ -150,10 +150,21 @@ __PAZPAR2_TOGGLE_XML_POST__
  <plack_max_requests>50</plack_max_requests>
  <plack_workers>2</plack_workers>
 
+ <!-- Elasticsearch Configuration -->
  <elasticsearch>
      <server>localhost:9200</server>
      <index_name>koha___DB_NAME__</index_name>
  </elasticsearch>
+ <!-- Uncomment the following line if you want to override the Elasticsearch default index settings -->
+ <!-- <elasticsearch_index_config>__KOHA_CONF_DIR__/searchengine/elasticsearch/index_config.yaml</elasticsearch_index_config> -->
+ <!-- Uncomment the following line if you want to override the Elasticsearch default field settings -->
+ <!-- <elasticsearch_field_config>__KOHA_CONF_DIR__/searchengine/elasticsearch/field_config.yaml</elasticsearch_field_config> -->
+ <!-- Uncomment the following line if you want to override the Elasticsearch index default settings.
+      Note that any changes made to the mappings file only take effect if you reset the mappings in
+      by visiting /cgi-bin/koha/admin/searchengine/elasticsearch/mappings.pl?op=reset&i_know_what_i_am_doing=1&reset_fields=1.
+      Resetting mappings will override any changes made in the Search engine configuration UI.
+ -->
+ <!-- <elasticsearch_index_mappings>__KOHA_CONF_DIR__/searchengine/elasticsearch/mappings.yaml</elasticsearch_index_mappings> -->
 
  <interlibrary_loans>
      <!-- Path to where Illbackends are located on the system
diff --git a/etc/searchengine/elasticsearch/field_config.yaml b/etc/searchengine/elasticsearch/field_config.yaml
new file mode 100644 (file)
index 0000000..535a43c
--- /dev/null
@@ -0,0 +1,61 @@
+---
+# General field configuration
+general:
+  _all:
+    type: string
+    analyzer: analyser_standard
+  properties:
+    record:
+      store: true
+      type: text
+# Search fields
+search:
+  boolean:
+    type: boolean
+    null_value: false
+  integer:
+    type: integer
+    null_value: 0
+  stdno:
+    type: text
+    analyzer: analyser_stdno
+    search_analyzer: analyser_stdno
+    fields:
+      phrase:
+        type: text
+        analyzer: analyser_stdno
+        search_analyzer: analyser_stdno
+      raw:
+        type: keyword
+    copy_to: _all
+  default:
+    type: text
+    analyzer: analyser_standard
+    search_analyzer: analyser_standard
+    fields:
+      phrase:
+        type: text
+        analyzer: analyser_phrase
+        search_analyzer: analyser_phrase
+      raw:
+        type: keyword
+    copy_to: _all
+# Facets
+facet:
+  default:
+    type: keyword
+# Suggestible
+suggestible:
+  default:
+    type: completion
+    analyzer: simple
+    search_analyzer: simple
+# Sort
+sort:
+  default:
+    type: text
+    analyzer: analyser_phrase
+    search_analyzer: analyser_phrase
+    fields:
+      phrase:
+        type: keyword
diff --git a/etc/searchengine/elasticsearch/index_config.yaml b/etc/searchengine/elasticsearch/index_config.yaml
new file mode 100644 (file)
index 0000000..bdfcdf5
--- /dev/null
@@ -0,0 +1,34 @@
+---
+# Index configuration that defines how different analyzers work.
+index:
+  analysis:
+    analyzer:
+      # Phrase analyzer is used for phrases (phrase match, sorting)
+      analyser_phrase:
+        tokenizer: keyword
+        filter:
+          - icu_folding
+        char_filter:
+          - punctuation
+      analyser_standard:
+        tokenizer: icu_tokenizer
+        filter:
+          - icu_folding
+      analyser_stdno:
+        tokenizer: whitespace
+        filter:
+          - icu_folding
+        char_filter:
+          - punctuation
+    normalizer:
+      normalizer_keyword:
+        type: custom
+        filter:
+          - icu_folding
+    char_filter:
+      # The punctuation filter is used to remove any punctuation chars in fields that don't use icu_tokenizer.
+      punctuation:
+        type: pattern_replace
+        # The pattern contains all ASCII punctuation characters.
+        pattern: '([\x00-\x1F,\x21-\x2F,\x3A-\x40,\x5B-\x60,\x7B-\x89,\x8B,\x8D,\x8F,\x90-\x99,\x9B,\x9D,\xA0-\xBF,\xD7,\xF7])'
+        replacement: ''
@@ -1,4 +1,5 @@
 ---
+# Basic mappings from MARC fields to Elasticsearch fields.
 authorities:
   Corporate-name-see-also-from:
     label: Corporate-name-see-also-from
@@ -2162,7 +2163,7 @@ biblios:
         marc_type: unimarc
         sort: ~
         suggestible: ''
-    type: ''
+    type: 'stdno'
   isbn:
     label: isbn
     mappings:
@@ -2181,7 +2182,7 @@ biblios:
         marc_type: unimarc
         sort: ~
         suggestible: ''
-    type: ''
+    type: 'isbn'
   issn:
     label: issn
     mappings:
@@ -2200,7 +2201,7 @@ biblios:
         marc_type: unimarc
         sort: ~
         suggestible: ''
-    type: ''
+    type: 'stdno'
   issues:
     label: issues
     mappings:
diff --git a/installer/data/mysql/atomicupdate/bug_20073.perl b/installer/data/mysql/atomicupdate/bug_20073.perl
new file mode 100644 (file)
index 0000000..0fd4b45
--- /dev/null
@@ -0,0 +1,8 @@
+$DBversion = 'XXX';  # will be replaced by the RM
+if( CheckVersion( $DBversion ) ) {
+    $dbh->do( "ALTER TABLE search_field CHANGE COLUMN type type ENUM('', 'string', 'date', 'number', 'boolean', 'sum', 'isbn', 'stdno') NOT NULL COMMENT 'what type of data this holds, relevant when storing it in the search engine'" );
+
+    # Always end with this (adjust the bug info)
+    SetVersion( $DBversion );
+    print "Upgrade to $DBversion done (Bug 20073 - Add new types for Elasticsearch fields)\n";
+}
index b689db3..4b770d0 100644 (file)
@@ -1459,7 +1459,7 @@ CREATE TABLE `search_field` (
   `id` int(11) NOT NULL AUTO_INCREMENT,
   `name` varchar(255) NOT NULL COMMENT 'the name of the field as it will be stored in the search engine',
   `label` varchar(255) NOT NULL COMMENT 'the human readable name of the field, for display',
-  `type` ENUM('', 'string', 'date', 'number', 'boolean', 'sum') NOT NULL COMMENT 'what type of data this holds, relevant when storing it in the search engine',
+  `type` ENUM('', 'string', 'date', 'number', 'boolean', 'sum', 'isbn', 'stdno') NOT NULL COMMENT 'what type of data this holds, relevant when storing it in the search engine',
   PRIMARY KEY (`id`),
   UNIQUE KEY (`name` (191))
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
index cecd8de..5014c80 100644 (file)
@@ -164,6 +164,16 @@ a.add, a.delete {
                           [% ELSE %]
                             <option value="sum">Sum</option>
                           [% END %]
+                          [% IF search_field.type == "isbn" %]
+                            <option value="isbn" selected="selected">ISBN</option>
+                          [% ELSE %]
+                            <option value="isbn">ISBN</option>
+                          [% END %]
+                          [% IF search_field.type == "stdno" %]
+                            <option value="stdno" selected="selected">Std. Number</option>
+                          [% ELSE %]
+                            <option value="stdno">Std. Number</option>
+                          [% END %]
                         </select>
                       </td>
                     </tr>
index f7a7f27..b91fcba 100644 (file)
@@ -17,7 +17,7 @@
 
 use Modern::Perl;
 
-use Test::More tests => 1;
+use Test::More tests => 3;
 use Test::Exception;
 
 use t::lib::Mocks;
@@ -84,3 +84,27 @@ subtest '_read_configuration() tests' => sub {
     is( $configuration->{index_name}, 'index', 'Index configuration parsed correctly' );
     is_deeply( $configuration->{nodes}, \@servers , 'Server configuration parsed correctly' );
 };
+
+subtest 'get_elasticsearch_settings() tests' => sub {
+
+    plan tests => 1;
+
+    my $settings;
+
+    # test reading index settings
+    my $es = Koha::SearchEngine::Elasticsearch->new( {index => $Koha::SearchEngine::Elasticsearch::BIBLIOS_INDEX} );
+    $settings = $es->get_elasticsearch_settings();
+    is( $settings->{index}{analysis}{analyzer}{analyser_phrase}{tokenizer}, 'keyword', 'Index settings parsed correctly' );
+};
+
+subtest 'get_elasticsearch_mappings() tests' => sub {
+
+    plan tests => 1;
+
+    my $mappings;
+
+    # test reading mappings
+    my $es = Koha::SearchEngine::Elasticsearch->new( {index => $Koha::SearchEngine::Elasticsearch::BIBLIOS_INDEX} );
+    $mappings = $es->get_elasticsearch_mappings();
+    is( $mappings->{data}{_all}{type}, 'string', 'Field mappings parsed correctly' );
+};