LP#1997485: Reify baseline schema

author Mike Rylander <mrylander@gmail.com>

Fri, 31 Mar 2023 19:01:28 +0000 (15:01 -0400)

committer Galen Charlton <gmc@equinoxOLI.org>

Thu, 27 Apr 2023 18:00:27 +0000 (18:00 +0000)
author Mike Rylander <mrylander@gmail.com>
Fri, 31 Mar 2023 19:01:28 +0000 (15:01 -0400)
committer Galen Charlton <gmc@equinoxOLI.org>
Thu, 27 Apr 2023 18:00:27 +0000 (18:00 +0000)
diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql

index c7c12fd..ed9da16 100644 (file)
--- a/Open-ILS/src/sql/Pg/002.schema.config.sql
+++ b/Open-ILS/src/sql/Pg/002.schema.config.sql
@@ -194,7 +194,19 @@ CREATE TABLE config.metabib_class (
     a_weight NUMERIC  DEFAULT 1.0 NOT NULL,
     b_weight NUMERIC  DEFAULT 0.4 NOT NULL,
     c_weight NUMERIC  DEFAULT 0.2 NOT NULL,
-    d_weight NUMERIC  DEFAULT 0.1 NOT NULL
+    d_weight NUMERIC  DEFAULT 0.1 NOT NULL,
+    variant_authority_suggestion   BOOL NOT NULL DEFAULT TRUE,
+    symspell_transfer_case         BOOL NOT NULL DEFAULT TRUE,
+    symspell_skip_correct          BOOL NOT NULL DEFAULT FALSE,
+    symspell_suggestion_verbosity  INT NOT NULL DEFAULT 2,
+    max_phrase_edit_distance       INT NOT NULL DEFAULT 2,
+    suggestion_word_option_count   INT NOT NULL DEFAULT 5,
+    max_suggestions                INT NOT NULL DEFAULT -1,
+    low_result_threshold           INT NOT NULL DEFAULT 0,
+    min_suggestion_use_threshold   INT NOT NULL DEFAULT 1,
+    soundex_weight                 INT NOT NULL DEFAULT 0,
+    pg_trgm_weight                 INT NOT NULL DEFAULT 0,
+    keyboard_distance_weight       INT NOT NULL DEFAULT 0
 );
 
 CREATE TABLE config.metabib_field (
diff --git a/Open-ILS/src/sql/Pg/300.schema.staged_search.sql b/Open-ILS/src/sql/Pg/300.schema.staged_search.sql

index 523e286..56dfeb9 100644 (file)
--- a/Open-ILS/src/sql/Pg/300.schema.staged_search.sql
+++ b/Open-ILS/src/sql/Pg/300.schema.staged_search.sql
@@ -1276,9 +1276,93 @@ $f$ LANGUAGE SQL;
 
 CREATE OR REPLACE FUNCTION search.symspell_parse_words ( phrase TEXT )
 RETURNS SETOF TEXT AS $F$
-    SELECT UNNEST(x) FROM regexp_matches($1, '([[:alnum:]]+''*[[:alnum:]]*)', 'g') x;
+    SELECT  UNNEST
+      FROM  (SELECT UNNEST(x), ROW_NUMBER() OVER ()
+              FROM  regexp_matches($1, '(?:^|\s+)((?:-|\+)?[[:alnum:]]+''*[[:alnum:]]*)', 'g') x
+            ) y
+      WHERE UNNEST IS NOT NULL
+      ORDER BY row_number
 $F$ LANGUAGE SQL STRICT IMMUTABLE;
 
+CREATE OR REPLACE FUNCTION search.distribute_phrase_sign (input TEXT) RETURNS TEXT AS $f$
+DECLARE
+    phrase_sign TEXT;
+    output      TEXT;
+BEGIN
+    output := input;
+
+    IF output ~ '^(?:-|\+)' THEN
+        phrase_sign := SUBSTRING(input FROM 1 FOR 1);
+        output := SUBSTRING(output FROM 2);
+    END IF;
+
+    IF output LIKE '"%"' THEN
+        IF phrase_sign IS NULL THEN
+            phrase_sign := '+';
+        END IF;
+        output := BTRIM(output,'"');
+    END IF;
+
+    IF phrase_sign IS NOT NULL THEN
+        RETURN REGEXP_REPLACE(output,'(^|\s+)(?=[[:alnum:]])','\1'||phrase_sign,'g');
+    END IF;
+
+    RETURN output;
+END;
+$f$ LANGUAGE PLPGSQL STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION search.query_parse_phrases ( phrase TEXT )
+RETURNS SETOF TEXT AS $F$
+    SELECT  search.distribute_phrase_sign(UNNEST)
+      FROM  (SELECT UNNEST(x), ROW_NUMBER() OVER ()
+              FROM  regexp_matches($1, '(?:^|\s+)(?:((?:-|\+)?"[^"]+")|((?:-|\+)?[[:alnum:]]+''*[[:alnum:]]*))', 'g') x
+            ) y
+      WHERE UNNEST IS NOT NULL
+      ORDER BY row_number
+$F$ LANGUAGE SQL STRICT IMMUTABLE;
+
+CREATE TYPE search.query_parse_position AS (
+    word                TEXT,
+    word_pos            INT,
+    phrase_in_input_pos INT,
+    word_in_phrase_pos  INT,
+    negated             BOOL,
+    exact               BOOL
+);
+
+CREATE OR REPLACE FUNCTION search.query_parse_positions ( raw_input TEXT )
+RETURNS SETOF search.query_parse_position AS $F$
+DECLARE
+    curr_phrase TEXT;
+    curr_word   TEXT;
+    phrase_pos  INT := 0;
+    word_pos    INT := 0;
+    pos         INT := 0;
+    neg         BOOL;
+    ex          BOOL;
+BEGIN
+    FOR curr_phrase IN SELECT x FROM search.query_parse_phrases(raw_input) x LOOP
+        word_pos := 0;
+        FOR curr_word IN SELECT x FROM search.symspell_parse_words(curr_phrase) x LOOP
+            neg := FALSE;
+            ex := FALSE;
+            IF curr_word ~ '^(?:-|\+)' THEN
+                ex := TRUE;
+                IF curr_word LIKE '-%' THEN
+                    neg := TRUE;
+                END IF;
+                curr_word := SUBSTRING(curr_word FROM 2);
+            END IF;
+            RETURN QUERY SELECT curr_word, pos, phrase_pos, word_pos, neg, ex;
+            word_pos := word_pos + 1;
+            pos := pos + 1;
+        END LOOP;
+        phrase_pos := phrase_pos + 1;
+    END LOOP;
+    RETURN;
+END;
+$F$ LANGUAGE PLPGSQL STRICT IMMUTABLE;
+
 -- This version does not preserve input word order!
 CREATE OR REPLACE FUNCTION search.symspell_parse_words_distinct ( phrase TEXT )
 RETURNS SETOF TEXT AS $F$
@@ -1347,20 +1431,108 @@ CREATE TYPE search.symspell_lookup_output AS (
 );
 
 
-CREATE OR REPLACE FUNCTION search.symspell_lookup(
-    raw_input text,
-    search_class text,
-    verbosity integer DEFAULT 2,
-    xfer_case boolean DEFAULT false,
-    count_threshold integer DEFAULT 1,
-    soundex_weight integer DEFAULT 0,
-    pg_trgm_weight integer DEFAULT 0,
-    kbdist_weight integer DEFAULT 0
-) RETURNS SETOF search.symspell_lookup_output LANGUAGE plpgsql AS $function$
+CREATE OR REPLACE FUNCTION search.symspell_generate_combined_suggestions(
+    word_data search.symspell_lookup_output[],
+    pos_data search.query_parse_position[],
+    skip_correct BOOL DEFAULT TRUE,
+    max_words INT DEFAULT 0
+) RETURNS TABLE (suggestion TEXT, test TEXT) AS $f$
+    my $word_data = shift;
+    my $pos_data = shift;
+    my $skip_correct = shift;
+    my $max_per_word = shift;
+    return undef unless (@$word_data and @$pos_data);
+
+    my $last_word_pos = $$word_data[-1]{word_pos};
+    my $pos_to_word_map = [ map { [] } 0 .. $last_word_pos ];
+    my $parsed_query_data = { map { ($$_{word_pos} => $_) } @$pos_data };
+
+    for my $row (@$word_data) {
+        my $wp = +$$row{word_pos};
+        next if (
+            $skip_correct eq 't' and $$row{lev_distance} > 0
+            and @{$$pos_to_word_map[$wp]}
+            and $$pos_to_word_map[$wp][0]{lev_distance} == 0
+        );
+        push @{$$pos_to_word_map[$$row{word_pos}]}, $row;
+    }
+
+    gen_step($max_per_word, $pos_to_word_map, $parsed_query_data, $last_word_pos);
+    return undef;
+
+    # -----------------------------
+    sub gen_step {
+        my $max_words = shift;
+        my $data = shift;
+        my $pos_data = shift;
+        my $last_pos = shift;
+        my $prefix = shift || '';
+        my $test_prefix = shift || '';
+        my $current_pos = shift || 0;
+
+        my $word_count = 0;
+        for my $sugg ( @{$$data[$current_pos]} ) {
+            my $was_inside_phrase = 0;
+            my $now_inside_phrase = 0;
+
+            my $word = $$sugg{suggestion};
+            $word_count++;
+
+            my $prev_phrase = $$pos_data{$current_pos - 1}{phrase_in_input_pos};
+            my $curr_phrase = $$pos_data{$current_pos}{phrase_in_input_pos};
+            my $next_phrase = $$pos_data{$current_pos + 1}{phrase_in_input_pos};
+
+            $now_inside_phrase++ if (defined($next_phrase) and $curr_phrase == $next_phrase);
+            $was_inside_phrase++ if (defined($prev_phrase) and $curr_phrase == $prev_phrase);
+
+            my $string = $prefix;
+            $string .= ' ' if $string;
+
+            if (!$was_inside_phrase) { # might be starting a phrase?
+                $string .= '-' if ($$pos_data{$current_pos}{negated} eq 't');
+                if ($now_inside_phrase) { # we are! add the double-quote
+                    $string .= '"';
+                }
+                $string .= $word;
+            } else { # definitely were in a phrase
+                $string .= $word;
+                if (!$now_inside_phrase) { # we are not any longer, add the double-quote
+                    $string .= '"';
+                }
+            }
+
+            my $test_string = $test_prefix;
+            if ($current_pos > 0) { # have something already, need joiner
+                $test_string .= $curr_phrase == $prev_phrase ? ' <-> ' : ' & ';
+            }
+            $test_string .= '!' if ($$pos_data{$current_pos}{negated} eq 't');
+            $test_string .= $word;
+
+            if ($current_pos == $last_pos) {
+                return_next {suggestion => $string, test => $test_string};
+            } else {
+                gen_step($max_words, $data, $pos_data, $last_pos, $string, $test_string, $current_pos + 1);
+            }
+            
+            last if ($max_words and $word_count >= $max_words);
+        }
+    }
+$f$ LANGUAGE PLPERLU IMMUTABLE;
+
+
+CREATE FUNCTION search.symspell_lookup (
+    raw_input       TEXT,
+    search_class    TEXT,
+    verbosity       INT DEFAULT NULL,
+    xfer_case       BOOL DEFAULT NULL,
+    count_threshold INT DEFAULT NULL,
+    soundex_weight  INT DEFAULT NULL,
+    pg_trgm_weight  INT DEFAULT NULL,
+    kbdist_weight   INT DEFAULT NULL
+) RETURNS SETOF search.symspell_lookup_output AS $F$
 DECLARE
     prefix_length INT;
     maxED         INT;
-    good_suggs  HSTORE;
     word_list   TEXT[];
     edit_list   TEXT[] := '{}';
     seen_list   TEXT[] := '{}';
@@ -1375,35 +1547,63 @@ DECLARE
     w_pos       INT := -1;
     smallest_ed INT := -1;
     global_ed   INT;
-    i_len       INT;
-    l_maxED     INT;
+    c_symspell_suggestion_verbosity INT;
+    c_min_suggestion_use_threshold  INT;
+    c_soundex_weight                INT;
+    c_pg_trgm_weight                INT;
+    c_keyboard_distance_weight      INT;
+    c_symspell_transfer_case        BOOL;
 BEGIN
+
+    SELECT  cmc.min_suggestion_use_threshold,
+            cmc.soundex_weight,
+            cmc.pg_trgm_weight,
+            cmc.keyboard_distance_weight,
+            cmc.symspell_transfer_case,
+            cmc.symspell_suggestion_verbosity
+      INTO  c_min_suggestion_use_threshold,
+            c_soundex_weight,
+            c_pg_trgm_weight,
+            c_keyboard_distance_weight,
+            c_symspell_transfer_case,
+            c_symspell_suggestion_verbosity
+      FROM  config.metabib_class cmc
+      WHERE cmc.name = search_class;
+
+    c_min_suggestion_use_threshold := COALESCE(count_threshold,c_min_suggestion_use_threshold);
+    c_symspell_transfer_case := COALESCE(xfer_case,c_symspell_transfer_case);
+    c_symspell_suggestion_verbosity := COALESCE(verbosity,c_symspell_suggestion_verbosity);
+    c_soundex_weight := COALESCE(soundex_weight,c_soundex_weight);
+    c_pg_trgm_weight := COALESCE(pg_trgm_weight,c_pg_trgm_weight);
+    c_keyboard_distance_weight := COALESCE(kbdist_weight,c_keyboard_distance_weight);
+
     SELECT value::INT INTO prefix_length FROM config.internal_flag WHERE name = 'symspell.prefix_length' AND enabled;
     prefix_length := COALESCE(prefix_length, 6);
 
     SELECT value::INT INTO maxED FROM config.internal_flag WHERE name = 'symspell.max_edit_distance' AND enabled;
     maxED := COALESCE(maxED, 3);
 
-    word_list := ARRAY_AGG(x) FROM search.symspell_parse_words(raw_input) x;
+    -- XXX This should get some more thought ... maybe search_normalize?
+    word_list := ARRAY_AGG(x.word) FROM search.query_parse_positions(raw_input) x;
 
     -- Common case exact match test for preformance
-    IF verbosity = 0 AND CARDINALITY(word_list) = 1 AND CHARACTER_LENGTH(word_list[1]) <= prefix_length THEN
+    IF c_symspell_suggestion_verbosity = 0 AND CARDINALITY(word_list) = 1 AND CHARACTER_LENGTH(word_list[1]) <= prefix_length THEN
         EXECUTE
           'SELECT  '||search_class||'_suggestions AS suggestions,
                    '||search_class||'_count AS count,
                    prefix_key
              FROM  search.symspell_dictionary
              WHERE prefix_key = $1
-                   AND '||search_class||'_count >= $2
-                   AND '||search_class||'_suggestions @> ARRAY[$1]'
-          INTO entry USING evergreen.lowercase(word_list[1]), COALESCE(count_threshold,1);
+                   AND '||search_class||'_count >= $2 
+                   AND '||search_class||'_suggestions @> ARRAY[$1]' 
+          INTO entry USING evergreen.lowercase(word_list[1]), c_min_suggestion_use_threshold;
         IF entry.prefix_key IS NOT NULL THEN
             output.lev_distance := 0; -- definitionally
             output.prefix_key := entry.prefix_key;
             output.prefix_key_count := entry.count;
             output.suggestion_count := entry.count;
             output.input := word_list[1];
-            IF xfer_case THEN
+            IF c_symspell_transfer_case THEN
                 output.suggestion := search.symspell_transfer_casing(output.input, entry.prefix_key);
             ELSE
                 output.suggestion := entry.prefix_key;
@@ -1421,14 +1621,12 @@ BEGIN
     FOREACH word IN ARRAY word_list LOOP
         w_pos := w_pos + 1;
         input := evergreen.lowercase(word);
-        i_len := CHARACTER_LENGTH(input);
-        l_maxED := maxED;
 
         IF CHARACTER_LENGTH(input) > prefix_length THEN
             prefix_key := SUBSTRING(input FROM 1 FOR prefix_length);
-            edit_list := ARRAY[input,prefix_key] || search.symspell_generate_edits(prefix_key, 1, l_maxED);
+            edit_list := ARRAY[input,prefix_key] || search.symspell_generate_edits(prefix_key, 1, maxED);
         ELSE
-            edit_list := input || search.symspell_generate_edits(input, 1, l_maxED);
+            edit_list := input || search.symspell_generate_edits(input, 1, maxED);
         END IF;
 
         SELECT ARRAY_AGG(x ORDER BY CHARACTER_LENGTH(x) DESC) INTO edit_list FROM UNNEST(edit_list) x;
@@ -1443,115 +1641,112 @@ BEGIN
             IF global_ed IS NOT NULL THEN
                 smallest_ed := global_ed;
             END IF;
-
             FOR entry IN EXECUTE
                 'SELECT  '||search_class||'_suggestions AS suggestions,
                          '||search_class||'_count AS count,
                          prefix_key
                    FROM  search.symspell_dictionary
                    WHERE prefix_key = $1
-                         AND '||search_class||'_suggestions IS NOT NULL'
+                         AND '||search_class||'_suggestions IS NOT NULL' 
                 USING entry_key
             LOOP
-
-                SELECT  HSTORE(
-                            ARRAY_AGG(
-                                ARRAY[s, evergreen.levenshtein_damerau_edistance(input,s,l_maxED)::TEXT]
-                                    ORDER BY evergreen.levenshtein_damerau_edistance(input,s,l_maxED) DESC
-                            )
-                        )
-                  INTO  good_suggs
-                  FROM  UNNEST(entry.suggestions) s
-                  WHERE (ABS(CHARACTER_LENGTH(s) - i_len) <= maxEd AND evergreen.levenshtein_damerau_edistance(input,s,l_maxED) BETWEEN 0 AND l_maxED)
-                        AND NOT seen_list @> ARRAY[s];
-
-                CONTINUE WHEN good_suggs IS NULL;
-
-                FOR sugg, output.suggestion_count IN EXECUTE
-                    'SELECT  prefix_key, '||search_class||'_count
-                       FROM  search.symspell_dictionary
-                       WHERE prefix_key = ANY ($1)
-                             AND '||search_class||'_count >= $2'
-                    USING AKEYS(good_suggs), COALESCE(count_threshold,1)
-                LOOP
-
-                    output.lev_distance := good_suggs->sugg;
-                    seen_list := seen_list || sugg;
-
-                    -- Track the smallest edit distance among suggestions from this prefix key.
-                    IF smallest_ed = -1 OR output.lev_distance < smallest_ed THEN
-                        smallest_ed := output.lev_distance;
-                    END IF;
-
-                    -- Track the smallest edit distance for all prefix keys for this word.
-                    IF global_ed IS NULL OR smallest_ed < global_ed THEN
-                        global_ed = smallest_ed;
-                        -- And if low verbosity, ignore suggs with a larger distance from here on.
-                        IF verbosity <= 1 THEN
-                            l_maxED := global_ed;
-                        END IF;
-                    END IF;
-
-                    -- Lev distance is our main similarity measure. While
-                    -- trgm or soundex similarity could be the main filter,
-                    -- Lev is both language agnostic and faster.
-                    --
-                    -- Here we will skip suggestions that have a longer edit distance
-                    -- than the shortest we've already found. This is simply an
-                    -- optimization that allows us to avoid further processing
-                    -- of this entry. It would be filtered out later.
-                    CONTINUE WHEN output.lev_distance > global_ed AND verbosity <= 1;
-
-                    -- If we have an exact match on the suggestion key we can also avoid
-                    -- some function calls.
-                    IF output.lev_distance = 0 THEN
-                        output.qwerty_kb_match := 1;
-                        output.pg_trgm_sim := 1;
-                        output.soundex_sim := 1;
-                    ELSE
-                        IF kbdist_weight THEN
-                            output.qwerty_kb_match := evergreen.qwerty_keyboard_distance_match(input, sugg);
+                FOREACH sugg IN ARRAY entry.suggestions LOOP
+                    IF NOT seen_list @> ARRAY[sugg] THEN
+                        seen_list := seen_list || sugg;
+                        IF input = sugg THEN -- exact match, no need to spend time on a call
+                            output.lev_distance := 0;
+                            output.suggestion_count = entry.count;
+                        ELSIF ABS(CHARACTER_LENGTH(input) - CHARACTER_LENGTH(sugg)) > maxED THEN
+                            -- They are definitionally too different to consider, just move on.
+                            CONTINUE;
                         ELSE
-                            output.qwerty_kb_match := 0;
+                            --output.lev_distance := levenshtein_less_equal(
+                            output.lev_distance := evergreen.levenshtein_damerau_edistance(
+                                input,
+                                sugg,
+                                maxED
+                            );
+                            IF output.lev_distance < 0 THEN
+                                -- The Perl module returns -1 for "more distant than max".
+                                output.lev_distance := maxED + 1;
+                                -- This short-circuit's the count test below for speed, bypassing
+                                -- a couple useless tests.
+                                output.suggestion_count := -1;
+                            ELSE
+                                EXECUTE 'SELECT '||search_class||'_count FROM search.symspell_dictionary WHERE prefix_key = $1'
+                                    INTO output.suggestion_count USING sugg;
+                            END IF;
                         END IF;
-                        IF pg_trgm_weight THEN
-                            output.pg_trgm_sim := similarity(input, sugg);
-                        ELSE
-                            output.pg_trgm_sim := 0;
+
+                        -- The caller passes a minimum suggestion count threshold (or uses
+                        -- the default of 0) and if the suggestion has that many or less uses
+                        -- then we move on to the next suggestion, since this one is too rare.
+                        CONTINUE WHEN output.suggestion_count < c_min_suggestion_use_threshold;
+
+                        -- Track the smallest edit distance among suggestions from this prefix key.
+                        IF smallest_ed = -1 OR output.lev_distance < smallest_ed THEN
+                            smallest_ed := output.lev_distance;
                         END IF;
-                        IF soundex_weight THEN
-                            output.soundex_sim := difference(input, sugg) / 4.0;
-                        ELSE
-                            output.soundex_sim := 0;
+
+                        -- Track the smallest edit distance for all prefix keys for this word.
+                        IF global_ed IS NULL OR smallest_ed < global_ed THEN
+                            global_ed = smallest_ed;
                         END IF;
-                    END IF;
-
-                    -- Fill in some fields
-                    IF xfer_case AND input <> word THEN
-                        output.suggestion := search.symspell_transfer_casing(word, sugg);
-                    ELSE
-                        output.suggestion := sugg;
-                    END IF;
-                    output.prefix_key := entry.prefix_key;
-                    output.prefix_key_count := entry.count;
-                    output.input := word;
-                    output.norm_input := input;
-                    output.word_pos := w_pos;
-
-                    -- We can't "cache" a set of generated records directly, so
-                    -- here we build up an array of search.symspell_lookup_output
-                    -- records that we can revivicate later as a table using UNNEST().
-                    output_list := output_list || output;
-
-                    EXIT entry_key_loop WHEN smallest_ed = 0 AND verbosity = 0; -- exact match early exit
-                    CONTINUE entry_key_loop WHEN smallest_ed = 0 AND verbosity = 1; -- exact match early jump to the next key
 
+                        -- Only proceed if the edit distance is <= the max for the dictionary.
+                        IF output.lev_distance <= maxED THEN
+                            IF output.lev_distance > global_ed AND c_symspell_suggestion_verbosity <= 1 THEN
+                                -- Lev distance is our main similarity measure. While
+                                -- trgm or soundex similarity could be the main filter,
+                                -- Lev is both language agnostic and faster.
+                                --
+                                -- Here we will skip suggestions that have a longer edit distance
+                                -- than the shortest we've already found. This is simply an
+                                -- optimization that allows us to avoid further processing
+                                -- of this entry. It would be filtered out later.
+
+                                CONTINUE;
+                            END IF;
+
+                            -- If we have an exact match on the suggestion key we can also avoid
+                            -- some function calls.
+                            IF output.lev_distance = 0 THEN
+                                output.qwerty_kb_match := 1;
+                                output.pg_trgm_sim := 1;
+                                output.soundex_sim := 1;
+                            ELSE
+                                output.qwerty_kb_match := evergreen.qwerty_keyboard_distance_match(input, sugg);
+                                output.pg_trgm_sim := similarity(input, sugg);
+                                output.soundex_sim := difference(input, sugg) / 4.0;
+                            END IF;
+
+                            -- Fill in some fields
+                            IF c_symspell_transfer_case THEN
+                                output.suggestion := search.symspell_transfer_casing(word, sugg);
+                            ELSE
+                                output.suggestion := sugg;
+                            END IF;
+                            output.prefix_key := entry.prefix_key;
+                            output.prefix_key_count := entry.count;
+                            output.input := word;
+                            output.norm_input := input;
+                            output.word_pos := w_pos;
+
+                            -- We can't "cache" a set of generated records directly, so
+                            -- here we build up an array of search.symspell_lookup_output
+                            -- records that we can revivicate later as a table using UNNEST().
+                            output_list := output_list || output;
+
+                            EXIT entry_key_loop WHEN smallest_ed = 0 AND c_symspell_suggestion_verbosity = 0; -- exact match early exit
+                            CONTINUE entry_key_loop WHEN smallest_ed = 0 AND c_symspell_suggestion_verbosity = 1; -- exact match early jump to the next key
+                        END IF; -- maxED test
+                    END IF; -- suggestion not seen test
                 END LOOP; -- loop over suggestions
             END LOOP; -- loop over entries
         END LOOP; -- loop over entry_keys
 
         -- Now we're done examining this word
-        IF verbosity = 0 THEN
+        IF c_symspell_suggestion_verbosity = 0 THEN
             -- Return the "best" suggestion from the smallest edit
             -- distance group.  We define best based on the weighting
             -- of the non-lev similarity measures and use the suggestion
@@ -1559,53 +1754,233 @@ BEGIN
             RETURN QUERY
                 SELECT * FROM UNNEST(output_list)
                     ORDER BY lev_distance,
-                        (soundex_sim * COALESCE(soundex_weight,0))
-                            + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
-                            + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
+                        (soundex_sim * c_soundex_weight)
+                            + (pg_trgm_sim * c_pg_trgm_weight)
+                            + (qwerty_kb_match * c_keyboard_distance_weight) DESC,
                         suggestion_count DESC
                         LIMIT 1;
-        ELSIF verbosity = 1 THEN
+        ELSIF c_symspell_suggestion_verbosity = 1 THEN
             -- Return all suggestions from the smallest
             -- edit distance group.
             RETURN QUERY
                 SELECT * FROM UNNEST(output_list) WHERE lev_distance = smallest_ed
-                    ORDER BY (soundex_sim * COALESCE(soundex_weight,0))
-                            + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
-                            + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
+                    ORDER BY (soundex_sim * c_soundex_weight)
+                            + (pg_trgm_sim * c_pg_trgm_weight)
+                            + (qwerty_kb_match * c_keyboard_distance_weight) DESC,
                         suggestion_count DESC;
-        ELSIF verbosity = 2 THEN
+        ELSIF c_symspell_suggestion_verbosity = 2 THEN
             -- Return everything we find, along with relevant stats
             RETURN QUERY
                 SELECT * FROM UNNEST(output_list)
                     ORDER BY lev_distance,
-                        (soundex_sim * COALESCE(soundex_weight,0))
-                            + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
-                            + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
+                        (soundex_sim * c_soundex_weight)
+                            + (pg_trgm_sim * c_pg_trgm_weight)
+                            + (qwerty_kb_match * c_keyboard_distance_weight) DESC,
                         suggestion_count DESC;
-        ELSIF verbosity = 3 THEN
+        ELSIF c_symspell_suggestion_verbosity = 3 THEN
             -- Return everything we find from the two smallest edit distance groups
             RETURN QUERY
                 SELECT * FROM UNNEST(output_list)
                     WHERE lev_distance IN (SELECT DISTINCT lev_distance FROM UNNEST(output_list) ORDER BY 1 LIMIT 2)
                     ORDER BY lev_distance,
-                        (soundex_sim * COALESCE(soundex_weight,0))
-                            + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
-                            + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
+                        (soundex_sim * c_soundex_weight)
+                            + (pg_trgm_sim * c_pg_trgm_weight)
+                            + (qwerty_kb_match * c_keyboard_distance_weight) DESC,
                         suggestion_count DESC;
-        ELSIF verbosity = 4 THEN
+        ELSIF c_symspell_suggestion_verbosity = 4 THEN
             -- Return everything we find from the two smallest edit distance groups that are NOT 0 distance
             RETURN QUERY
                 SELECT * FROM UNNEST(output_list)
                     WHERE lev_distance IN (SELECT DISTINCT lev_distance FROM UNNEST(output_list) WHERE lev_distance > 0 ORDER BY 1 LIMIT 2)
                     ORDER BY lev_distance,
-                        (soundex_sim * COALESCE(soundex_weight,0))
-                            + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
-                            + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
+                        (soundex_sim * c_soundex_weight)
+                            + (pg_trgm_sim * c_pg_trgm_weight)
+                            + (qwerty_kb_match * c_keyboard_distance_weight) DESC,
                         suggestion_count DESC;
         END IF;
     END LOOP; -- loop over words
 END;
-$function$;
+$F$ LANGUAGE PLPGSQL;
+
+CREATE OR REPLACE FUNCTION search.symspell_suggest (
+    raw_input       TEXT,
+    search_class    TEXT,
+    search_fields   TEXT[] DEFAULT '{}',
+    max_ed          INT DEFAULT NULL,      -- per word, on average, between norm input and suggestion
+    verbosity       INT DEFAULT NULL,      -- 0=Best only; 1=
+    skip_correct    BOOL DEFAULT NULL,  -- only suggest replacement words for misspellings?
+    max_word_opts   INT DEFAULT NULL,   -- 0 means all combinations, probably want to restrict?
+    count_threshold INT DEFAULT NULL    -- min count of records using the terms
+) RETURNS SETOF search.symspell_lookup_output AS $F$
+DECLARE
+    sugg_set         search.symspell_lookup_output[];
+    parsed_query_set search.query_parse_position[];
+    entry            RECORD;
+    auth_entry       RECORD;
+    norm_count       RECORD;
+    current_sugg     RECORD;
+    auth_sugg        RECORD;
+    norm_test        TEXT;
+    norm_input       TEXT;
+    norm_sugg        TEXT;
+    query_part       TEXT := '';
+    output           search.symspell_lookup_output;
+    c_skip_correct                  BOOL;
+    c_variant_authority_suggestion  BOOL;
+    c_symspell_transfer_case        BOOL;
+    c_authority_class_restrict      BOOL;
+    c_min_suggestion_use_threshold  INT;
+    c_soundex_weight                INT;
+    c_pg_trgm_weight                INT;
+    c_keyboard_distance_weight      INT;
+    c_suggestion_word_option_count  INT;
+    c_symspell_suggestion_verbosity INT;
+    c_max_phrase_edit_distance      INT;
+BEGIN
+
+    -- Gather settings
+    SELECT  cmc.min_suggestion_use_threshold,
+            cmc.soundex_weight,
+            cmc.pg_trgm_weight,
+            cmc.keyboard_distance_weight,
+            cmc.suggestion_word_option_count,
+            cmc.symspell_suggestion_verbosity,
+            cmc.symspell_skip_correct,
+            cmc.symspell_transfer_case,
+            cmc.max_phrase_edit_distance,
+            cmc.variant_authority_suggestion,
+            cmc.restrict
+      INTO  c_min_suggestion_use_threshold,
+            c_soundex_weight,
+            c_pg_trgm_weight,
+            c_keyboard_distance_weight,
+            c_suggestion_word_option_count,
+            c_symspell_suggestion_verbosity,
+            c_skip_correct,
+            c_symspell_transfer_case,
+            c_max_phrase_edit_distance,
+            c_variant_authority_suggestion,
+            c_authority_class_restrict
+      FROM  config.metabib_class cmc
+      WHERE cmc.name = search_class;
+
+
+    -- Set up variables to use at run time based on params and settings
+    c_min_suggestion_use_threshold := COALESCE(count_threshold,c_min_suggestion_use_threshold);
+    c_max_phrase_edit_distance := COALESCE(max_ed,c_max_phrase_edit_distance);
+    c_symspell_suggestion_verbosity := COALESCE(verbosity,c_symspell_suggestion_verbosity);
+    c_suggestion_word_option_count := COALESCE(max_word_opts,c_suggestion_word_option_count);
+    c_skip_correct := COALESCE(skip_correct,c_skip_correct);
+
+    SELECT  ARRAY_AGG(
+                x ORDER BY  x.word_pos,
+                            x.lev_distance,
+                            (x.soundex_sim * c_soundex_weight)
+                                + (x.pg_trgm_sim * c_pg_trgm_weight)
+                                + (x.qwerty_kb_match * c_keyboard_distance_weight) DESC,
+                            x.suggestion_count DESC
+            ) INTO sugg_set
+      FROM  search.symspell_lookup(
+                raw_input,
+                search_class,
+                c_symspell_suggestion_verbosity,
+                c_symspell_transfer_case,
+                c_min_suggestion_use_threshold,
+                c_soundex_weight,
+                c_pg_trgm_weight,
+                c_keyboard_distance_weight
+            ) x
+      WHERE x.lev_distance <= c_max_phrase_edit_distance;
+
+    SELECT ARRAY_AGG(x) INTO parsed_query_set FROM search.query_parse_positions(raw_input) x;
+
+    IF search_fields IS NOT NULL AND CARDINALITY(search_fields) > 0 THEN
+        SELECT STRING_AGG(id::TEXT,',') INTO query_part FROM config.metabib_field WHERE name = ANY (search_fields);
+        IF CHARACTER_LENGTH(query_part) > 0 THEN query_part := 'AND field IN ('||query_part||')'; END IF;
+    END IF;
+
+    SELECT STRING_AGG(word,' ') INTO norm_input FROM search.query_parse_positions(evergreen.lowercase(raw_input)) WHERE NOT negated;
+    EXECUTE 'SELECT  COUNT(DISTINCT source) AS recs
+               FROM  metabib.' || search_class || '_field_entry
+               WHERE index_vector @@ plainto_tsquery($$simple$$,$1)' || query_part
+            INTO norm_count USING norm_input;
+
+    SELECT STRING_AGG(word,' ') INTO norm_test FROM UNNEST(parsed_query_set);
+    FOR current_sugg IN
+        SELECT  *
+          FROM  search.symspell_generate_combined_suggestions(
+                    sugg_set,
+                    parsed_query_set,
+                    c_skip_correct,
+                    c_suggestion_word_option_count
+                ) x
+    LOOP
+        EXECUTE 'SELECT  COUNT(DISTINCT source) AS recs
+                   FROM  metabib.' || search_class || '_field_entry
+                   WHERE index_vector @@ to_tsquery($$simple$$,$1)' || query_part
+                INTO entry USING current_sugg.test;
+        SELECT STRING_AGG(word,' ') INTO norm_sugg FROM search.query_parse_positions(current_sugg.suggestion);
+        IF entry.recs >= c_min_suggestion_use_threshold AND (norm_count.recs = 0 OR norm_sugg <> norm_input) THEN
+
+            output.input := raw_input;
+            output.norm_input := norm_input;
+            output.suggestion := current_sugg.suggestion;
+            output.suggestion_count := entry.recs;
+            output.prefix_key := NULL;
+            output.prefix_key_count := norm_count.recs;
+
+            output.lev_distance := NULLIF(evergreen.levenshtein_damerau_edistance(norm_test, norm_sugg, c_max_phrase_edit_distance * CARDINALITY(parsed_query_set)), -1);
+            output.qwerty_kb_match := evergreen.qwerty_keyboard_distance_match(norm_test, norm_sugg);
+            output.pg_trgm_sim := similarity(norm_input, norm_sugg);
+            output.soundex_sim := difference(norm_input, norm_sugg) / 4.0;
+
+            RETURN NEXT output;
+        END IF;
+
+        IF c_variant_authority_suggestion THEN
+            FOR auth_sugg IN
+                SELECT  DISTINCT m.value AS prefix_key,
+                        m.sort_value AS suggestion,
+                        v.value as raw_input,
+                        v.sort_value as norm_input
+                  FROM  authority.simple_heading v
+                        JOIN authority.control_set_authority_field csaf ON (csaf.id = v.atag)
+                        JOIN authority.heading_field f ON (f.id = csaf.heading_field)
+                        JOIN authority.simple_heading m ON (m.record = v.record AND csaf.main_entry = m.atag)
+                        JOIN authority.control_set_bib_field csbf ON (csbf.authority_field = csaf.main_entry)
+                        JOIN authority.control_set_bib_field_metabib_field_map csbfmfm ON (csbf.id = csbfmfm.bib_field)
+                        JOIN config.metabib_field cmf ON (
+                                csbfmfm.metabib_field = cmf.id
+                                AND (c_authority_class_restrict IS FALSE OR cmf.field_class = search_class)
+                                AND (search_fields = '{}'::TEXT[] OR cmf.name = ANY (search_fields))
+                        )
+                  WHERE v.sort_value = norm_sugg
+            LOOP
+                EXECUTE 'SELECT  COUNT(DISTINCT source) AS recs
+                           FROM  metabib.' || search_class || '_field_entry
+                           WHERE index_vector @@ plainto_tsquery($$simple$$,$1)' || query_part
+                        INTO auth_entry USING auth_sugg.suggestion;
+                IF auth_entry.recs >= c_min_suggestion_use_threshold AND (norm_count.recs = 0 OR auth_sugg.suggestion <> norm_input) THEN
+                    output.input := auth_sugg.raw_input;
+                    output.norm_input := auth_sugg.norm_input;
+                    output.suggestion := auth_sugg.suggestion;
+                    output.prefix_key := auth_sugg.prefix_key;
+                    output.suggestion_count := auth_entry.recs * -1; -- negative value here 
+
+                    output.lev_distance := 0;
+                    output.qwerty_kb_match := 0;
+                    output.pg_trgm_sim := 0;
+                    output.soundex_sim := 0;
+
+                    RETURN NEXT output;
+                END IF;
+            END LOOP;
+        END IF;
+    END LOOP;
+
+    RETURN;
+END;
+$F$ LANGUAGE PLPGSQL;
 
 CREATE OR REPLACE FUNCTION search.symspell_build_raw_entry (
     raw_input       TEXT,
diff --git a/Open-ILS/src/sql/Pg/950.data.seed-values.sql b/Open-ILS/src/sql/Pg/950.data.seed-values.sql

index ee9e110..20ba28c 100644 (file)
--- a/Open-ILS/src/sql/Pg/950.data.seed-values.sql
+++ b/Open-ILS/src/sql/Pg/950.data.seed-values.sql
@@ -95,8 +95,8 @@ INSERT INTO config.standing_penalty (id, name, label, staff_alert, org_depth) VA
 
 SELECT SETVAL('config.standing_penalty_id_seq', 100);
 
-INSERT INTO config.metabib_class ( name, label ) VALUES ( 'identifier', oils_i18n_gettext('identifier', 'Identifier', 'cmc', 'label') );
-INSERT INTO config.metabib_class ( name, label ) VALUES ( 'keyword', oils_i18n_gettext('keyword', 'Keyword', 'cmc', 'label') );
+INSERT INTO config.metabib_class ( name, label, variant_authority_suggestion ) VALUES ( 'identifier', oils_i18n_gettext('identifier', 'Identifier', 'cmc', 'label'), FALSE );
+INSERT INTO config.metabib_class ( name, label, variant_authority_suggestion ) VALUES ( 'keyword', oils_i18n_gettext('keyword', 'Keyword', 'cmc', 'label'), FALSE );
 INSERT INTO config.metabib_class ( name, label ) VALUES ( 'title', oils_i18n_gettext('title', 'Title', 'cmc', 'label'));
 INSERT INTO config.metabib_class ( name, label ) VALUES ( 'author', oils_i18n_gettext('author', 'Author', 'cmc', 'label'));
 INSERT INTO config.metabib_class ( name, label ) VALUES ( 'subject', oils_i18n_gettext('subject', 'Subject', 'cmc', 'label') );
@@ -21805,90 +21805,6 @@ VALUES (
 INSERT INTO config.internal_flag (name, value, enabled) VALUES ('symspell.prefix_length', '6', TRUE);
 INSERT INTO config.internal_flag (name, value, enabled) VALUES ('symspell.max_edit_distance', '3', TRUE);
 
-INSERT into config.org_unit_setting_type
-( name, grp, label, description, datatype )
-VALUES
-( 'opac.did_you_mean.max_suggestions', 'opac',
-   oils_i18n_gettext(
-     'opac.did_you_mean.max_suggestions',
-     'Maximum number of spelling suggestions that may be offered',
-     'coust', 'label'),
-   oils_i18n_gettext(
-     'opac.did_you_mean.max_suggestions',
-     'If set to -1, provide "best" suggestion if mispelled; if set higher than 0, the maximum suggestions that can be provided; if set to 0, disable suggestions.',
-     'coust', 'description'),
-   'integer' );
-
-INSERT into config.org_unit_setting_type
-( name, grp, label, description, datatype )
-VALUES
-( 'opac.did_you_mean.low_result_threshold', 'opac',
-   oils_i18n_gettext(
-     'opac.did_you_mean.low_result_threshold',
-     'Maximum search result count at which spelling suggestions may be offered',
-     'coust', 'label'),
-   oils_i18n_gettext(
-     'opac.did_you_mean.low_result_threshold',
-     'If a search results in this number or fewer results, and there are correctable spelling mistakes, a suggested search may be provided.',
-     'coust', 'description'),
-   'integer' );
-
-INSERT into config.org_unit_setting_type
-( name, grp, label, description, datatype )
-VALUES
-( 'search.symspell.min_suggestion_use_threshold', 'opac',
-   oils_i18n_gettext(
-     'search.symspell.min_suggestion_use_threshold',
-     'Minimum required uses of a spelling suggestions that may be offered',
-     'coust', 'label'),
-   oils_i18n_gettext(
-     'search.symspell.min_suggestion_use_threshold',
-     'The number of bibliographic records (more or less) that a spelling suggestion must appear in to be considered before offering it to a user. Defaults to 1 (must appear in the bib data).',
-     'coust', 'description'),
-   'integer' );
-
-INSERT into config.org_unit_setting_type
-( name, grp, label, description, datatype )
-VALUES
-( 'search.symspell.soundex.weight', 'opac',
-   oils_i18n_gettext(
-     'search.symspell.soundex.weight',
-     'Soundex score weighting in OPAC spelling suggestions.',
-     'coust', 'label'),
-   oils_i18n_gettext(
-     'search.symspell.soundex.weight',
-     'Soundex, trgm, and keyboard distance similarity measures can be combined to form a secondary ordering parameter for spelling suggestions. This controls the relative weight of the scaled soundex component. Defaults to 0 for "off".',
-     'coust', 'description'),
-   'integer' );
-
-INSERT into config.org_unit_setting_type
-( name, grp, label, description, datatype )
-VALUES
-( 'search.symspell.pg_trgm.weight', 'opac',
-   oils_i18n_gettext(
-     'search.symspell.pg_trgm.weight',
-     'Pg_trgm score weighting in OPAC spelling suggestions.',
-     'coust', 'label'),
-   oils_i18n_gettext(
-     'search.symspell.pg_trgm.weight',
-     'Soundex, pg_trgm, and keyboard distance similarity measures can be combined to form a secondary ordering parameter for spelling suggestions. This controls the relative weight of the scaled pg_trgm component. Defaults to 0 for "off".',
-     'coust', 'description'),
-   'integer' );
-
-INSERT into config.org_unit_setting_type
-( name, grp, label, description, datatype )
-VALUES
-( 'search.symspell.keyboard_distance.weight', 'opac',
-   oils_i18n_gettext(
-     'search.symspell.keyboard_distance.weight',
-     'Keyboard distance score weighting in OPAC spelling suggestions.',
-     'coust', 'label'),
-   oils_i18n_gettext(
-     'search.symspell.keyboard_distance.weight',
-     'Soundex, trgm, and keyboard distance similarity measures can be combined to form a secondary ordering parameter for spelling suggestions. This controls the relative weight of the scaled keyboard distance component. Defaults to 0 for "off".',
-     'coust', 'description'),
-   'integer' );
-
 INSERT INTO config.workstation_setting_type (name, grp, datatype, label)
 VALUES (
     'eg.staff.catalog.results.show_more', 'gui', 'bool',
author	Mike Rylander <mrylander@gmail.com>
	Fri, 31 Mar 2023 19:01:28 +0000 (15:01 -0400)
committer	Galen Charlton <gmc@equinoxOLI.org>
	Thu, 27 Apr 2023 18:00:27 +0000 (18:00 +0000)
Open-ILS/src/sql/Pg/002.schema.config.sql		patch \| blob \| history
Open-ILS/src/sql/Pg/300.schema.staged_search.sql		patch \| blob \| history
Open-ILS/src/sql/Pg/950.data.seed-values.sql		patch \| blob \| history