From cc9af130212128a4941526da9f79938001adf140 Mon Sep 17 00:00:00 2001 From: Dan Wells Date: Tue, 4 Sep 2012 17:09:44 -0400 Subject: [PATCH] Upgrade bits for Vandelay Overlay Changes Signed-off-by: Dan Wells --- Open-ILS/src/sql/Pg/002.schema.config.sql | 2 +- ...38.schema.vandelay.import-match-no-like-any.sql | 249 ++++++++++++++++++++ ...XX.schema.vandelay.import-match-no-like-any.sql | 249 -------------------- 3 files changed, 250 insertions(+), 250 deletions(-) create mode 100644 Open-ILS/src/sql/Pg/upgrade/0738.schema.vandelay.import-match-no-like-any.sql delete mode 100644 Open-ILS/src/sql/Pg/upgrade/XXXX.schema.vandelay.import-match-no-like-any.sql diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql index 8439b14..e1f8381 100644 --- a/Open-ILS/src/sql/Pg/002.schema.config.sql +++ b/Open-ILS/src/sql/Pg/002.schema.config.sql @@ -87,7 +87,7 @@ CREATE TRIGGER no_overlapping_deps BEFORE INSERT OR UPDATE ON config.db_patch_dependencies FOR EACH ROW EXECUTE PROCEDURE evergreen.array_overlap_check ('deprecates'); -INSERT INTO config.upgrade_log (version, applied_to) VALUES ('0737', :eg_version); -- dyrcona/dbs +INSERT INTO config.upgrade_log (version, applied_to) VALUES ('0738', :eg_version); -- senator/dbwells CREATE TABLE config.bib_source ( id SERIAL PRIMARY KEY, diff --git a/Open-ILS/src/sql/Pg/upgrade/0738.schema.vandelay.import-match-no-like-any.sql b/Open-ILS/src/sql/Pg/upgrade/0738.schema.vandelay.import-match-no-like-any.sql new file mode 100644 index 0000000..e00cb4d --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/0738.schema.vandelay.import-match-no-like-any.sql @@ -0,0 +1,249 @@ +BEGIN; + +-- 0738.schema.vandelay.import-match-no-like-any.sql + +SELECT evergreen.upgrade_deps_block_check('0738', :eg_version); + +CREATE OR REPLACE FUNCTION vandelay.match_set_test_marcxml( + match_set_id INTEGER, record_xml TEXT +) RETURNS SETOF vandelay.match_set_test_result AS $$ +DECLARE + tags_rstore HSTORE; + svf_rstore HSTORE; + coal TEXT; + joins TEXT; + query_ TEXT; + wq TEXT; + qvalue INTEGER; + rec RECORD; +BEGIN + tags_rstore := vandelay.flatten_marc_hstore(record_xml); + svf_rstore := vandelay.extract_rec_attrs(record_xml); + + CREATE TEMPORARY TABLE _vandelay_tmp_qrows (q INTEGER); + CREATE TEMPORARY TABLE _vandelay_tmp_jrows (j TEXT); + + -- generate the where clause and return that directly (into wq), and as + -- a side-effect, populate the _vandelay_tmp_[qj]rows tables. + wq := vandelay.get_expr_from_match_set(match_set_id, tags_rstore); + + query_ := 'SELECT DISTINCT(record), '; + + -- qrows table is for the quality bits we add to the SELECT clause + SELECT ARRAY_TO_STRING( + ARRAY_ACCUM('COALESCE(n' || q::TEXT || '.quality, 0)'), ' + ' + ) INTO coal FROM _vandelay_tmp_qrows; + + -- our query string so far is the SELECT clause and the inital FROM. + -- no JOINs yet nor the WHERE clause + query_ := query_ || coal || ' AS quality ' || E'\n'; + + -- jrows table is for the joins we must make (and the real text conditions) + SELECT ARRAY_TO_STRING(ARRAY_ACCUM(j), E'\n') INTO joins + FROM _vandelay_tmp_jrows; + + -- add those joins and the where clause to our query. + query_ := query_ || joins || E'\n' || 'JOIN biblio.record_entry bre ON (bre.id = record) ' || 'WHERE ' || wq || ' AND not bre.deleted'; + + -- this will return rows of record,quality + FOR rec IN EXECUTE query_ USING tags_rstore, svf_rstore LOOP + RETURN NEXT rec; + END LOOP; + + DROP TABLE _vandelay_tmp_qrows; + DROP TABLE _vandelay_tmp_jrows; + RETURN; +END; + +$$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION vandelay.get_expr_from_match_set( + match_set_id INTEGER, + tags_rstore HSTORE +) RETURNS TEXT AS $$ +DECLARE + root vandelay.match_set_point; +BEGIN + SELECT * INTO root FROM vandelay.match_set_point + WHERE parent IS NULL AND match_set = match_set_id; + + RETURN vandelay.get_expr_from_match_set_point(root, tags_rstore); +END; +$$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION vandelay.get_expr_from_match_set_point( + node vandelay.match_set_point, + tags_rstore HSTORE +) RETURNS TEXT AS $$ +DECLARE + q TEXT; + i INTEGER; + this_op TEXT; + children INTEGER[]; + child vandelay.match_set_point; +BEGIN + SELECT ARRAY_ACCUM(id) INTO children FROM vandelay.match_set_point + WHERE parent = node.id; + + IF ARRAY_LENGTH(children, 1) > 0 THEN + this_op := vandelay._get_expr_render_one(node); + q := '('; + i := 1; + WHILE children[i] IS NOT NULL LOOP + SELECT * INTO child FROM vandelay.match_set_point + WHERE id = children[i]; + IF i > 1 THEN + q := q || ' ' || this_op || ' '; + END IF; + i := i + 1; + q := q || vandelay.get_expr_from_match_set_point(child, tags_rstore); + END LOOP; + q := q || ')'; + RETURN q; + ELSIF node.bool_op IS NULL THEN + PERFORM vandelay._get_expr_push_qrow(node); + PERFORM vandelay._get_expr_push_jrow(node, tags_rstore); + RETURN vandelay._get_expr_render_one(node); + ELSE + RETURN ''; + END IF; +END; +$$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION vandelay._get_expr_push_jrow( + node vandelay.match_set_point, + tags_rstore HSTORE +) RETURNS VOID AS $$ +DECLARE + jrow TEXT; + my_alias TEXT; + op TEXT; + tagkey TEXT; + caseless BOOL; + jrow_count INT; + my_using TEXT; + my_join TEXT; +BEGIN + -- remember $1 is tags_rstore, and $2 is svf_rstore + + caseless := FALSE; + SELECT COUNT(*) INTO jrow_count FROM _vandelay_tmp_jrows; + IF jrow_count > 0 THEN + my_using := ' USING (record)'; + my_join := 'FULL OUTER JOIN'; + ELSE + my_using := ''; + my_join := 'FROM'; + END IF; + + IF node.tag IS NOT NULL THEN + caseless := (node.tag IN ('020', '022', '024')); + tagkey := node.tag; + IF node.subfield IS NOT NULL THEN + tagkey := tagkey || node.subfield; + END IF; + END IF; + + IF node.negate THEN + IF caseless THEN + op := 'NOT LIKE'; + ELSE + op := '<>'; + END IF; + ELSE + IF caseless THEN + op := 'LIKE'; + ELSE + op := '='; + END IF; + END IF; + + my_alias := 'n' || node.id::TEXT; + + jrow := my_join || ' (SELECT *, '; + IF node.tag IS NOT NULL THEN + jrow := jrow || node.quality || + ' AS quality FROM metabib.full_rec mfr WHERE mfr.tag = ''' || + node.tag || ''''; + IF node.subfield IS NOT NULL THEN + jrow := jrow || ' AND mfr.subfield = ''' || + node.subfield || ''''; + END IF; + jrow := jrow || ' AND ('; + jrow := jrow || vandelay._node_tag_comparisons(caseless, op, tags_rstore, tagkey); + jrow := jrow || ')) ' || my_alias || my_using || E'\n'; + ELSE -- svf + jrow := jrow || 'id AS record, ' || node.quality || + ' AS quality FROM metabib.record_attr mra WHERE mra.attrs->''' || + node.svf || ''' ' || op || ' $2->''' || node.svf || ''') ' || + my_alias || my_using || E'\n'; + END IF; + INSERT INTO _vandelay_tmp_jrows (j) VALUES (jrow); +END; +$$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION vandelay._node_tag_comparisons( + caseless BOOLEAN, + op TEXT, + tags_rstore HSTORE, + tagkey TEXT +) RETURNS TEXT AS $$ +DECLARE + result TEXT; + i INT; + vals TEXT[]; +BEGIN + i := 1; + vals := tags_rstore->tagkey; + result := ''; + + WHILE TRUE LOOP + IF i > 1 THEN + IF vals[i] IS NULL THEN + EXIT; + ELSE + result := result || ' OR '; + END IF; + END IF; + + IF caseless THEN + result := result || 'LOWER(mfr.value) ' || op; + ELSE + result := result || 'mfr.value ' || op; + END IF; + + result := result || ' ' || COALESCE('''' || vals[i] || '''', 'NULL'); + + IF vals[i] IS NULL THEN + EXIT; + END IF; + i := i + 1; + END LOOP; + + RETURN result; + +END; +$$ LANGUAGE PLPGSQL; + +-- drop old versions of these functions with fewer args +DROP FUNCTION vandelay.get_expr_from_match_set( INTEGER ); +DROP FUNCTION vandelay.get_expr_from_match_set_point( vandelay.match_set_point ); +DROP FUNCTION vandelay._get_expr_push_jrow( vandelay.match_set_point ); + +-- This next index might fully supplant an existing one but leaving both for now +-- (they are not too large) +-- The reason we need this index is to ensure that the query parser always +-- prefers this index over the simpler tag/subfield index, as this greatly +-- increases Vandelay overlay speed for these identifiers, especially when +-- a record has many of these fields (around > 4-6 seems like the cutoff +-- on at least one PG9.1 system) +-- A similar index could be added for other fields (e.g. 010), but one should +-- leave out the LOWER() in all other cases. +-- TODO: verify whether we can discard the non tag/subfield/substring version +-- (metabib_full_rec_isxn_caseless_idx) +CREATE INDEX metabib_full_rec_02x_tag_subfield_lower_substring + ON metabib.real_full_rec (tag, subfield, LOWER(substring(value, 1, 1024))) + WHERE tag IN ('020', '022', '024'); + +COMMIT; + diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.vandelay.import-match-no-like-any.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.vandelay.import-match-no-like-any.sql deleted file mode 100644 index 1ec6032..0000000 --- a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.vandelay.import-match-no-like-any.sql +++ /dev/null @@ -1,249 +0,0 @@ -BEGIN; - --- XXXX.schema.vandelay.import-match-no-like-any.sql - -SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version); - -CREATE OR REPLACE FUNCTION vandelay.match_set_test_marcxml( - match_set_id INTEGER, record_xml TEXT -) RETURNS SETOF vandelay.match_set_test_result AS $$ -DECLARE - tags_rstore HSTORE; - svf_rstore HSTORE; - coal TEXT; - joins TEXT; - query_ TEXT; - wq TEXT; - qvalue INTEGER; - rec RECORD; -BEGIN - tags_rstore := vandelay.flatten_marc_hstore(record_xml); - svf_rstore := vandelay.extract_rec_attrs(record_xml); - - CREATE TEMPORARY TABLE _vandelay_tmp_qrows (q INTEGER); - CREATE TEMPORARY TABLE _vandelay_tmp_jrows (j TEXT); - - -- generate the where clause and return that directly (into wq), and as - -- a side-effect, populate the _vandelay_tmp_[qj]rows tables. - wq := vandelay.get_expr_from_match_set(match_set_id, tags_rstore); - - query_ := 'SELECT DISTINCT(record), '; - - -- qrows table is for the quality bits we add to the SELECT clause - SELECT ARRAY_TO_STRING( - ARRAY_ACCUM('COALESCE(n' || q::TEXT || '.quality, 0)'), ' + ' - ) INTO coal FROM _vandelay_tmp_qrows; - - -- our query string so far is the SELECT clause and the inital FROM. - -- no JOINs yet nor the WHERE clause - query_ := query_ || coal || ' AS quality ' || E'\n'; - - -- jrows table is for the joins we must make (and the real text conditions) - SELECT ARRAY_TO_STRING(ARRAY_ACCUM(j), E'\n') INTO joins - FROM _vandelay_tmp_jrows; - - -- add those joins and the where clause to our query. - query_ := query_ || joins || E'\n' || 'JOIN biblio.record_entry bre ON (bre.id = record) ' || 'WHERE ' || wq || ' AND not bre.deleted'; - - -- this will return rows of record,quality - FOR rec IN EXECUTE query_ USING tags_rstore, svf_rstore LOOP - RETURN NEXT rec; - END LOOP; - - DROP TABLE _vandelay_tmp_qrows; - DROP TABLE _vandelay_tmp_jrows; - RETURN; -END; - -$$ LANGUAGE PLPGSQL; - -CREATE OR REPLACE FUNCTION vandelay.get_expr_from_match_set( - match_set_id INTEGER, - tags_rstore HSTORE -) RETURNS TEXT AS $$ -DECLARE - root vandelay.match_set_point; -BEGIN - SELECT * INTO root FROM vandelay.match_set_point - WHERE parent IS NULL AND match_set = match_set_id; - - RETURN vandelay.get_expr_from_match_set_point(root, tags_rstore); -END; -$$ LANGUAGE PLPGSQL; - -CREATE OR REPLACE FUNCTION vandelay.get_expr_from_match_set_point( - node vandelay.match_set_point, - tags_rstore HSTORE -) RETURNS TEXT AS $$ -DECLARE - q TEXT; - i INTEGER; - this_op TEXT; - children INTEGER[]; - child vandelay.match_set_point; -BEGIN - SELECT ARRAY_ACCUM(id) INTO children FROM vandelay.match_set_point - WHERE parent = node.id; - - IF ARRAY_LENGTH(children, 1) > 0 THEN - this_op := vandelay._get_expr_render_one(node); - q := '('; - i := 1; - WHILE children[i] IS NOT NULL LOOP - SELECT * INTO child FROM vandelay.match_set_point - WHERE id = children[i]; - IF i > 1 THEN - q := q || ' ' || this_op || ' '; - END IF; - i := i + 1; - q := q || vandelay.get_expr_from_match_set_point(child, tags_rstore); - END LOOP; - q := q || ')'; - RETURN q; - ELSIF node.bool_op IS NULL THEN - PERFORM vandelay._get_expr_push_qrow(node); - PERFORM vandelay._get_expr_push_jrow(node, tags_rstore); - RETURN vandelay._get_expr_render_one(node); - ELSE - RETURN ''; - END IF; -END; -$$ LANGUAGE PLPGSQL; - -CREATE OR REPLACE FUNCTION vandelay._get_expr_push_jrow( - node vandelay.match_set_point, - tags_rstore HSTORE -) RETURNS VOID AS $$ -DECLARE - jrow TEXT; - my_alias TEXT; - op TEXT; - tagkey TEXT; - caseless BOOL; - jrow_count INT; - my_using TEXT; - my_join TEXT; -BEGIN - -- remember $1 is tags_rstore, and $2 is svf_rstore - - caseless := FALSE; - SELECT COUNT(*) INTO jrow_count FROM _vandelay_tmp_jrows; - IF jrow_count > 0 THEN - my_using := ' USING (record)'; - my_join := 'FULL OUTER JOIN'; - ELSE - my_using := ''; - my_join := 'FROM'; - END IF; - - IF node.tag IS NOT NULL THEN - caseless := (node.tag IN ('020', '022', '024')); - tagkey := node.tag; - IF node.subfield IS NOT NULL THEN - tagkey := tagkey || node.subfield; - END IF; - END IF; - - IF node.negate THEN - IF caseless THEN - op := 'NOT LIKE'; - ELSE - op := '<>'; - END IF; - ELSE - IF caseless THEN - op := 'LIKE'; - ELSE - op := '='; - END IF; - END IF; - - my_alias := 'n' || node.id::TEXT; - - jrow := my_join || ' (SELECT *, '; - IF node.tag IS NOT NULL THEN - jrow := jrow || node.quality || - ' AS quality FROM metabib.full_rec mfr WHERE mfr.tag = ''' || - node.tag || ''''; - IF node.subfield IS NOT NULL THEN - jrow := jrow || ' AND mfr.subfield = ''' || - node.subfield || ''''; - END IF; - jrow := jrow || ' AND ('; - jrow := jrow || vandelay._node_tag_comparisons(caseless, op, tags_rstore, tagkey); - jrow := jrow || ')) ' || my_alias || my_using || E'\n'; - ELSE -- svf - jrow := jrow || 'id AS record, ' || node.quality || - ' AS quality FROM metabib.record_attr mra WHERE mra.attrs->''' || - node.svf || ''' ' || op || ' $2->''' || node.svf || ''') ' || - my_alias || my_using || E'\n'; - END IF; - INSERT INTO _vandelay_tmp_jrows (j) VALUES (jrow); -END; -$$ LANGUAGE PLPGSQL; - -CREATE OR REPLACE FUNCTION vandelay._node_tag_comparisons( - caseless BOOLEAN, - op TEXT, - tags_rstore HSTORE, - tagkey TEXT -) RETURNS TEXT AS $$ -DECLARE - result TEXT; - i INT; - vals TEXT[]; -BEGIN - i := 1; - vals := tags_rstore->tagkey; - result := ''; - - WHILE TRUE LOOP - IF i > 1 THEN - IF vals[i] IS NULL THEN - EXIT; - ELSE - result := result || ' OR '; - END IF; - END IF; - - IF caseless THEN - result := result || 'LOWER(mfr.value) ' || op; - ELSE - result := result || 'mfr.value ' || op; - END IF; - - result := result || ' ' || COALESCE('''' || vals[i] || '''', 'NULL'); - - IF vals[i] IS NULL THEN - EXIT; - END IF; - i := i + 1; - END LOOP; - - RETURN result; - -END; -$$ LANGUAGE PLPGSQL; - --- drop old versions of these functions with fewer args -DROP FUNCTION vandelay.get_expr_from_match_set( INTEGER ); -DROP FUNCTION vandelay.get_expr_from_match_set_point( vandelay.match_set_point ); -DROP FUNCTION vandelay._get_expr_push_jrow( vandelay.match_set_point ); - --- This next index might fully supplant an existing one but leaving both for now --- (they are not too large) --- The reason we need this index is to ensure that the query parser always --- prefers this index over the simpler tag/subfield index, as this greatly --- increases Vandelay overlay speed for these identifiers, especially when --- a record has many of these fields (around > 4-6 seems like the cutoff --- on at least one PG9.1 system) --- A similar index could be added for other fields (e.g. 010), but one should --- leave out the LOWER() in all other cases. --- TODO: verify whether we can discard the non tag/subfield/substring version --- (metabib_full_rec_isxn_caseless_idx) -CREATE INDEX metabib_full_rec_02x_tag_subfield_lower_substring - ON metabib.real_full_rec (tag, subfield, LOWER(substring(value, 1, 1024))) - WHERE tag IN ('020', '022', '024'); - -COMMIT; - -- 1.7.2.5