LP#1998355: reduce growth of DYM dictionary
authorGalen Charlton <gmc@equinoxOLI.org>
Wed, 30 Nov 2022 16:10:36 +0000 (11:10 -0500)
committerJason Boyer <JBoyer@equinoxOLI.org>
Sun, 5 Mar 2023 17:54:53 +0000 (12:54 -0500)
This patch reduces the number of updates to search.sympell_dictionary
rows that would not change the contents of those rows, thereby
reducing the potential for certain record maintenance operations to
significantlly bloat that table.

In particular, it adjust the upsert to update the row for an existing
prefix only if there would be a net change in at least one of the *_count
columns or the list of suggestions. (Note that if a row is the target of
an UPDATE statement, PostgreSQL will _always_ create a row version, even
if there is no change to the contents of the row.)

It should be noted that while this patch is useful in and of itself, there
is a longer-term fix that would have additional benefits: adjust the
overall reingest logic so that it minimizes changes to all large tables
derived from the bib record when a bib gets reingested. A row that never
gets touched because it doesn't have to be can never become bloat.

To test
-------
[1] In a Concerto database, ensure that idempotent updates of the MARC
    in biblio.record_entry will nonetheless force a reingest by running:

    update config.internal_flag set enabled = true where name = 'ingest.reingest.force_on_same_marc';

[2] Note the size of search.symspell_dictionary by running:

    select pg_size_pretty(pg_total_relation_size('search.symspell_dictionary'));

[3] Run a few rounds of the following update that forces a reingest of the bibs:

    update biblio.record_entry set id = id;

[4] For the sake of fairness, run a vacuum on the table:

    VACUUM ANALYZE search.sympsell_dictionary

[5] Run the size measuremeant again and not that it's significantly larger.

[6] Run the following the reset the table size:

    VACUUM FULL search.symspell_dictionary;

[7] Note the size, apply the patch, and repeat step 3.

[8] This time, the table size should be the same (or close to the same) as it
    was at the beginning of step 7.

Signed-off-by: Galen Charlton <gmc@equinoxOLI.org>
Signed-off-by: Jason Boyer <JBoyer@equinoxOLI.org>

Open-ILS/src/sql/Pg/300.schema.staged_search.sql
Open-ILS/src/sql/Pg/upgrade/XXXX.schema.lp1998355-reduce-symspell-churn.sql [new file with mode: 0644]

index 5abbbef..2455aad 100644 (file)
@@ -1202,6 +1202,21 @@ CREATE OR REPLACE FUNCTION search.symspell_dictionary_reify () RETURNS SETOF sea
 
     identifier_count = GREATEST(0, d.identifier_count + EXCLUDED.identifier_count),
     identifier_suggestions = evergreen.text_array_merge_unique(EXCLUDED.identifier_suggestions,d.identifier_suggestions)
+
+    WHERE (
+        EXCLUDED.keyword_count <> 0 OR
+        EXCLUDED.title_count <> 0 OR
+        EXCLUDED.author_count <> 0 OR
+        EXCLUDED.subject_count <> 0 OR
+        EXCLUDED.series_count <> 0 OR
+        EXCLUDED.identifier_count <> 0 OR
+        NOT (EXCLUDED.keyword_suggestions <@ d.keyword_suggestions) OR
+        NOT (EXCLUDED.title_suggestions <@ d.title_suggestions) OR
+        NOT (EXCLUDED.author_suggestions <@ d.author_suggestions) OR
+        NOT (EXCLUDED.subject_suggestions <@ d.subject_suggestions) OR
+        NOT (EXCLUDED.series_suggestions <@ d.series_suggestions) OR
+        NOT (EXCLUDED.identifier_suggestions <@ d.identifier_suggestions)
+    )
  RETURNING *;
 $f$ LANGUAGE SQL;
 
diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.lp1998355-reduce-symspell-churn.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.lp1998355-reduce-symspell-churn.sql
new file mode 100644 (file)
index 0000000..996e967
--- /dev/null
@@ -0,0 +1,64 @@
+BEGIN;
+
+-- SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version);
+
+CREATE OR REPLACE FUNCTION search.symspell_dictionary_reify () RETURNS SETOF search.symspell_dictionary AS $f$
+ WITH new_rows AS (
+    DELETE FROM search.symspell_dictionary_updates WHERE transaction_id = txid_current() RETURNING *
+ ), computed_rows AS ( -- this collapses the rows deleted into the format we need for UPSERT
+    SELECT  SUM(keyword_count)    AS keyword_count,
+            SUM(title_count)      AS title_count,
+            SUM(author_count)     AS author_count,
+            SUM(subject_count)    AS subject_count,
+            SUM(series_count)     AS series_count,
+            SUM(identifier_count) AS identifier_count,
+
+            prefix_key,
+
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT keyword_suggestions[1]), NULL)    AS keyword_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT title_suggestions[1]), NULL)      AS title_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT author_suggestions[1]), NULL)     AS author_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT subject_suggestions[1]), NULL)    AS subject_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT series_suggestions[1]), NULL)     AS series_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT identifier_suggestions[1]), NULL) AS identifier_suggestions
+      FROM  new_rows
+      GROUP BY prefix_key
+ )
+ INSERT INTO search.symspell_dictionary AS d SELECT * FROM computed_rows
+ ON CONFLICT (prefix_key) DO UPDATE SET
+    keyword_count = GREATEST(0, d.keyword_count + EXCLUDED.keyword_count),
+    keyword_suggestions = evergreen.text_array_merge_unique(EXCLUDED.keyword_suggestions,d.keyword_suggestions),
+
+    title_count = GREATEST(0, d.title_count + EXCLUDED.title_count),
+    title_suggestions = evergreen.text_array_merge_unique(EXCLUDED.title_suggestions,d.title_suggestions),
+
+    author_count = GREATEST(0, d.author_count + EXCLUDED.author_count),
+    author_suggestions = evergreen.text_array_merge_unique(EXCLUDED.author_suggestions,d.author_suggestions),
+
+    subject_count = GREATEST(0, d.subject_count + EXCLUDED.subject_count),
+    subject_suggestions = evergreen.text_array_merge_unique(EXCLUDED.subject_suggestions,d.subject_suggestions),
+
+    series_count = GREATEST(0, d.series_count + EXCLUDED.series_count),
+    series_suggestions = evergreen.text_array_merge_unique(EXCLUDED.series_suggestions,d.series_suggestions),
+
+    identifier_count = GREATEST(0, d.identifier_count + EXCLUDED.identifier_count),
+    identifier_suggestions = evergreen.text_array_merge_unique(EXCLUDED.identifier_suggestions,d.identifier_suggestions)
+
+    WHERE (
+        EXCLUDED.keyword_count <> 0 OR
+        EXCLUDED.title_count <> 0 OR
+        EXCLUDED.author_count <> 0 OR
+        EXCLUDED.subject_count <> 0 OR
+        EXCLUDED.series_count <> 0 OR
+        EXCLUDED.identifier_count <> 0 OR
+        NOT (EXCLUDED.keyword_suggestions <@ d.keyword_suggestions) OR
+        NOT (EXCLUDED.title_suggestions <@ d.title_suggestions) OR
+        NOT (EXCLUDED.author_suggestions <@ d.author_suggestions) OR
+        NOT (EXCLUDED.subject_suggestions <@ d.subject_suggestions) OR
+        NOT (EXCLUDED.series_suggestions <@ d.series_suggestions) OR
+        NOT (EXCLUDED.identifier_suggestions <@ d.identifier_suggestions)
+    )
+ RETURNING *;
+$f$ LANGUAGE SQL;
+
+COMMIT;