Merge branch 'main' of ssh://pagure.io/fedora-infra/arc into main

Mark O'Brien 2021-02-08 11:24:20 +00:00
commit dff16dd1a9
6 changed files with 196 additions and 5 deletions


@ -25,3 +25,5 @@ Here is the list of ideas/things we looked at:
:maxdepth: 1
pg_stat_statements
pg_partitioning
pg_timescaledb


@ -0,0 +1,60 @@
Partitioning the database
=========================
In the database used by datanommer and datagrepper, one table stands out from the
others by its size: the ``messages`` table. This can be observed in
:ref:`datanommer`.
One possibility to speed things up in datagrepper is to partition that table
into a set of smaller partitions.
Here are some resources regarding partitioning PostgreSQL tables:
* Table partitioning at postgresql's documentation: https://www.postgresql.org/docs/13/ddl-partitioning.html
* How to use table partitioning to scale PostgreSQL: https://www.enterprisedb.com/postgres-tutorials/how-use-table-partitioning-scale-postgresql
* Definition of PostgreSQL Partition: https://www.educba.com/postgresql-partition/
Attempt #1
----------
For our first attempt at partitioning the ``messages`` table, we thought we would
partition it by year, with a different partition for each year.
We thus started by adding a ``year`` field to the table and filling it by extracting
the year from the ``timestamp`` field of the table.
However, one thing to realize when using a partitioned table is that each partition
needs to be considered as an independent table, meaning a unique constraint has
to involve the field on which the table is partitioned.
In other words, if you partition the table by a ``year`` field, that field will
need to be part of the primary key as well as of any ``UNIQUE`` constraint on the
table.
So to partition the ``messages`` table on ``year``, we had to add the ``year``
field to the primary key. However, that broke the foreign key constraints on
the ``user_messages`` and ``package_messages`` tables, which rely on the ``id``
field to link the tables.
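
What the above implies can be sketched as follows, with a hypothetical,
trimmed-down table (the real ``messages`` table has more columns):

::

    CREATE TABLE messages_by_year (
        id integer NOT NULL,
        year integer NOT NULL,
        "timestamp" timestamp without time zone,
        topic text,
        -- the partition key must be part of the primary key
        PRIMARY KEY (id, year)
    ) PARTITION BY RANGE (year);

    -- one partition per year
    CREATE TABLE messages_2020 PARTITION OF messages_by_year
        FOR VALUES FROM (2020) TO (2021);

Since ``id`` alone is then no longer a unique key of the partitioned table,
foreign keys pointing at it stop working.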
Attempt #2
----------
Since partitioning on ``year`` did not work, we reconsidered and decided to
partition on the ``id`` field instead, using ``RANGE`` partitioning.
We partitioned the ``messages`` table on the ``id`` field with partitions of 10
million records each. This has the advantage of making each partition a similar
size.
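
The second attempt can be sketched similarly, again with a hypothetical,
trimmed-down table:

::

    CREATE TABLE messages_by_id (
        id integer NOT NULL,
        "timestamp" timestamp without time zone,
        topic text,
        PRIMARY KEY (id)
    ) PARTITION BY RANGE (id);

    -- one partition per 10 million records
    CREATE TABLE messages_p1 PARTITION OF messages_by_id
        FOR VALUES FROM (1) TO (10000001);
    CREATE TABLE messages_p2 PARTITION OF messages_by_id
        FOR VALUES FROM (10000001) TO (20000001);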
More resources
--------------
These are a few more resources we looked at and thought were worth bookmarking:
* Automatic partitioning by day - PostgreSQL: https://stackoverflow.com/questions/55642326/
* pg_partman, partition manager: https://github.com/pgpartman/pg_partman
* How to scale PostgreSQL 10 using table inheritance and declarative partitioning: https://blog.timescale.com/blog/scaling-partitioning-data-postgresql-10-explained-cd48a712a9a1/


@ -0,0 +1,55 @@
Using the timescaledb extension
===============================
timescaledb (https://docs.timescale.com/latest/) is a PostgreSQL extension for
time-series databases.
Considering that a lot of the actions done on datagrepper involve the timestamp field
(for example: all the messages with that topic in this time range), we figured
this extension was worth investigating.
A bonus point is that this extension is already packaged and available in
Fedora and EPEL.
Resources
---------
* Setting up/enabling timescaledb: https://severalnines.com/database-blog/how-enable-timescaledb-existing-postgresql-database
* Migrating an existing database to timescaledb: https://docs.timescale.com/latest/getting-started/migrating-data#same-db
Installing/enabling/activating
------------------------------
To install the extension, simply run:
::
dnf install timescaledb
Then edit ``/var/lib/pgsql/data/postgresql.conf`` to tell PostgreSQL to load it:
::
shared_preload_libraries = 'pg_stat_statements,timescaledb'
timescaledb.max_background_workers=4
This change requires a restart of the entire database server:
::
systemctl restart postgresql
You can then check if the extension loaded properly:
::
$ sudo -u postgres psql
SELECT * FROM pg_available_extensions ORDER BY name;
Then, you will need to activate it for your database:
::
$ sudo -u postgres psql <database_name>
CREATE EXTENSION IF NOT EXISTS timescaledb CASCADE;
Finally, you can check that the extension was activated for your database:
::
$ sudo -u postgres psql <database_name>
\dx
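
With the extension activated, a table can then be converted to a hypertable
partitioned on its time column. As an example (using the ``messages2`` table
set up by the migration script in this repository), followed by a typical
time-range query:

::

    $ sudo -u postgres psql <database_name>
    SELECT create_hypertable('messages2', 'timestamp');
    SELECT count(*) FROM messages2
        WHERE "timestamp" >= now() - interval '7 days';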


@ -107,3 +107,6 @@ CREATE INDEX messages2_datanommer_timestamp_topic_idx ON public.messages2 USING
ALTER TABLE user_messages DROP CONSTRAINT user_messages_msg_fkey;
ALTER TABLE user_messages ADD CONSTRAINT user_messages_msg_fkey FOREIGN KEY (msg) REFERENCES messages2(id);
ALTER TABLE package_messages DROP CONSTRAINT package_messages_msg_fkey;
ALTER TABLE package_messages ADD CONSTRAINT package_messages_msg_fkey FOREIGN KEY (msg) REFERENCES messages2(id);


@ -0,0 +1,69 @@
-- Sources
-- setting up/enabling: https://severalnines.com/database-blog/how-enable-timescaledb-existing-postgresql-database
-- migration: https://docs.timescale.com/latest/getting-started/migrating-data#same-db
-- dnf install timescaledb
-- vim /var/lib/pgsql/data/postgresql.conf
-- """
-- shared_preload_libraries = 'pg_stat_statements,timescaledb'
-- timescaledb.max_background_workers=4
-- """
-- systemctl restart postgresql
-- Check the extension is available
-- sudo -u postgres psql
\timing on
-- Check the extension is available
SELECT * FROM pg_available_extensions ORDER BY name;
-- Enable the extension on the datanommer database (\c datanommer)
CREATE EXTENSION IF NOT EXISTS timescaledb CASCADE;
-- Check that the extension is enabled
\dx
-- Create the messages2 table that will have the timescaledb optimization
CREATE TABLE messages2 (LIKE messages INCLUDING DEFAULTS INCLUDING CONSTRAINTS EXCLUDING INDEXES);
CREATE SEQUENCE public.messages2_id_seq
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;
ALTER TABLE public.messages2 OWNER TO datanommer;
ALTER TABLE public.messages2_id_seq OWNER TO datanommer;
-- ALTER SEQUENCE public.messages2_id_seq OWNED BY public.messages2.id;
ALTER TABLE ONLY public.messages2 ALTER COLUMN id SET DEFAULT nextval('public.messages2_id_seq'::regclass);
GRANT SELECT ON TABLE public.messages2 TO datagrepper;
GRANT SELECT ON SEQUENCE public.messages2_id_seq TO datagrepper;
-- Convert messages2 to a hypertable partitioned on the timestamp column
SELECT create_hypertable('messages2', 'timestamp');
-- Insert the data
INSERT INTO messages2 (i,"timestamp",certificate,signature,topic,_msg,category,source_name,source_version,msg_id,_headers,username,crypto)
SELECT i,"timestamp",certificate,signature,topic,_msg,category,source_name,source_version,msg_id,_headers,username,crypto FROM messages ORDER BY messages.id;
-- Create the indexes once the data is in
CREATE INDEX index_msg2_category ON public.messages2 USING btree (category);
CREATE INDEX index_msg2_timestamp ON public.messages2 USING btree ("timestamp");
CREATE INDEX index_msg2_topic ON public.messages2 USING btree (topic);
CREATE INDEX messages2_datanommer_timestamp_category_idx ON public.messages2 USING btree ("timestamp" DESC, category);
CREATE INDEX messages2_datanommer_timestamp_topic_idx ON public.messages2 USING btree ("timestamp" DESC, topic);
ALTER TABLE user_messages DROP CONSTRAINT user_messages_msg_fkey;
ALTER TABLE user_messages ADD CONSTRAINT user_messages_msg_fkey FOREIGN KEY (msg) REFERENCES messages2(id);
ALTER TABLE package_messages DROP CONSTRAINT package_messages_msg_fkey;
ALTER TABLE package_messages ADD CONSTRAINT package_messages_msg_fkey FOREIGN KEY (msg) REFERENCES messages2(id);


@ -313,15 +313,16 @@ class TestAPI:
Return: None for exception
"""
try:
            if auth is None:
resp = requests.get(url, verify=verify)
else:
resp = requests.get(url, auth=auth, verify=verify)
resp = requests.get(url, auth=auth, verify=verify, timeout=120)
except Exception as ex:
_log.error("requests.get() failed with exception:", str(ex))
return None
_log.debug("response time in seconds: %s", resp.elapsed.total_seconds())
_log.debug(
"response time in seconds: %s -- %s",
resp.elapsed.total_seconds(),
url
)
return resp
@ -341,6 +342,7 @@ def main():
print("Tests started at %s." % time.asctime())
for env_name, base_url in [
("datagrepper-timescalebd/aws", "http://datagrepper-timescale.arc.fedorainfracloud.org/datagrepper"),
("datagrepper-test/aws", "http://datagrepper-test.arc.fedorainfracloud.org/datagrepper"),
("datagrepper-prod/aws", "http://datagrepper.arc.fedorainfracloud.org/datagrepper"),
("datagrepper-prod/openshift", "https://datagrepper-monitor-dashboard.app.os.fedoraproject.org"),