From 6b128c08b67f1357f6aa3130526079c3ffa6115d Mon Sep 17 00:00:00 2001 From: Pierre-Yves Chibon Date: Thu, 4 Feb 2021 09:11:58 +0100 Subject: [PATCH 1/6] Add constraing on package_messages Signed-off-by: Pierre-Yves Chibon --- scripts/migration.sql | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/migration.sql b/scripts/migration.sql index 8a507fe..ff07211 100644 --- a/scripts/migration.sql +++ b/scripts/migration.sql @@ -107,3 +107,6 @@ CREATE INDEX messages2_datanommer_timestamp_topic_idx ON public.messages2 USING ALTER TABLE user_messages DROP CONSTRAINT user_messages_msg_fkey; ALTER TABLE user_messages ADD CO + +ALTER TABLE package_messages DROP CONSTRAINT package_messages_msg_fkey; +ALTER TABLE package_messages ADD CONSTRAINT package_messages_msg_fkey FOREIGN KEY (msg) REFERENCES messages2(id); From 72b02b0ad8b4cb14afdec4d865fbe12d6a44faa9 Mon Sep 17 00:00:00 2001 From: Pierre-Yves Chibon Date: Thu, 4 Feb 2021 09:32:08 +0100 Subject: [PATCH 2/6] Add a migration_timescaledb script This script contains the steps followed to install, configure, enable, activate and convert the datanommer database to timescaledb Signed-off-by: Pierre-Yves Chibon --- scripts/migration_timescaledb.sql | 77 +++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 scripts/migration_timescaledb.sql diff --git a/scripts/migration_timescaledb.sql b/scripts/migration_timescaledb.sql new file mode 100644 index 0000000..0b9521d --- /dev/null +++ b/scripts/migration_timescaledb.sql @@ -0,0 +1,77 @@ +-- Sources +-- setting up/enabling: https://severalnines.com/database-blog/how-enable-timescaledb-existing-postgresql-database +-- migration: https://docs.timescale.com/latest/getting-started/migrating-data#same-db + +-- dnf install timescaledb +-- vim /var/lib/pgsql/data/postgresql.conf +-- """ +-- shared_preload_libraries = 'pg_stat_statements,timescaledb' +-- timescaledb.max_background_workers=4 +-- """ +-- systemctl restart postgresql +-- Check the extension is available +-- sudo -u postgres psql + + +\timing on + +-- Check the extension is available +SELECT * FROM pg_available_extensions ORDER BY name; + +-- Enable the extension on the datanommer database (\c datanommer) +CREATE EXTENSION IF NOT EXISTS timescaledb CASCADE; + +-- Check that the extension is enabled + +\dx + + +-- Create the messages2 table that will be have the timescaledb optimization + +CREATE TABLE messages2 (LIKE messages INCLUDING DEFAULTS INCLUDING CONSTRAINTS EXCLUDING INDEXES); + +CREATE SEQUENCE public.messages2_id_seq + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1; + +ALTER TABLE public.messages2 OWNER TO datanommer; +ALTER TABLE public.messages2_id_seq OWNER TO datanommer; +-- ALTER SEQUENCE public.messages2_id_seq OWNED BY public.messages2.id; +ALTER TABLE ONLY public.messages2 ALTER COLUMN id SET DEFAULT nextval('public.messages2_id_seq'::regclass); +GRANT SELECT ON TABLE public.messages2 TO datagrepper; +GRANT SELECT ON SEQUENCE public.messages2_id_seq TO datagrepper; + +-- Convert the timestamp to the hypertable + +SELECT create_hypertable('messages2', 'timestamp'); + +-- Insert the data + +INSERT INTO messages2 (i,"timestamp",certificate,signature,topic,_msg,category,source_name,source_version,msg_id,_headers,username,crypto) +SELECT i,"timestamp",certificate,signature,topic,_msg,category,source_name,source_version,msg_id,_headers,username,crypto FROM messages ORDER BY messages.id; + +-- Create the indexes ones the data is in + +CREATE INDEX index_msg2_category ON public.messages2 USING btree (category); +CREATE INDEX index_msg2_timestamp ON public.messages2 USING btree ("timestamp"); +CREATE INDEX index_msg2_topic ON public.messages2 USING btree (topic); +CREATE INDEX messages2_datanommer_timestamp_category_idx ON public.messages2 USING btree ("timestamp" DESC, category); +CREATE INDEX messages2_datanommer_timestamp_topic_idx ON public.messages2 USING btree ("timestamp" DESC, topic); + + +ALTER TABLE user_messages DROP CONSTRAINT user_messages_msg_fkey; +ALTER TABLE user_messages ADD CO + +ALTER TABLE package_messages DROP CONSTRAINT package_messages_msg_fkey; +ALTER TABLE package_messages ADD CONSTRAINT package_messages_msg_fkey FOREIGN KEY (msg) REFERENCES messages2(id); + +-------------------------------------------------------------------------------- + +DROP TABLE messages_010g , messages_020g, messages_030g, messages_040g, messages_050g, + messages_060g, messages_070g, messages_080g, messages_090g, messages_100g, + messages_110g, messages_120g, messages_130g, messages_140g, messages_150g, + messages_160g, messages_170g, messages_180g, messages_190g, messages_200g, + messages_210g, messages_220g From 807b58df14d0a09acee16036fd2cb4f428c1ef56 Mon Sep 17 00:00:00 2001 From: Pierre-Yves Chibon Date: Thu, 4 Feb 2021 09:33:27 +0100 Subject: [PATCH 3/6] Drop statements that don't apply to timescaledb Signed-off-by: Pierre-Yves Chibon --- scripts/migration_timescaledb.sql | 8 -------- 1 file changed, 8 deletions(-) diff --git a/scripts/migration_timescaledb.sql b/scripts/migration_timescaledb.sql index 0b9521d..3845076 100644 --- a/scripts/migration_timescaledb.sql +++ b/scripts/migration_timescaledb.sql @@ -67,11 +67,3 @@ ALTER TABLE user_messages ADD CO ALTER TABLE package_messages DROP CONSTRAINT package_messages_msg_fkey; ALTER TABLE package_messages ADD CONSTRAINT package_messages_msg_fkey FOREIGN KEY (msg) REFERENCES messages2(id); - --------------------------------------------------------------------------------- - -DROP TABLE messages_010g , messages_020g, messages_030g, messages_040g, messages_050g, - messages_060g, messages_070g, messages_080g, messages_090g, messages_100g, - messages_110g, messages_120g, messages_130g, messages_140g, messages_150g, - messages_160g, messages_170g, messages_180g, messages_190g, messages_200g, - messages_210g, messages_220g From 346ff989e2060adf9c3a1db6da3bfcb5ba5520f8 Mon Sep 17 00:00:00 2001 From: Pierre-Yves Chibon Date: Fri, 5 Feb 2021 16:53:10 +0100 Subject: [PATCH 4/6] Fix foreign key constraint Signed-off-by: Pierre-Yves Chibon --- scripts/migration_timescaledb.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/migration_timescaledb.sql b/scripts/migration_timescaledb.sql index 3845076..7aec1ef 100644 --- a/scripts/migration_timescaledb.sql +++ b/scripts/migration_timescaledb.sql @@ -63,7 +63,7 @@ CREATE INDEX messages2_datanommer_timestamp_topic_idx ON public.messages2 USING ALTER TABLE user_messages DROP CONSTRAINT user_messages_msg_fkey; -ALTER TABLE user_messages ADD CO +ALTER TABLE user_messages ADD CONSTRAINT user_messages_msg_fkey FOREIGN KEY (msg) REFERENCES messages2(id); ALTER TABLE package_messages DROP CONSTRAINT package_messages_msg_fkey; ALTER TABLE package_messages ADD CONSTRAINT package_messages_msg_fkey FOREIGN KEY (msg) REFERENCES messages2(id); From 1ff0d8fbd2efbd4be91cfa969a51916c93eaccc6 Mon Sep 17 00:00:00 2001 From: Pierre-Yves Chibon Date: Fri, 5 Feb 2021 16:54:54 +0100 Subject: [PATCH 5/6] Improve logging and the get function and test our third instance Signed-off-by: Pierre-Yves Chibon --- scripts/test_datagrepper_perfs.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/scripts/test_datagrepper_perfs.py b/scripts/test_datagrepper_perfs.py index 16f625f..742fb83 100644 --- a/scripts/test_datagrepper_perfs.py +++ b/scripts/test_datagrepper_perfs.py @@ -313,15 +313,16 @@ class TestAPI: Return: None for exception """ try: - if auth == None: - resp = requests.get(url, verify=verify) - else: - resp = requests.get(url, auth=auth, verify=verify) + resp = requests.get(url, auth=auth, verify=verify, timeout=120) except Exception as ex: _log.error("requests.get() failed with exception:", str(ex)) return None - _log.debug("response time in seconds: %s", resp.elapsed.total_seconds()) + _log.debug( + "response time in seconds: %s -- %s", + resp.elapsed.total_seconds(), + url + ) return resp @@ -341,6 +342,7 @@ def main(): print("Tests started at %s." % time.asctime()) for env_name, base_url in [ + ("datagrepper-timescalebd/aws", "http://datagrepper-timescale.arc.fedorainfracloud.org/datagrepper"), ("datagrepper-test/aws", "http://datagrepper-test.arc.fedorainfracloud.org/datagrepper"), ("datagrepper-prod/aws", "http://datagrepper.arc.fedorainfracloud.org/datagrepper"), ("datagrepper-prod/openshift", "https://datagrepper-monitor-dashboard.app.os.fedoraproject.org"), From f61f8c482a55e5a0858b329ebd3df9f945fc6c5f Mon Sep 17 00:00:00 2001 From: Pierre-Yves Chibon Date: Fri, 5 Feb 2021 17:37:11 +0100 Subject: [PATCH 6/6] Update the documentation about the datanommer/datagrepper work Signed-off-by: Pierre-Yves Chibon --- docs/datanommer_datagrepper/index.rst | 2 + .../pg_partitioning.rst | 60 +++++++++++++++++++ .../datanommer_datagrepper/pg_timescaledb.rst | 55 +++++++++++++++++ 3 files changed, 117 insertions(+) create mode 100644 docs/datanommer_datagrepper/pg_partitioning.rst create mode 100644 docs/datanommer_datagrepper/pg_timescaledb.rst diff --git a/docs/datanommer_datagrepper/index.rst b/docs/datanommer_datagrepper/index.rst index 28cae4b..4bd6e09 100644 --- a/docs/datanommer_datagrepper/index.rst +++ b/docs/datanommer_datagrepper/index.rst @@ -25,3 +25,5 @@ Here is the list of ideas/things we looked at: :maxdepth: 1 pg_stat_statements + pg_partitioning + pg_timescaledb diff --git a/docs/datanommer_datagrepper/pg_partitioning.rst b/docs/datanommer_datagrepper/pg_partitioning.rst new file mode 100644 index 0000000..67add82 --- /dev/null +++ b/docs/datanommer_datagrepper/pg_partitioning.rst @@ -0,0 +1,60 @@ +Partitioning the database +========================= + +In the database used by datanommer and datagrepper one table stands out from the +other ones by its size, the ``messages`` table. This can be observed in +:ref:`datanommer`. + +One possibility to speed things up in datagrepper is to partition that table +into a set of smaller sized partitions. + +Here are some resources regarding partitioning postgresql tables: + +* Table partitioning at postgresql's documentation: https://www.postgresql.org/docs/13/ddl-partitioning.html +* How to use table partitioning to scale PostgreSQL: https://www.enterprisedb.com/postgres-tutorials/how-use-table-partitioning-scale-postgresql +* Definition of PostgreSQL Partition: https://www.educba.com/postgresql-partition/ + + +Attempt #1 +---------- + +For our first attempt at partitioning the `messages` table, we thought we would +partition it by year. Having a different partition for each year. +We thus started by adding a ``year`` field to the table and fill it by extracting +the year from the ``timestamp`` field of the table. + +However, one thing to realize when using partitioned table is that each partition +needs to be considered as an independant table. Meaning an unique constraint has +to involve the field on which the table is partitioned. +In other words, if you partition the table by a year field, that year field will +need to be part of the primary key as well as any ``UNIQUE`` constraint on the +table. + +So to partition the `messages` table on ``year``, we had to add the ``year`` +field to the primary key. However, that broke the foreign key constraints on +the ``user_messages`` and ``package_messages`` tables which rely on the ``id`` +field to link the tables. + + +Attempt #2 +---------- + +Since partitioning on ``year`` did not work, we reconsidered and decided to +partition on the ``id`` field instead using `RANGE PARTITION`. + +We partitioned the ``messages`` table on the ``id`` field with partition of 10 +million records each. This has the advantage of making each partition of similar +sizes. + + + + +More resources +-------------- + +These are a few more resources we looked at and thought were worth bookmarking: + +* Automatic partitioning by day - PostgreSQL: https://stackoverflow.com/questions/55642326/ +* pg_partman, partition manager: https://github.com/pgpartman/pg_partman +* How to scale PostgreSQL 10 using table inheritance and declarative partitioning: https://blog.timescale.com/blog/scaling-partitioning-data-postgresql-10-explained-cd48a712a9a1/ + diff --git a/docs/datanommer_datagrepper/pg_timescaledb.rst b/docs/datanommer_datagrepper/pg_timescaledb.rst new file mode 100644 index 0000000..b13d670 --- /dev/null +++ b/docs/datanommer_datagrepper/pg_timescaledb.rst @@ -0,0 +1,55 @@ +Using the timescaledb extension +=============================== + +timescaledb (https://docs.timescale.com/latest/) is a postgresql extension for +time-series database. +Considering a lot of the actions done on datagrepper involve the timestamp field +(for example: all the messages with that topic in this time range), we figured +this extension is worth investigating. + +A bonus point being for this extension to already packaged and available in +Fedora and EPEL. + + +Resources +--------- + +* Setting up/enabling timescaledb: https://severalnines.com/database-blog/how-enable-timescaledb-existing-postgresql-database +* Migrating an existing database to timescaledb: https://docs.timescale.com/latest/getting-started/migrating-data#same-db + + +Installing/enabling/activating +------------------------------ + +To install the plugin, simply run: +:: + + dnf install timescaledb + +The edit ``/var/lib/pgsql/data/postgresql.conf`` to tell postgresql to load it: +:: + + shared_preload_libraries = 'pg_stat_statements,timescaledb' + timescaledb.max_background_workers=4 + + +It will then need a restart of the entire database server: +:: + + systemctl restart postgresql + +You can then check if the extension loaded properly: +:: + + $ sudo -u postgres psql + SELECT * FROM pg_available_extensions ORDER BY name; + +Then, you will need to activate it for your database: +:: + $ sudo -u postgres psql + CREATE EXTENSION IF NOT EXISTS timescaledb CASCADE; + +Finally, you can check that the extension was activated for your database: +:: + $ sudo -u postgres psql + \dx