From 16e008282488b82b198eb26c0b5094569114cb9b Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 6 May 2015 09:02:31 +0000 Subject: [PATCH] Distribute the hosts to be crawled by MM2 between the existing crawlers This commit adds two new scripts to the crawlers: * mm2_get-highest-active-host-id: this queries the MM database and returns the id of the active mirror with the highest id * run_crawler.sh: this calculates the right startid and stopid parameters depending on the number of available crawlers These scripts are integrated into the start of the crawler so that each crawler only crawls a subset of the existing mirrors. This distribution is not perfect, as it assumes that the active mirrors are evenly spread over the id range of all existing mirrors. --- .../mirrormanager/crawler/files/crawler.cron | 4 +-- .../files/mm2_get-highest-active-host-id | 32 +++++++++++++++++++ .../crawler/files/run_crawler.sh | 29 +++++++++++++++++ roles/mirrormanager/crawler/tasks/main.yml | 6 ++++ 4 files changed, 69 insertions(+), 2 deletions(-) create mode 100644 roles/mirrormanager/crawler/files/mm2_get-highest-active-host-id create mode 100644 roles/mirrormanager/crawler/files/run_crawler.sh diff --git a/roles/mirrormanager/crawler/files/crawler.cron b/roles/mirrormanager/crawler/files/crawler.cron index e23f38c626..054cb84d9f 100644 --- a/roles/mirrormanager/crawler/files/crawler.cron +++ b/roles/mirrormanager/crawler/files/crawler.cron @@ -1,4 +1,4 @@ # run the crawler twice a day # logs sent to /var/log/mirrormanager/crawler.log and crawl/* by default -# 32GB of RAM is not enough for 75 threads, 40 seems to work so far -0 */12 * * * mirrormanager /usr/bin/mm2_crawler --threads 40 > /dev/null 2>&1 +# 32GB of RAM is not enough for 75 threads, 35 seems to work so far +0 */12 * * * mirrormanager /usr/bin/mm2_crawler --threads 35 `/usr/local/bin/run_crawler.sh 2` > /dev/null 2>&1 diff --git a/roles/mirrormanager/crawler/files/mm2_get-highest-active-host-id 
b/roles/mirrormanager/crawler/files/mm2_get-highest-active-host-id
new file mode 100644
index 0000000000..dd4299e85a
--- /dev/null
+++ b/roles/mirrormanager/crawler/files/mm2_get-highest-active-host-id
@@ -0,0 +1,32 @@
+#!/usr/bin/python
+"""Print the highest host id among all active, non-private mirrors."""
+
+import argparse
+import sys
+
+import mirrormanager2.lib
+
+parser = argparse.ArgumentParser(usage=sys.argv[0] + " [options]")
+parser.add_argument(
+    "-c", "--config",
+    dest="config", default='/etc/mirrormanager/mirrormanager2.cfg',
+    help="Configuration file to use")
+options = parser.parse_args()
+
+# NOTE: the configuration file is executed as Python code; it must be trusted.
+config = dict()
+with open(options.config) as config_file:
+    exec(compile(config_file.read(), options.config, 'exec'), config)
+
+session = mirrormanager2.lib.create_session(config['DB_URL'])
+try:
+    # Keep only the ids of the mirrors that are active and not private.
+    host_ids = [host.id for host in mirrormanager2.lib.get_mirrors(
+        session, private=False, admin_active=True, user_active=True,
+        site_private=False, site_user_active=True, site_admin_active=True)]
+finally:
+    session.close()  # release the session even if the query fails
+
+if not host_ids:
+    sys.exit("no active mirrors found")  # message on stderr, exit status 1
+print(max(host_ids))  # parenthesised form runs on Python 2 and 3
diff --git a/roles/mirrormanager/crawler/files/run_crawler.sh b/roles/mirrormanager/crawler/files/run_crawler.sh
new file mode 100644
index 0000000000..b9d642e549
--- /dev/null
+++ b/roles/mirrormanager/crawler/files/run_crawler.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Print the "--startid=N --stopid=M" options for this crawler's share of hosts.
+# Usage: run_crawler.sh NUMBER_OF_CRAWLERS
+
+if [ $# -ne 1 ]; then
+    exit 0
+fi
+
+NUMBER_OF_CRAWLERS=$1
+HOST=`hostname -s`
+MAX_HOST=`/usr/local/bin/mm2_get-highest-active-host-id`
+
+# Without two usable numbers print nothing, just as for a wrong argument count.
+if ! [[ "${NUMBER_OF_CRAWLERS}" =~ ^[1-9][0-9]*$ && "${MAX_HOST}" =~ ^[0-9]+$ ]]; then
+    exit 0
+fi
+
+# ids per crawler: MAX_HOST / NUMBER_OF_CRAWLERS, rounded up
+PART=$(( (MAX_HOST + NUMBER_OF_CRAWLERS - 1) / NUMBER_OF_CRAWLERS ))
+
+STARTID=0
+# crawler i runs on host mm-crawler0<i>; print only the range of this host
+for i in `seq 1 ${NUMBER_OF_CRAWLERS}`; do
+    STOPID=$(( STARTID + PART ))
+    if [ "${HOST}" == "mm-crawler0${i}" ]; then
+        echo "--startid=${STARTID} --stopid=${STOPID}"
+    fi
+    STARTID=${STOPID}
+done
diff --git a/roles/mirrormanager/crawler/tasks/main.yml
b/roles/mirrormanager/crawler/tasks/main.yml index 6ec1414cc4..539886c4d9 100644 --- a/roles/mirrormanager/crawler/tasks/main.yml +++ b/roles/mirrormanager/crawler/tasks/main.yml @@ -51,3 +51,9 @@ setype=httpd_sys_content_t state=directory recurse=yes + +- name: install the script to get the highest mirror id + copy: src=mm2_get-highest-active-host-id dest=/usr/local/bin/mm2_get-highest-active-host-id mode=0755 + +- name: install the crawler distribute script + copy: src=run_crawler.sh dest=/usr/local/bin/run_crawler.sh mode=0755