Distribute the hosts to be crawled by MM2 between the existing crawlers

This commit adds two new scripts to the crawlers:
 * mm2_get-highest-active-host-id: this queries the MM database and returns
   the id of the active mirror with the highest id
 * run_crawler.sh: this calculates the right startid and stopid parameters
   depending on the number of available crawlers

These scripts are integrated into the start of the crawler so that each
crawler only crawls a subset of the existing mirrors. This distribution
is not perfect, as it assumes that the active mirrors are evenly
distributed across the whole range of mirror ids.
This commit is contained in:
Adrian Reber 2015-05-06 09:02:31 +00:00
parent b7c6fe0f48
commit 16e0082824
4 changed files with 69 additions and 2 deletions

View file

@ -1,4 +1,4 @@
# run the crawler twice a day
# logs sent to /var/log/mirrormanager/crawler.log and crawl/* by default
# 32GB of RAM is not enough for 75 threads, 40 seems to work so far
0 */12 * * * mirrormanager /usr/bin/mm2_crawler --threads 40 > /dev/null 2>&1
# 32GB of RAM is not enough for 75 threads, 35 seems to work so far
# run_crawler.sh is given the number of crawlers (2); what it prints (the
# --startid/--stopid options for this host, or nothing if the host name
# does not match) is substituted into the mm2_crawler command line
0 */12 * * * mirrormanager /usr/bin/mm2_crawler --threads 35 `/usr/local/bin/run_crawler.sh 2` > /dev/null 2>&1

View file

@ -0,0 +1,32 @@
#!/usr/bin/python
"""Print the highest id among the active, public mirrors.

The MirrorManager2 configuration file (``-c/--config``) is read to find
``DB_URL``; the database is then queried for all mirrors that are public
and active on both the host and the site level, and the largest host id
is written to stdout.

Exit status is 0 on success and 1 if no active mirror was found (in that
case nothing is written to stdout).
"""
import argparse
import sys

import mirrormanager2.lib

parser = argparse.ArgumentParser(usage=sys.argv[0] + " [options]")
parser.add_argument(
    "-c", "--config",
    dest="config", default='/etc/mirrormanager/mirrormanager2.cfg',
    help="Configuration file to use")
options = parser.parse_args()

# The configuration file is Python source: it is executed and the names it
# defines end up as keys of `config`.
# NOTE(review): this runs arbitrary code from the file, so the file must be
# trusted.
config = dict()
with open(options.config) as config_file:
    exec(compile(config_file.read(), options.config, 'exec'), config)

session = mirrormanager2.lib.create_session(config['DB_URL'])
try:
    # Get all active mirrors
    hosts = mirrormanager2.lib.get_mirrors(
        session, private=False,
        admin_active=True, user_active=True, site_private=False,
        site_user_active=True, site_admin_active=True)
    # only the ids
    host_ids = [host.id for host in hosts]
finally:
    # Release the session even when the query fails.
    session.close()

if not host_ids:
    # max() of an empty list would raise ValueError; report it clearly.
    sys.stderr.write("%s: no active mirrors found\n" % sys.argv[0])
    sys.exit(1)

# Parenthesised single-argument form works as a Python 2 print statement
# and as a Python 3 print() call.
print(max(host_ids))

View file

@ -0,0 +1,29 @@
#!/bin/bash
#
# run_crawler.sh - print the --startid/--stopid options for this crawler.
#
# Usage: run_crawler.sh NUMBER_OF_CRAWLERS
#
# The highest active host id is rounded up to a multiple of
# NUMBER_OF_CRAWLERS and the resulting id range is cut into
# NUMBER_OF_CRAWLERS equally sized parts. The host named mm-crawler0<i>
# gets part number <i>. On any other host nothing is printed.
#
# If the script is not called with exactly one argument it prints nothing
# and exits 0.

# Helper that prints the highest id of all active hosts.
readonly MAX_HOST_CMD=/usr/local/bin/mm2_get-highest-active-host-id

#######################################
# Compute the id range assigned to one crawler host.
# Arguments:
#   $1 - number of crawlers (positive integer)
#   $2 - short host name of this machine
#   $3 - highest active host id (non-negative integer)
# Outputs:
#   "--startid=<n> --stopid=<m>" on stdout if $2 is mm-crawler0<i> with
#   1 <= i <= $1, otherwise nothing.
# Returns:
#   0 on success, 1 if $1 or $3 is not a usable number.
#######################################
crawler_range() {
  local crawlers=$1
  local host=$2
  local max_host=$3
  local part startid stopid i

  if ! [[ "${crawlers}" =~ ^[1-9][0-9]*$ ]]; then
    printf 'run_crawler.sh: invalid number of crawlers: %s\n' \
      "${crawlers}" >&2
    return 1
  fi
  if ! [[ "${max_host}" =~ ^[0-9]+$ ]]; then
    printf 'run_crawler.sh: invalid highest host id: %s\n' \
      "${max_host}" >&2
    return 1
  fi

  # Size of one part: max_host rounded up to a multiple of the number of
  # crawlers, divided by that number (i.e. a ceiling division). 10# keeps
  # a value with leading zeros from being read as octal.
  part=$(( (10#${max_host} + crawlers - 1) / crawlers ))

  startid=0
  stopid=${part}
  for (( i = 1; i <= crawlers; i++ )); do
    if [[ "${host}" == "mm-crawler0${i}" ]]; then
      printf '%s\n' "--startid=${startid} --stopid=${stopid}"
    fi
    startid=$(( startid + part ))
    stopid=$(( stopid + part ))
  done
}

#######################################
# Entry point: collect host name and highest host id, print the range.
# Arguments:
#   $1 - number of crawlers
# Returns:
#   0 on success (or when not called with exactly one argument),
#   1 if the host name or the highest host id cannot be determined.
#######################################
main() {
  local host max_host

  # Without exactly one argument: print nothing, succeed.
  if (( $# != 1 )); then
    return 0
  fi

  if ! host=$(hostname -s); then
    printf 'run_crawler.sh: cannot determine host name\n' >&2
    return 1
  fi
  if ! max_host=$("${MAX_HOST_CMD}"); then
    printf 'run_crawler.sh: %s failed\n' "${MAX_HOST_CMD}" >&2
    return 1
  fi

  crawler_range "$1" "${host}" "${max_host}"
}

main "$@"

View file

@ -51,3 +51,9 @@
setype=httpd_sys_content_t
state=directory
recurse=yes
# Helper called by run_crawler.sh to find the highest active host id.
- name: install the script to get the highest mirror id
  copy:
    src: mm2_get-highest-active-host-id
    dest: /usr/local/bin/mm2_get-highest-active-host-id
    mode: "0755"

# Script whose output supplies the --startid/--stopid crawler options.
- name: install the crawler distribute script
  copy:
    src: run_crawler.sh
    dest: /usr/local/bin/run_crawler.sh
    mode: "0755"