Distribute the hosts to be crawled by MM2 between the existing crawlers
This commit adds two new scripts to the crawlers: * mm2_get-highest-active-host-id: queries the MM database and returns the ID of the active mirror with the highest ID * run_crawler.sh: calculates the right startid and stopid parameters depending on the number of available crawlers. These scripts are integrated into the start of the crawler so that each crawler only crawls a subset of the existing mirrors. This distribution is not perfect, as it assumes that the active mirrors are evenly distributed across the ID range of all existing mirrors.
This commit is contained in:
parent
b7c6fe0f48
commit
16e0082824
4 changed files with 69 additions and 2 deletions
|
@ -1,4 +1,4 @@
|
||||||
# run the crawler twice a day
|
# run the crawler twice a day
|
||||||
# logs sent to /var/log/mirrormanager/crawler.log and crawl/* by default
|
# logs sent to /var/log/mirrormanager/crawler.log and crawl/* by default
|
||||||
# 32GB of RAM is not enough for 75 threads, 40 seems to work so far
|
# 32GB of RAM is not enough for 75 threads, 35 seems to work so far
|
||||||
0 */12 * * * mirrormanager /usr/bin/mm2_crawler --threads 40 > /dev/null 2>&1
|
# the backquoted run_crawler.sh call is given the number of crawlers (2); what
# it prints (--startid=... --stopid=...) is passed on to mm2_crawler as options
0 */12 * * * mirrormanager /usr/bin/mm2_crawler --threads 35 `/usr/local/bin/run_crawler.sh 2` > /dev/null 2>&1
|
||||||
|
|
|
@ -0,0 +1,32 @@
|
||||||
|
#!/usr/bin/python
"""Print the highest host id among the active mirrors.

The MirrorManager configuration file (``-c/--config``) is read to obtain
``DB_URL``; the mirrors are then queried with the filters private=False,
admin_active=True, user_active=True, site_private=False,
site_user_active=True and site_admin_active=True, and the largest ``id`` of
the result is written to stdout.

Exit status is 0 on success and 1 (with a message on stderr and nothing on
stdout) when the query returns no mirror at all.
"""

import argparse
import sys

import mirrormanager2.lib

parser = argparse.ArgumentParser(usage=sys.argv[0] + " [options]")
parser.add_argument(
    "-c", "--config",
    dest="config", default='/etc/mirrormanager/mirrormanager2.cfg',
    help="Configuration file to use")

options = parser.parse_args()

# The configuration file is Python source and is executed as such; its
# top-level names end up as keys of ``config``.
# NOTE(review): this runs arbitrary code from the file given with --config,
# so that file must only be writable by trusted users.
config = dict()
with open(options.config) as config_file:
    exec(compile(config_file.read(), options.config, 'exec'), config)

session = mirrormanager2.lib.create_session(config['DB_URL'])

try:
    # Get all active mirrors
    hosts = mirrormanager2.lib.get_mirrors(
        session, private=False,
        admin_active=True, user_active=True, site_private=False,
        site_user_active=True, site_admin_active=True)

    # only the ids
    host_ids = [host.id for host in hosts]
finally:
    # Release the database session even if the query failed.
    session.close()

if not host_ids:
    # max() of an empty list would raise ValueError; report it cleanly instead.
    sys.stderr.write("%s: no active mirror found\n" % sys.argv[0])
    sys.exit(1)

# The parenthesised form prints the same text under Python 2 and Python 3.
print(max(host_ids))
|
29
roles/mirrormanager/crawler/files/run_crawler.sh
Normal file
29
roles/mirrormanager/crawler/files/run_crawler.sh
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
#!/bin/bash
#
# Print the "--startid=N --stopid=M" options that restrict the crawler running
# on this host to its share of the mirror host ids.
#
# Usage: run_crawler.sh NUMBER_OF_CRAWLERS
#
# The host id range is cut into NUMBER_OF_CRAWLERS equally sized slices and the
# host whose short name is mm-crawler0<i> is given the i-th slice.
#
# Output:  one line with the options, or nothing if this host is not one of
#          the mm-crawler0<i> hosts.
# Returns: 0 on success or when called with a wrong number of arguments
#          (nothing is printed in that case), 1 if an input is unusable.

# Helper that prints the highest active host id on stdout.
readonly GET_MAX_HOST=/usr/local/bin/mm2_get-highest-active-host-id

#######################################
# Succeed if the argument is a decimal integer greater than zero.
# Arguments: $1 - value to check
#######################################
is_positive_int() {
  # 10# forces base 10 so that a value such as "08" is not read as octal.
  [[ "$1" =~ ^[0-9]+$ ]] && (( 10#$1 > 0 ))
}

#######################################
# Print the id range options for one crawler host.
# Arguments: $1 - short host name
#            $2 - number of crawlers (positive integer)
#            $3 - highest active host id (non-negative integer)
# Outputs:   "--startid=N --stopid=M", or nothing for an unknown host
#######################################
crawler_options() {
  local host=$1
  local crawlers=$(( 10#$2 ))
  # Slice the range up to (highest id + 1) so that the mirror with the highest
  # id is covered even if the crawler excludes the id given as --stopid.
  # NOTE(review): --stopid is presumably exclusive in mm2_crawler; confirm.
  local upper=$(( 10#$3 + 1 ))
  local remainder part i

  # Round the upper bound up to a multiple of the number of crawlers.
  remainder=$(( upper % crawlers ))
  if (( remainder != 0 )); then
    upper=$(( upper + crawlers - remainder ))
  fi
  part=$(( upper / crawlers ))

  for (( i = 1; i <= crawlers; i++ )); do
    # NOTE(review): this naming scheme only matches crawler numbers 1-9.
    if [[ "${host}" == "mm-crawler0${i}" ]]; then
      echo "--startid=$(( (i - 1) * part )) --stopid=$(( i * part ))"
      return 0
    fi
  done
  return 0
}

#######################################
# Entry point: validate the inputs and print the options for this host.
# Arguments: $1 - number of crawlers
#######################################
main() {
  # A wrong argument count is silently ignored, as it always has been.
  if [[ $# -ne 1 ]]; then
    return 0
  fi

  local crawlers=$1
  local host max_host

  if ! is_positive_int "${crawlers}"; then
    echo "run_crawler.sh: number of crawlers must be a positive integer" >&2
    return 1
  fi

  host=$(hostname -s) || return 1

  # Print nothing rather than malformed options when the helper fails or
  # returns something that is not a number.
  if ! max_host=$("${GET_MAX_HOST}") || [[ ! "${max_host}" =~ ^[0-9]+$ ]]; then
    echo "run_crawler.sh: cannot determine the highest active host id" >&2
    return 1
  fi

  crawler_options "${host}" "${crawlers}" "${max_host}"
}

main "$@"
|
|
@ -51,3 +51,9 @@
|
||||||
setype=httpd_sys_content_t
|
setype=httpd_sys_content_t
|
||||||
state=directory
|
state=directory
|
||||||
recurse=yes
|
recurse=yes
|
||||||
|
|
||||||
|
# Helper used by run_crawler.sh to look up the highest active mirror id.
- name: install the script to get the highest mirror id
  copy:
    src: mm2_get-highest-active-host-id
    dest: /usr/local/bin/mm2_get-highest-active-host-id
    mode: "0755"

# Script called from the crawler cron job to pick this host's id range.
- name: install the crawler distribute script
  copy:
    src: run_crawler.sh
    dest: /usr/local/bin/run_crawler.sh
    mode: "0755"
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue