Better distribution of mirrors to crawl.

The script mm2_get-highest-active-host-id used to return the highest
ID of the active mirrors. This number was divided by the number of
active crawlers and then each crawler got its share of mirrors to crawl.
This did not take into account that more of the active mirrors have
higher IDs, because old mirror IDs are not re-used, and thus one crawler
was getting many more mirrors to crawl than another. The new script
(which will be renamed) now divides the list correctly by returning
exactly the fraction that each crawler should crawl.
This commit is contained in:
Adrian Reber 2015-06-28 10:11:05 +00:00
parent 0553afe274
commit c8420ba580

View file

@ -5,18 +5,32 @@ import sys
import mirrormanager2.lib
parser = argparse.ArgumentParser(usage=sys.argv[0] + " [options]")
# Command-line interface: RawTextHelpFormatter keeps the line breaks of the
# multi-line --fraction help text intact.
parser = argparse.ArgumentParser(
    usage=sys.argv[0] + " [options]",
    formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument(
    "-c", "--config",
    dest="config", default='/etc/mirrormanager/mirrormanager2.cfg',
    help="Configuration file to use")
parser.add_argument(
    "-f", "--fraction",
    dest="fraction", default="1:1",
    help='''Specify which part of the mirror range should be returned
1:1 - all mirrors
1:2 - the first half of the mirrors
2:3 - the middle third of the mirrors''')
options = parser.parse_args()

# NOTE: the configuration file is executed as Python code, so --config must
# only ever point at a trusted file.
config = dict()
with open(options.config) as config_file:
    exec(compile(config_file.read(), options.config, 'exec'), config)

# Validate --fraction before opening a database session. It must be
# PART:TOTAL with two integers and 1 <= PART <= TOTAL. parser.error() prints
# the usage plus the message to stderr and exits with status 2, so a calling
# script can tell a bad invocation from a successful run.
try:
    requested_part, requested_total = [
        int(field) for field in options.fraction.split(':')]
except ValueError:
    # Raised for non-numeric fields and for a wrong number of fields.
    parser.error(
        "invalid fraction %r: expected PART:TOTAL, e.g. 1:2"
        % options.fraction)
if not 1 <= requested_part <= requested_total:
    parser.error(
        "invalid fraction %r: need 1 <= PART <= TOTAL" % options.fraction)

session = mirrormanager2.lib.create_session(config['DB_URL'])
# Get all active mirrors
@ -29,4 +43,15 @@ hosts = [ host.id for host in hosts ]
session.close()
print max(hosts)
# Sort the IDs so that positions in the list correspond to ascending host IDs;
# the shares below are cut by position, not by ID value.
hosts.sort()

# --fraction has the form PART:TOTAL, e.g. 2:3 for the second of three shares.
fraction = options.fraction.split(':')
part = int(fraction[0])
total = int(fraction[1])

# Fail with a readable message instead of an IndexError/ZeroDivisionError.
if not hosts:
    sys.exit("No hosts found, nothing to distribute")
if not 1 <= part <= total:
    sys.exit("Invalid fraction %r: need 1 <= PART <= TOTAL" % options.fraction)

# Scale first, then floor-divide: this spreads the remainder over all shares
# (10 hosts over 4 crawlers -> 2/3/2/3) instead of handing it all to the last
# one, and '//' keeps the result an integer index on Python 2 and Python 3.
start = (part - 1) * len(hosts) // total
stop = part * len(hosts) // total

if part == total:
    # The last share is open-ended: it runs to the highest ID.
    print("--startid=%d" % hosts[start])
else:
    # NOTE(review): hosts[stop] is the first ID of the *next* share, so this
    # presumably relies on the consumer treating --stopid as exclusive;
    # confirm against the crawler.
    print("--startid=%d --stopid=%d" % (hosts[start], hosts[stop]))