#!/usr/bin/python -tt
'''
Licensed under the GNU GPL v2.
Some pretty awful python code to give details about our postgres databases
I would have written this in bad perl but my perl was too rusty.
This script detects when a new database is added to the system.
Note:
If this is run on a database that hasn't had the pgstattuple function added
into template1, pgstattuple will have to be added to template1 and each
existing database. Future databases will inherit pgstattuple from template1.
yum install -y postgresql-contrib
sudo -u postgres psql </usr/share/pgsql/contrib/pgstattuple.sql DBNAME
'''
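# A stattuple run chains four passes via at(1): stattuple-start vacuums and
# samples every table, then schedules stattuple-hour (T+1h), which schedules
# stattuple-quarter (T+6h), which schedules stattuple-day (T+24h).  Once a
# session's DONE file lists every table, "vacstat.py analyze" folds the
# session into history.pkl and prints suggested hourly/daily vacuum crons.
# Typical kickoff (the database name here is only an example):
#   sudo -u postgres ./vacstat.py stattuple-start --database mydb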
__version__ = '0.5'
import sys
import os
import re
import cPickle
import glob
import subprocess
import tempfile
import datetime
from stat import ST_MTIME
from subprocess import Popen, PIPE
import optparse
import fcntl
IGNOREDDBS = ('postgres', 'template1', 'template0')
STATEDIR = '/var/lib/vacstat'
class DBError(Exception):
pass
class SchemaChangeError(DBError):
pass
class InitialRunWarning(SchemaChangeError):
pass
class ArgumentError(Exception):
pass
class XIDOverflowWarning(DBError):
pass
def _compare_db(dbTables, opts):
'''Verify the db tables are the same as the last run.
'''
# Load the currently known tables
try:
knownFile = file(os.path.join(opts.statedir, 'knowndbs.pkl'), 'r')
except IOError:
knownFile = file(os.path.join(opts.statedir, 'knowndbs.pkl'), 'w')
cPickle.dump(dbTables, knownFile)
knownFile.close()
        raise InitialRunWarning, 'No databases are currently set up at this location. Be sure to set up an initial vacuum policy for all databases from this script'
knownTables = cPickle.load(knownFile)
knownFile.close()
results = []
# First check that we haven't dropped any dbs
    for db in knownTables:
        if db not in dbTables:
            results.append('%s has been removed' % db)
            continue
        for table in knownTables[db]:
            if table not in dbTables[db]:
                results.append('table %s has been removed from %s' % (table, db))
# Then check that we haven't added any
for db in dbTables:
if db not in knownTables:
results.append('db %s has been added' % db)
continue
for table in dbTables[db]:
if table not in knownTables[db]:
results.append('db %s has a new table %s' % (db, table))
if results:
unknowns = file(os.path.join(opts.statedir, 'unknowndbs.pkl'), 'w')
cPickle.dump(dbTables, unknowns)
unknowns.close()
msg = '''The database schema has changed since the last run.
Please make sure that the following databases and tables are set up to be
vacuumed. Then move the file %s/unknowndbs.pkl to %s/knowndbs.pkl
%s
''' % (opts.statedir, opts.statedir, '\n'.join(results))
raise SchemaChangeError, msg
def test_schema(opts):
'''Test that the database schema is known. This helps us keep the
vacuum policy up-to-date by forcing us to acknowledge all changes before
they can be used.
'''
    dbnameRE = re.compile(r'^[ \t]+([^ \t]+)[ \t]+\|')
    tablenameRE = re.compile(r'^[^|]+\|[ \t]+([^ \t]+)[ \t]+\|')
psqlCmd = subprocess.Popen(('/usr/bin/psql', 'postgres'),
stdout=PIPE, stdin=PIPE)
output = psqlCmd.communicate('\\l\n')[0].split('\n')
dbTables = {}
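    # The slice below assumes psql's \l listing has three header lines and
    # three footer lines (row count and blanks), keeping only the data rows.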
    for db in output[3:-3]:
        match = dbnameRE.match(db)
        if match and match.group(1):
            if match.group(1) in IGNOREDDBS:
                continue
            dbTables[match.group(1)] = []
        else:
            raise SchemaChangeError, 'Regular Expression did not detect db'
for db in dbTables:
psqlCmd = subprocess.Popen(('/usr/bin/psql', db),
stdout=PIPE, stdin=PIPE)
output = psqlCmd.communicate('\\dt\n')[0].split('\n')
        for table in output[3:-3]:
            match = tablenameRE.match(table)
            if match and match.group(1):
                dbTables[db].append(match.group(1))
            else:
                raise SchemaChangeError, 'Regular Expression did not detect table for %s' % db
# Make sure we're only dealing with known databases
_compare_db(dbTables, opts)
def test_transactions(opts):
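    '''Warn when any database has consumed most of its transaction ids and
    needs a database-wide vacuum to avoid XID wraparound.
    '''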
dbXIdRE = re.compile('^[ \t]+([^ \t]+).*[ \t]+([0-9]+)$')
    psqlCmd = subprocess.Popen(('/usr/bin/psql',), stdout=PIPE, stdin=PIPE)
output = psqlCmd.communicate('select datname, age(datfrozenxid), pow(2, 31) - age(datfrozenxid) as xids_remaining from pg_database;\n')[0].split('\n')
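    # Transaction ids are 32 bits wide, so a database may age at most
    # 2**31 transactions before it must be vacuumed to prevent wraparound.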
overflows = []
for dbLine in output[2:-3]:
        match = dbXIdRE.match(dbLine)
        if match and match.group(1) and match.group(2):
            if int(match.group(2)) <= 500000000:
                overflows.append('Fewer than 500 million transaction ids remain for %(db)s. Please schedule a vacuum of that entire database soon:\n sudo -u postgres vacuumdb -zvd %(db)s' % {'db': match.group(1)})
else:
raise DBError, 'Unexpected string received when testing for transaction overflow:\n %s' % dbLine
if overflows:
raise XIDOverflowWarning, '\n'.join(overflows)
def test_all(opts):
test_transactions(opts)
test_schema(opts)
def list_dbs(opts):
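    '''Print the known databases and their tables.'''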
# Read in the DBs we are already aware of
if not os.access(os.path.join(opts.statedir, 'knowndbs.pkl'), os.F_OK):
try:
test_schema(opts)
except InitialRunWarning, e:
# This is expected to be the initial run
print e
knownFile = file(os.path.join(opts.statedir, 'knowndbs.pkl'), 'r')
knownDBs = cPickle.load(knownFile)
knownFile.close()
print 'Databases: %s\n' % knownDBs.keys()
for db in knownDBs:
print ' %s tables:' % db
print ' %s' % knownDBs[db]
print
def _st_run(interval, stats, db, table, sessionDir, opts):
    # If optionList names stattuple-hour as the next command to invoke,
    # we know we are currently on the stattuple-start (initial) pass.
if interval == 'initial':
# Run a vacuum of the database table if this is the first time
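        # maintenance_work_mem is given in kB, so 1048576 is roughly 1 GiB;
        # this speeds up the vacuum at the cost of memory on the server.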
psqlCmd = subprocess.Popen(
('/usr/bin/psql', 'postgres', '-d', db),
stdout=PIPE, stdin=PIPE, env={'PGOPTIONS':'-c maintenance_work_mem=1048576'})
        psqlCmd.communicate('vacuum analyze %s;\n' % table)
if psqlCmd.returncode:
raise DBError, 'Vacuum failed on %s %s' % (db, table)
# Set an at job to invoke the next stattuple job
if interval != 'day':
command = ['/usr/bin/at']
if interval == 'initial':
command.append('now + 1 hours')
elif interval == 'hour':
command.append('now + 5 hours')
elif interval == 'quarter':
command.append('now + 18 hours')
else:
raise Exception, 'Unknown interval %s' % interval
atCmd = subprocess.Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
command = [os.path.abspath(sys.argv[0])]
command.extend(('--session', opts.sessionID, '--database',
db, '--table', table))
command.extend(opts.optionList)
atCmd.stdin.write(' '.join(command))
atCmd.stdin.close()
if atCmd.wait():
print 'Scheduling interval after %s for %s:%s failed' % (interval, db, table)
# Run stattuple of the tables in the db.
psqlCmd = subprocess.Popen(('/usr/bin/psql', 'postgres', '-d', db), stdout=PIPE, stdin=PIPE)
    output = psqlCmd.communicate("\\x\nselect * from pgstattuple('%s');\n" % table)[0].split('\n')
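    # With expanded display (\x) on, each column comes back as its own
    # "key | value" line; the slice skips psql's banner lines and the
    # trailing blanks.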
for line in output[2:-2]:
key, value = line.split(' | ')
value = value.strip()
if value.find('.') >= 0:
value = float(value)
else:
value = int(value)
stats[db][table][interval][key.strip()] = value
def stattuple(opts):
'''Start a stattuple run.
This gathers initial statistics on how much updating is being seen on a
database's tables.
'''
# Ah, what I wouldn't give for functools.partial()
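    # opts.optionList[0] names the *next* stattuple command to schedule,
    # which tells us which sampling pass we are running right now.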
interval = 'day'
if opts.optionList:
if opts.optionList[0].endswith('-hour'):
interval = 'initial'
elif opts.optionList[0].endswith('-quarter'):
interval = 'hour'
elif opts.optionList[0].endswith('-day'):
interval = 'quarter'
# Read in the DBs we are already aware of
if not os.access(os.path.join(opts.statedir, 'knowndbs.pkl'), os.F_OK):
try:
test_schema(opts)
except InitialRunWarning, e:
# This is expected to be the initial run
print e
knownFile = file(os.path.join(opts.statedir, 'knowndbs.pkl'), 'r')
knownDBs = cPickle.load(knownFile)
knownFile.close()
if opts.databases:
dbList = opts.databases
else:
# If no database is selected, we'll collect statistics on all of them
dbList = knownDBs.keys()
# Make sure we have a schema for all requested dbs
for db in dbList:
if db not in knownDBs:
            raise ArgumentError, 'Cannot process unknown DB, %s. Perhaps you need to run "vacstat.py schema" first' % db
if opts.tables:
# Make sure we have a schema for all requested tables
if len(dbList) != 1:
            raise ArgumentError, '--tables can only be used if --databases is specified exactly once.'
for table in opts.tables:
if table not in knownDBs[dbList[0]]:
                raise ArgumentError, 'Cannot process unknown Table %s which is not in db %s. Perhaps you need to run "vacstat.py schema" first' % (table, dbList[0])
tableList = opts.tables
else:
tableList = None
    # Initialize the data struct we'll be saving all our information in
stats = {}
for db in dbList:
stats[db] = {}
for table in knownDBs[db]:
stats[db][table] = {'initial':{},
'hour':{}, 'quarter':{}, 'day':{}}
# If this is our first time, initialize a new outputDir
if not opts.sessionID:
sessionDir = tempfile.mkdtemp(prefix=datetime.datetime.today().strftime('stattuple-%Y%m%d%H%M%S.'), dir=opts.statedir)
opts.sessionID = os.path.basename(sessionDir)
statsFile = file(os.path.join(sessionDir, 'stats.pkl'), 'w')
cPickle.dump(stats, statsFile)
statsFile.close()
else:
        sessionDir = os.path.join(opts.statedir, opts.sessionID)
if tableList:
db = dbList[0]
for table in tableList:
            _st_run(interval, stats, db, table, sessionDir, opts)
else:
for db in dbList:
for table in knownDBs[db]:
                _st_run(interval, stats, db, table, sessionDir, opts)
# Load the current pickled data
statsFile = file(os.path.join(sessionDir, 'stats.pkl'), 'rb+')
fcntl.lockf(statsFile, fcntl.LOCK_EX)
persistentStats = cPickle.load(statsFile)
# Merge old and new information
for db in stats:
for table in stats[db]:
for period in stats[db][table]:
if stats[db][table][period]:
persistentStats[db][table][period] = stats[db][table][period]
# save merged information back to the statsFile
statsFile.truncate(0)
statsFile.seek(0)
cPickle.dump(persistentStats, statsFile)
statsFile.flush()
fcntl.lockf(statsFile, fcntl.LOCK_UN)
statsFile.close()
    if interval == 'day':
        # add each processed db:table to a file to show we're done
        # gathering stats for it
        done = file(os.path.join(sessionDir, 'DONE'), 'a')
        fcntl.lockf(done, fcntl.LOCK_EX)
        for db in dbList:
            for table in (tableList or knownDBs[db]):
                done.write('%s:%s\n' % (db, table))
        fcntl.lockf(done, fcntl.LOCK_UN)
        done.close()
def merge_history(statedir):
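    '''Fold finished stattuple sessions into history.pkl.

    A session counts as finished once its DONE file lists every db:table
    present in its stats.pkl; unfinished sessions are skipped this run.
    '''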
statDirList = sorted(glob.glob(os.path.join(statedir, 'stattuple-*')))
for statDir in statDirList:
finishedTables = {}
timestamp = 0 # When the stat collection finished
# Read data from each session directory.
if not os.path.isdir(statDir):
# Sanity check that this is an output dir
continue
        if os.path.exists(os.path.join(statDir, 'DONE')):
# Read in the tables
finishFile = file(os.path.join(statDir, 'DONE'), 'r')
fcntl.lockf(finishFile, fcntl.LOCK_SH)
# Get the timestamp from the finish file for later
timestamp = os.stat(os.path.join(statDir, 'DONE'))[ST_MTIME]
for line in finishFile:
(db, table) = line.strip().split(':')
if db not in finishedTables:
finishedTables[db] = {}
if table not in finishedTables[db]:
finishedTables[db][table] = {}
fcntl.lockf(finishFile, fcntl.LOCK_UN)
finishFile.close()
else:
# This one is not done yet
continue
# Read the tables from the stat file
statsFile = file(os.path.join(statDir, 'stats.pkl'), 'r')
fcntl.lockf(statsFile, fcntl.LOCK_SH)
stats = cPickle.load(statsFile)
fcntl.lockf(statsFile, fcntl.LOCK_UN)
statsFile.close()
# Check that all tables are done
finished = True
for db in stats:
if db not in finishedTables:
finished = False
break
for table in stats[db]:
if table not in finishedTables[db]:
finished = False
break
if not finished:
break
del finishedTables
if not finished:
continue
#
# Add a new record to our history file
#
historyFilename = os.path.join(statedir, 'history.pkl')
# If the file doesn't exist yet, create it
if not os.access(historyFilename, os.F_OK):
history = {}
historyFile = file(historyFilename, 'w')
fcntl.lockf(historyFile, fcntl.LOCK_EX)
cPickle.dump(history, historyFile)
fcntl.lockf(historyFile, fcntl.LOCK_UN)
historyFile.close()
        historyFile = file(historyFilename, 'rb+')
fcntl.lockf(historyFile, fcntl.LOCK_EX)
history = cPickle.load(historyFile)
# Merge the data we've read in with the historical data
for db in stats:
if db not in history:
history[db] = {}
for table in stats[db]:
if table not in history[db]:
history[db][table] = {}
history[db][table][timestamp] = {}
for interval in stats[db][table]:
history[db][table][timestamp][interval] = {}
for key, value in stats[db][table][interval].items():
if isinstance(value, str):
                            if value.find('.') >= 0:
history[db][table][timestamp][interval][key] = float(value)
else:
history[db][table][timestamp][interval][key] = int(value)
else:
history[db][table][timestamp][interval][key] = value
historyFile.truncate(0)
historyFile.seek(0)
cPickle.dump(history, historyFile)
fcntl.lockf(historyFile, fcntl.LOCK_UN)
historyFile.close()
        # TODO: delete the processed stattuple directory once it has been merged
def analyze_data(opts):
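    '''Classify each table as needing hourly or daily vacuums and print
    suggested cron scripts, based on the most recent stattuple session.
    '''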
merge_history(opts.statedir)
# Read in the history
historyFile = file(os.path.join(opts.statedir, 'history.pkl'), 'r')
fcntl.lockf(historyFile, fcntl.LOCK_SH)
history = cPickle.load(historyFile)
fcntl.lockf(historyFile, fcntl.LOCK_UN)
historyFile.close()
hourly = []
daily = []
suggestions = []
for db in history:
for table in history[db]:
# find latest timestamp for this table
last = sorted(history[db][table].keys())[-1]
run = history[db][table][last]
#
# Battery of tests
#
infrequent = False
frequent = False
vacuumFull = False
if run['day']['free_space'] == 0 and \
run['day']['dead_tuple_len'] == 0 \
and run['day']['table_len'] == 0:
# This table is empty
infrequent = True
# Check how much dead tuples grew absolutely in 24 hours
if run['day']['dead_tuple_len'] <= 10000:
infrequent = True
elif run['day']['dead_tuple_len'] >= 1000000:
frequent = True
# Check how many dead vs live tuples there are
if run['day']['dead_tuple_len'] + run['day']['tuple_len'] == 0:
deadTuplePercent = 0
else:
deadTuplePercent = run['day']['dead_tuple_len'] * 100.0 \
/ (run['day']['dead_tuple_len'] \
+ run['day']['tuple_len'])
if deadTuplePercent > 20:
frequent = True
elif deadTuplePercent < 10:
infrequent = True
# Check how much free space exists
if run['day']['free_space'] + run['day']['tuple_len'] \
+ run['day']['dead_tuple_len'] == 0:
freeSpacePercent = 0
else:
freeSpacePercent = (run['day']['free_space'] \
+ run['day']['dead_tuple_len']) * 100.0 \
/ (run['day']['free_space'] + run['day']['tuple_len'] \
+ run['day']['dead_tuple_len'])
# If free space is larger than 15%, see whether we can use that
# much space between vacuums (Build in a small margin for tables
# that are so small that the free space from the table being
# allocated is > 15%.)
if freeSpacePercent > 15 and run['day']['table_len'] > 524288:
if frequent:
# Calculate roughly how much is used per hour. Take the
# maximum of our samples
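                    # Candidate hourly rates: (initial-day)/24h,
                    # (initial-hour)/1h, and (initial-quarter)/6h.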
                    usage = (run['initial']['free_space'] - run['day']['free_space']) / 24.0
                    if usage < run['initial']['free_space'] - run['hour']['free_space']:
                        usage = run['initial']['free_space'] - run['hour']['free_space']
                    if usage < (run['initial']['free_space'] - run['quarter']['free_space']) / 6.0:
                        usage = (run['initial']['free_space'] - run['quarter']['free_space']) / 6.0
else:
# Calculate how much is used per day
usage = run['initial']['free_space'] - run['day']['free_space']
# If the projected usage between vacuums is < the amount of
# free space we have, recommend a vacuum full.
if usage < run['day']['free_space']:
suggestions.append('Vacuum full %(db)s %(table)s: Freespace Percent %(freeP)s%%, %(freeB)s Bytes\n vacuumdb -zfd %(db)s -t %(table)s' % {'db': db, 'table': table, 'freeP': freeSpacePercent, 'freeB': run['day']['free_space']})
# if a table is large in absolute terms, flag them as
# potentially problematic
# 5GB (For reference, mirrormanager::host_category_dir==1.2GB
# koji::rpmfiles == 20GB)
if run['day']['table_len'] >= 5000000000:
suggestions.append('%s %s is quite large and may cause problems' % (db, table))
# Output suggestions
# Currently we only suggest hourly and daily
if frequent:
hourly.append((db, table))
else:
daily.append((db, table))
print 'hourly cron script:'
print '#!/bin/sh'
print
print "PGOPTIONS='-c maintenance_work_mem=1048576'"
print
for table in hourly:
print '/usr/bin/vacuumdb -z --quiet -d %s -t %s' % (table[0], table[1])
print '\n\ndaily cron script:'
print '#!/bin/sh'
print
print "PGOPTIONS='-c maintenance_work_mem=1048576'"
print
for table in daily:
print '/usr/bin/vacuumdb -z --quiet -d %s -t %s' % (table[0], table[1])
print 'Things to look into further:'
for line in suggestions:
print line
Commands = {'schema': test_schema,
'transactions': test_transactions,
'check': test_all,
'list': list_dbs,
'stattuple-start': stattuple,
'stattuple-hour': stattuple,
'stattuple-quarter': stattuple,
'stattuple-day': stattuple,
'analyze': analyze_data}
def parse_args():
'''Take information from the user about what actions to perform.
'''
parser = optparse.OptionParser(version = __version__, usage='''
vacstat.py COMMAND [options]
COMMAND can be:
transactions: check that we aren't in danger of running out of
transaction ids.
schema: check that the database schema hasn't changed since the
last run. This helps you keep the vacuum policy up to
date by showing you what tables/databases have changed
since the last run.
check: run schema and transactions checks.
list: List dbs and tables that are known.
stattuple-start: Start a stattuple run. This command should be used with
the --database option to prevent overloading the database
server with too many queries at the same time.
stattuple-start will run a vacuum of the database/tables
followed by a stattuple of the tables in the db. It will
save the stattuple output to directories under --statedir
and then set an at job to reinvoke itself in an hour
with the stattuple-hour command.
    analyze: Take the information in --statedir and print
             recommendations for how frequently to vacuum each table,
             output as hourly and daily cron scripts.
** The following commands are used internally and won't produce meaningful
** statistics by themselves. Run stattuple-start instead.
stattuple-hour: Used internally by stattuple-start to run stattuple on
certain databases/tables an hour after vacuuming. The
stattuple output will be saved to --statedir and then it
will set an at job to reinvoke itself in five more hours
with the stattuple-quarter command.
    stattuple-quarter: Used internally by stattuple-hour to run stattuple on
certain databases/tables 6 hours after vacuuming. The
stattuple output will be saved to --statedir and then it
will set an at job to reinvoke itself in 18 hours.
    stattuple-day: Used internally by stattuple-quarter to run stattuple on
                   certain databases/tables 24 hours after vacuuming. The
stattuple output will be saved to --statedir and then
exit.
''')
parser.add_option('-s', '--state-dir',
dest='statedir',
action='store',
default=STATEDIR,
help='Directory to get and store information about databases/tables')
parser.add_option('-d', '--database',
dest='databases',
action='append',
default=[],
help='Database to process. You can specify this option multiple times. Defaults to all')
parser.add_option('-t', '--table',
dest='tables',
action='append',
default=[],
help='Tables to process. This option can only be used if --databases is used to specify exactly one database. You can specify this option multiple times. Defaults to all')
parser.add_option('--session',
dest='sessionID',
action='store',
default='',
help='Internal command line option to pass data between invocations of the program.')
(opts, args) = parser.parse_args()
# Check that we were given a proper command
if len(args) < 1:
raise ArgumentError, 'No command specified'
elif len(args) > 1:
raise ArgumentError, 'Can only specify one command'
if args[0] not in Commands:
raise ArgumentError, 'Unknown Command'
if opts.tables and len(opts.databases) != 1:
raise ArgumentError, '--tables can only be used if --databases is specified exactly once.'
if args[0] in ('schema', 'list', 'check', 'transactions'):
if opts.databases:
raise ArgumentError, 'schema, list, transactions, and check commands cannot be used with --database'
# optionList is used to reinvoke the new stattuple
opts.optionList = []
if args[0].startswith('stattuple') and not args[0].endswith('-day'):
if args[0].endswith('-start'):
opts.optionList.append('stattuple-hour')
elif args[0].endswith('-hour'):
opts.optionList.append('stattuple-quarter')
elif args[0].endswith('-quarter'):
opts.optionList.append('stattuple-day')
opts.optionList.extend(('-s', opts.statedir))
return args[0], opts
def init_statedir(statedir):
# Make sure the statedir is ready
if not os.path.isdir(statedir):
try:
os.makedirs(statedir)
        except OSError:
            raise IOError, 'You do not have permission to create the statedir %s' % statedir
if not os.access(statedir, os.R_OK | os.X_OK | os.W_OK):
raise IOError, 'You do not have permission to use %s as the statedir' % statedir
if __name__ == '__main__':
command, opts = parse_args()
init_statedir(opts.statedir)
Commands[command](opts)
sys.exit(0)