ansible/roles/web-data-analysis/files/mirrorlist.py
2018-03-21 22:03:52 +00:00

590 lines
15 KiB
Python

#!/usr/bin/python
# This file is part of Fedora Project Infrastructure Ansible
# Repository.
#
# Fedora Project Infrastructure Ansible Repository is free software:
# you can redistribute it and/or modify it under the terms of the GNU
# General Public License as published by the Free Software Foundation,
# either version 3 of the License, or (at your option) any later
# version.
#
# Fedora Project Infrastructure Ansible Repository is distributed in
# the hope that it will be useful, but WITHOUT ANY WARRANTY; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License
# along with Fedora Project Infrastructure Ansible Repository. If
# not, see <http://www.gnu.org/licenses/>.
import sys
import re
import optparse
import os
import string
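'''
mirrorlist.py goes through each log file given as an argument and
parses out which releases and architectures were requested. Every
unique (date, client IP, release, architecture) combination found in
a log file is appended to the output file as one line; the output can
then be counted to see how much each release and architecture was
requested.
'''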
# Regular-expression fragments for one access-log line, in field order:
# host, identity, user, [time], "request", status, bytes, "referrer",
# "user_agent".  Each fragment captures its field in a named group and,
# except for the last one, requires one whitespace character after it.
log_line = [
# host: a run of digits and dots, or a run of hex digits and colons
r"(?P<host>([\d\.]+|[0-9a-fA-F\:]+))\s",
# identity and user: any run of non-whitespace (may be empty)
r"(?P<identity>\S*)\s",
r"(?P<user>\S*)\s",
# time: everything between the square brackets (non-greedy)
r"\[(?P<time>.*?)\]\s",
# request: everything between the double quotes (non-greedy)
r'"(?P<request>.*?)"\s',
# status: one or more digits
r"(?P<status>\d+)\s",
# bytes: any run of non-whitespace (may be empty)
r"(?P<bytes>\S*)\s",
# referrer and user_agent: quoted strings; trailing whitespace after the
# user agent is optional
r'"(?P<referrer>.*?)"\s',
r'"(?P<user_agent>.*?)"\s*',
]
# The fragments are concatenated and compiled once, at import time.
pattern = re.compile("".join(log_line))
def _build_repo_dict():
    """Assemble the lookup table from cleaned-up repo names to release tags.

    Keys are the spellings that remain once a requested repo name has
    been normalised; values are the canonical tags written to the
    output ("epel7", "f27", "modular_f28", "rawhide", "rhel6", ...).
    Entries are inserted in a fixed order, group by group.
    """
    table = {}

    # EPEL 4-9: the bare name and every point release .0 through .20
    # collapse onto the major release.
    for major in range(4, 10):
        tag = "epel%d" % major
        table[tag] = tag
        for minor in range(21):
            table["%s.%d" % (tag, minor)] = tag

    # Rawhide spellings.
    table["rawhide"] = "rawhide"
    table["frawhide"] = "rawhide"
    table["rawhidemodular"] = "rawhide_modular"

    # Bare release numbers 3..33 map to the zero-padded "fNN" tag.
    for release in range(3, 34):
        table["%d" % release] = "f%02d" % release

    # Dotted version strings such as 6.89 .. 6.93 are counted as the next
    # release (6.x -> f07, 7.x -> f08, ...).  The set of suffixes is not
    # uniform: 8 has no ".89" and 9 has an extra ".90.1".
    usual = ("89", "90", "91", "92", "93")
    dotted_versions = (
        ("6", usual),
        ("7", usual),
        ("8", ("90", "91", "92", "93")),
        ("9", ("90", "90.1", "91", "92", "93")),
        ("10", usual),
        ("11", usual),
        ("12", usual),
    )
    # First every plain spelling, then the same list again with an "f" prefix.
    for prefix in ("", "f"):
        for base, suffixes in dotted_versions:
            tag = "f%02d" % (int(base) + 1)
            for suffix in suffixes:
                table["%s%s.%s" % (prefix, base, suffix)] = tag

    # "f3" .. "f9" without zero padding.
    for release in range(3, 10):
        table["f%d" % release] = "f%02d" % release

    # "f03" .. "f33" map to themselves.
    for release in range(3, 34):
        tag = "f%02d" % release
        table[tag] = tag

    # Modular repos for releases 27..33, in both word orders.
    for template in ("fmodular%d", "modularf%d"):
        for release in range(27, 34):
            table[template % release] = "modular_f%d" % release

    # RHEL 4-9 map to themselves.
    for major in range(4, 10):
        tag = "rhel%d" % major
        table[tag] = tag

    return table


repo_dict = _build_repo_dict()
repo_keys = repo_dict.keys()
# Month abbreviations as they appear in the log timestamp, mapped to
# two-digit month numbers.
_APACHE_MONTHS = {
    'Jan': '01',
    'Feb': '02',
    'Mar': '03',
    'Apr': '04',
    'May': '05',
    'Jun': '06',
    'Jul': '07',
    'Aug': '08',
    'Sep': '09',
    'Oct': '10',
    'Nov': '11',
    'Dec': '12',
}


def breakoutdate(givendate):
    """Turn a log timestamp into a "YYYY-MM-DD" string.

    givendate is the text between the square brackets of a log line,
    e.g. "21/Mar/2018:22:03:52 +0000".  Only the part before the first
    colon is used; it must be "day/Mon/year".

    Returns "1970-01-01" (the epoch) when the timestamp is corrupted:
    wrong number of "/"-separated parts, an unknown month abbreviation,
    or a value that is not a string.
    """
    try:
        day, month, year = givendate.split(":")[0].split('/')
        # The month lookup stays inside the try so an unknown month falls
        # back to the epoch instead of raising KeyError.
        return "%s-%s-%s" % (year, _APACHE_MONTHS[month], day)
    except (ValueError, KeyError, AttributeError):
        return "1970-01-01"
def breakoutrepo(request):
    """Extract the raw repo and arch values from a request line.

    request is the quoted request field of a log line, e.g.
    'GET /mirrorlist?repo=fedora-27&arch=x86_64 HTTP/1.1'.  The second
    whitespace-separated token is taken as the URL, and the text between
    its first and second "?" as the query string.

    Returns a (repo, arch) tuple of the unprocessed values.  A value is
    "" when the query string has no matching parameter.  Returns
    ("unknown_repo", "unknown_arch") when the request has no URL token
    or the URL has no query string.
    """
    try:
        query = request.split()[1].split("?")[1]
    except (IndexError, AttributeError):
        # No URL token, no "?" in the URL, or request is not a string.
        return ("unknown_repo", "unknown_arch")

    repo = ""
    arch = ""
    for param in query.split("&"):
        # Substring match on purpose: it also accepts parameters carrying
        # a prefix before the name.  The last match wins.
        if 'repo=' in param:
            repo = param.split('=')[1]
        if 'arch=' in param:
            arch = param.split('=')[1]
    return (repo, arch)
def figureoutrepo(asked_repo):
    """Map a requested repo name onto a canonical release tag.

    asked_repo is the raw repo value from the query string.  It is
    lower-cased, cut at the first junk character, distribution words are
    collapsed to "f", repo-flavour words and dashes are removed, and the
    result is looked up in repo_dict.

    Returns the repo_dict value for the cleaned-up name, or
    "unknown_repo" when it is not a key of repo_dict.
    """
    junk_chars = "/$!#%&'\"()*+,_:;<>=?@[^|"
    spew = asked_repo.lower()
    # Keep only what precedes the first junk character.  The result of the
    # split has to be assigned back, otherwise this loop does nothing.
    for char in junk_chars:
        spew = spew.split(char)[0]

    # Every distribution word becomes a bare "f" prefix.
    for word in ("core", "fedora", "extras", "legacy", "fc"):
        spew = spew.replace(word, "f")

    # Words that name a flavour of a repo rather than a release are dropped.
    for word in (".newkey", "install", "alpha", "beta", "client", "debug",
                 "devel", "info", "optional", "preview", "released",
                 "source", "testing", "updates"):
        spew = spew.replace(word, "")

    spew = spew.replace("centosplus", "centos")

    # Edition names and everything after them are dropped.  "client" needs
    # no entry here: it has already been removed by the loop above.
    for word in ("cloud", "server", "workstation"):
        spew = re.sub(word + ".*", "", spew)

    spew = spew.replace("-", "")

    return repo_dict.get(spew.strip(), "unknown_repo")
# Spellings of an architecture name mapped to the tag written to the output.
_ARCH_ALIASES = {
    'i386': 'i386',
    'i486': 'i386',
    'i586': 'i386',
    'i686': 'i386',
    'athlon': 'i386',
    'pentium': 'i386',
    'pentium3': 'i386',
    'pentium4': 'i386',
    'pentium5': 'i386',
    'ia32': 'i386',
    'x86': 'i386',
    'x86_32': 'i386',
    'x86_64': 'x86_64',
    'amd64': 'x86_64',
    'aarch64': 'aarch64',
    'alpha': 'alpha',
    'arm': 'arm',
    'arm64': 'aarch64',
    'armhfp': 'arm',
    'armv3l': 'arm',
    'armv5tel': 'arm',
    'armv7hl': 'arm',
    'ia64': 'ia64',
    'mips': 'mips',
    'mips64': 'mips64',
    'mips64el': 'mips64',
    # 'powepc' looks like a misspelling of 'powerpc'; it is kept so that
    # anything it matched before still matches, and the correct spelling
    # is accepted as well.
    'powepc': 'ppc',
    'powerpc': 'ppc',
    'ppc': 'ppc',
    'ppc32': 'ppc',
    'ppc64': 'ppc64',
    'ppc64le': 'ppc64le',
    's390': 's390',
    's390x': 's390',
    'sparc': 'sparc',
    'sparc64': 'sparc64',
    'tilegx': 'tilegx',
}

# The arch value is cut at the first occurrence of any of these characters.
# "_" is deliberately absent so that "x86_64" survives.
_ARCH_JUNK_CHARS = "/!#%&'(*+,-.:;<=>?@[]^\"\\|$"


def figureoutarch(asked_arch):
    """Map a requested architecture name onto a canonical architecture tag.

    asked_arch is the raw arch value from the query string.  Everything
    from the first junk character onwards is discarded and the rest is
    lower-cased before the lookup.

    Returns the canonical tag, or "unknown_arch" when the cleaned-up
    name is not a known spelling.
    """
    spew = asked_arch
    for char in _ARCH_JUNK_CHARS:
        spew = spew.split(char)[0]
    return _ARCH_ALIASES.get(spew.lower(), "unknown_arch")
def parseline(our_line):
    """Reduce one log line to "date ip repo arch", or "" if it is not wanted.

    Only lines containing '/metalink' or '/mirrorlist' that also match
    the compiled log pattern are processed; every other line yields the
    empty string.
    """
    # Cheap substring test first, so most lines never reach the regex.
    if '/metalink' not in our_line and '/mirrorlist' not in our_line:
        return ""

    matched = pattern.match(our_line)
    if not matched:
        return ""

    fields = matched.groupdict()
    when = breakoutdate(fields['time'])
    asked_repo, asked_arch = breakoutrepo(fields['request'])
    return "%s %s %s %s" % (
        when,
        fields['host'],
        figureoutrepo(asked_repo),
        figureoutarch(asked_arch),
    )
def parselog(our_file, out_file):
    """Parse one log file and append its unique results to the output file.

    our_file is the path of the log to read; out_file is the path of the
    file to append to.  Every line of the log goes through parseline();
    the non-empty results are de-duplicated, sorted and appended to
    out_file, one per line.

    Writes a message to stderr and exits the program with status -1 when
    either file cannot be opened.  Returns None.
    """
    yumclients_set = set()

    try:
        data = open(our_file, "r")
    except (IOError, OSError):
        sys.stderr.write("Unable to open %s\n" % our_file)
        sys.exit(-1)
    # The with-block closes the log even if parsing a line raises.
    with data:
        for line in data:
            parsed = parseline(line)
            if parsed:
                yumclients_set.add(parsed)

    try:
        output = open(out_file, "a")
    except (IOError, OSError):
        sys.stderr.write("Unable to open outputfile\n")
        sys.exit(-1)
    sys.stderr.write("Outputting data: %s\n" % our_file)
    with output:
        for line in sorted(yumclients_set):
            # The file is opened in text mode, which already translates
            # "\n" to the platform line ending; os.linesep must not be
            # written to it.
            output.write(line + "\n")
    return
def main():
    """Command-line entry point: parse the options, then process each log.

    Accepts -o/--output for the output file name (default "output.txt");
    every remaining argument is treated as a log file and handed to
    parselog() in the order given.
    """
    parser = optparse.OptionParser(
        usage="%prog [-o output-filename] logfile1 [logfile2...]",
        version="1.0.2",
        prog="mirrorlist.py",
        description="A program to parse Fedora mirrorlist apache common log format files.",
    )
    parser.add_option(
        "-o", "--output",
        dest="output",
        default="output.txt",
        help="Sets the name of the output file for the run.",
    )
    options, logfiles = parser.parse_args()

    # An empty -o value falls back to the default name.
    out_file = options.output or "output.txt"

    for our_file in logfiles:
        parselog(our_file, out_file)


if __name__ == '__main__':
    main()