Add the initial items for the getfedora daily stats

This commit is contained in:
Stephen Smoogen 2016-05-05 19:56:03 +00:00
parent 5ddbf54811
commit bcb3611abb
7 changed files with 348 additions and 15 deletions

View file

@ -0,0 +1 @@
# cron.d entry: every day at 07:00, run the getfedora log condenser as root.
0 07 * * * root /usr/local/bin/condense-getfedoralogs.sh

View file

@ -0,0 +1,90 @@
#!/bin/bash
# This file is part of Fedora Project Infrastructure Ansible
# Repository.
#
# Fedora Project Infrastructure Ansible Repository is free software:
# you can redistribute it and/or modify it under the terms of the GNU
# General Public License as published by the Free Software Foundation,
# either version 3 of the License, or (at your option) any later
# version.
#
# Fedora Project Infrastructure Ansible Repository is distributed in
# the hope that it will be useful, but WITHOUT ANY WARRANTY; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License
# along with Fedora Project Infrastructure Ansible Repository. If
# not, see <http://www.gnu.org/licenses/>.
# There is a multiday delay involved in processing the logs. It
# may take up to 4 days to get the logs to the main-server. It may
# take a day to combine all the logs onto combined-httpd. So we assume
# we are 5 days behind.
# Daily condenser for the getfedora.org access log.
#
# Steps:
#   1. Parse the combined access log from NUMDAYS ago into a sorted
#      per-day work file.
#   2. Fold that work file into the month file, and the month file
#      into the year file.
#   3. Pull the rows for OLDDATE out of the year file, turn them into a
#      CSV row and append that to the per-year and "all" CSV reports.
#   4. Redraw the graphs with gnuplot.

NUMDAYS=5
OLDDAYS=$(( NUMDAYS + 1 ))

PROJECT=getfedora
WEBLOG=${PROJECT}.org

# This is the year/month/day for N days ago. date(1) is asked once so
# the three fields can never come from different days.
read -r YEAR MONTH DAY < <(/bin/date -d "-${NUMDAYS} days" '+%Y %m %d')

# And we have to deal with year/month/day boundaries for our later grep.
OLDDATE=$(/bin/date -d "-${OLDDAYS} days" +%Y-%m-%d)
OLDYEAR=${OLDDATE%%-*}

NFSDIR=/mnt/fedora_stats/combined-http
TARGET=${NFSDIR}/latest
LOGFILE=${TARGET}/${WEBLOG}-access.log

WORKDIR=/mnt/fedora_stats/data/${PROJECT}
MONTHDIR=${WORKDIR}/${YEAR}/${MONTH}
WORKFILE=${MONTHDIR}/worked-${DAY}
MONTHFILE=${WORKDIR}/${YEAR}/out-${YEAR}-${MONTH}
YEARFILE=${WORKDIR}/out-${YEAR}

WEBDIR=/var/www/html/csv-reports/${PROJECT}
LSHARE=/usr/local/share/web-data-analysis

TEMPDIR=$(mktemp -d /tmp/web-data-analysis.XXXXXXXXX) || {
  echo "Unable to create a temporary directory." >&2
  exit 1
}
# Clean up the temp data on every exit path, not only on success.
trap 'rm -rf -- "${TEMPDIR}"' EXIT

#######################################
# Merge already-sorted files into one sorted, de-duplicated file.
# Inputs that do not exist are skipped, so the first run of a month or
# year (when the output has not been created yet) still works.
# Arguments: $1 - output file; $2.. - candidate input files
# Returns:   0 when there was nothing to merge, else sort's status
#######################################
merge_sorted() {
  local output=$1
  shift
  local -a inputs=()
  local candidate
  for candidate in "$@"; do
    if [[ -f "${candidate}" ]]; then
      inputs+=("${candidate}")
    fi
  done
  if (( ${#inputs[@]} == 0 )); then
    return 0
  fi
  sort -o "${output}" -S 4G -u -m "${inputs[@]}"
}

mkdir -p "${MONTHDIR}"

if [[ ! -f "${LOGFILE}" ]]; then
  echo "No logfile found for ${YEAR}/${MONTH}/${DAY}. Please fix."
elif [[ -f "${WORKFILE}" ]]; then
  echo "The workfile for ${YEAR}/${MONTH}/${DAY} already existed."
else
  # Parse into a scratch file first: an existing work file means "this
  # day is done", so a failed parse must not leave one behind.
  grep -Eiv 'slurp|bot|yandex|spider|crawler|check_http' "${LOGFILE}" \
    | awk -f "${LSHARE}/${PROJECT}.awk" > "${WORKFILE}.partial"
  parse_status=("${PIPESTATUS[@]}")
  if (( parse_status[1] == 0 )); then
    sort -o "${WORKFILE}" -u "${WORKFILE}.partial"
  else
    echo "Parsing ${LOGFILE} failed; no workfile written." >&2
  fi
  rm -f -- "${WORKFILE}.partial"
fi

# So the data isn't strictly across month boundaries due to the end of
# the logfiles being at 04:00 versus 23:59. Also log files might get
# stuck and you end up with days or weeks of data in a single
# file. Doing a continual sort clears up that.
merge_sorted "${MONTHFILE}" "${MONTHFILE}" "${WORKFILE}"
# Keep what the year file already holds; merging only the current
# month would drop every earlier month from it.
merge_sorted "${YEARFILE}" "${YEARFILE}" "${MONTHFILE}"

# Because the logs stop at 04:00 we can only get 24 hours from 6 days before.
grep -E -- "${OLDDATE}" "${WORKDIR}/out-${OLDYEAR}" > "${TEMPDIR}/watched-day"

# Grab the data and put it in the two files. This makes it a lot
# faster to process as a whole year may take an hour to go through.
for i in "${OLDYEAR}" all; do
  csv=${WEBDIR}/${PROJECT}data-${i}.csv
  awk -f "${LSHARE}/${PROJECT}-data.awk" "${TEMPDIR}/watched-day" >> "${csv}"
  # Sort the file we just appended to, in place.
  sort -o "${csv}" -u "${csv}"
done

gnuplot "${LSHARE}/${PROJECT}-data.gp"

View file

@ -27,6 +27,9 @@
# Settings hunk: how many days back to look, and which project/log the
# paths built later in the script refer to.
let NUMDAYS=5
let OLDDAYS=$(( $NUMDAYS+1 ))
# PROJECT and WEBLOG feed the LOGFILE, WORKDIR and WEBDIR paths below.
# NOTE(review): the later hunks of this file still call mirrorlist.py
# and write mirrordata-*.csv, so "getfedora" here looks like a
# copy/paste slip for the mirrors values -- confirm before deploying.
PROJECT=getfedora
WEBLOG=${PROJECT}.org
# This is the year/month/day for a N days ago.
YEAR=$(/bin/date -d "-${NUMDAYS} days" +%Y)
MONTH=$(/bin/date -d "-${NUMDAYS} days" +%m)
@ -39,15 +42,17 @@ OLDYEAR=$(/bin/date -d "-${OLDDAYS} days" +%Y)
NFSDIR=/mnt/fedora_stats/combined-http
TARGET=${NFSDIR}/latest
LOGFILE=${TARGET}/mirrors.fedoraproject.org-access.log
LOGFILE=${TARGET}/${WEBLOG}-access.log
WORKDIR=/mnt/fedora_stats/data/mirrors
WORKDIR=/mnt/fedora_stats/data/${PROJECT}
WORKFILE=${WORKDIR}/${YEAR}/${MONTH}/worked-${DAY}
WEBDIR=/var/www/html/csv-reports/mirrors
WEBDIR=/var/www/html/csv-reports/${PROJECT}
TEMPDIR=$( mktemp -d /tmp/web-data-analysis.XXXXXXXXX )
LBIN=/usr/local/bin/
LSHARE=/usr/local/share/web-data-analysis
mkdir -p ${WORKDIR}/${YEAR}/${MONTH}
@ -56,7 +61,7 @@ if [[ ! -f ${LOGFILE} ]]; then
elif [[ -f ${WORKFILE} ]]; then
echo "The workfile for ${YEAR}/${MONTH}/${DAY} already existed."
else
/usr/local/bin/mirrorlist.py -o ${WORKFILE} ${LOGFILE};
${LBIN}/mirrorlist.py -o ${WORKFILE} ${LOGFILE};
fi
# So the data isn't strictly across month boundries due to the end of
@ -74,11 +79,11 @@ egrep "${OLDDATE}" ${WORKDIR}/out-${OLDYEAR} > ${TEMPDIR}/watched-day
# faster to process as a whole year may take an hour to go through.
for i in ${OLDYEAR} all; do
awk -f /usr/local/share/web-data-analysis/mirror-data.awk ${TEMPDIR}/watched-day >> ${WEBDIR}/mirrordata-${i}.csv
sort -o ${WEBDIR}/mirrordata-${i}.csv -u ${WEBDIR}/mirrordata-${i}.csv
awk -f ${LSHARE}/${PROJECT}-data.awk ${TEMPDIR}/watched-day >> ${WEBDIR}/${PROJECT}data-${i}.csv
sort -o ${WEBDIR}/mirrordata-${i}.csv -u ${WEBDIR}/${PROJECT}data-${i}.csv
done
gnuplot /usr/local/share/web-data-analysis/mirror-data.gp
gnuplot ${LSHARE}/${PROJECT}-data.gp
# cleanup the temp data
rm -rf ${TEMPDIR}

View file

@ -0,0 +1,106 @@
# Turn "date ip image-name" records into one CSV row of download
# counters per date.
#
# Input fields: $1 is the date, $3 is the image file name ($2 is not
# used here). Records for the same date must be adjacent, because a
# row is flushed whenever $1 changes.
#
# Output: a header line, then one row per date with the total (dt),
# per-edition, per-release, per-architecture and netinstall counts, in
# the column order given by the header.

# Zero every counter before a new date starts.
function reset_counters(    v) {
    dt = 0
    # edition
    atomic = cloud = server = workstation = unk_edt = 0
    # release
    for (v = 21; v <= 29; v++) rel[v] = 0
    unk_rel = 0
    # arch
    arm_32 = arm_64 = x86_32 = x86_64 = ppc_le = ppc_he = s390x = unk_arc = 0
    # additional data
    netinstall = 0
}

# Print the CSV row for the date currently held in olddate.
function emit_row(    v, row) {
    row = olddate "," dt "," atomic "," cloud "," server "," workstation "," unk_edt
    for (v = 21; v <= 29; v++) row = row "," rel[v]
    row = row "," unk_rel "," arm_32 "," arm_64 "," ppc_le "," ppc_he "," s390x
    row = row "," x86_32 "," x86_64 "," unk_arc "," netinstall
    print row
}

# True when img contains tok delimited as ".tok." or "-tok-".
function tagged(img, tok) {
    return index(img, "." tok ".") > 0 || index(img, "-" tok "-") > 0
}

# Count one download of img in the arch, release and edition counters.
function tally(img,    v, known) {
    dt++

    # arch: first match wins, in this order.
    if (tagged(img, "x86_64")) x86_64++
    else if (tagged(img, "i686") || tagged(img, "i386")) x86_32++
    else if (tagged(img, "armhfp")) arm_32++
    else if (tagged(img, "aarch64")) arm_64++
    else if (tagged(img, "ppc64le")) ppc_le++
    else if (tagged(img, "ppc64")) ppc_he++
    else if (tagged(img, "s390x")) s390x++
    else unk_arc++

    # release: the number must appear as "-NN." or "-NN-".
    known = 0
    for (v = 21; v <= 29 && !known; v++) {
        if (index(img, "-" v ".") > 0 || index(img, "-" v "-") > 0) {
            rel[v]++
            known = 1
        }
    }
    if (!known) unk_rel++

    # edition: netinstall images count for their edition as well.
    if (img ~ /Cloud[-_]Atomic/) atomic++
    else if (img ~ /Cloud[-_]Base/) cloud++
    else if (img ~ /Server[-_]DVD/) server++
    else if (img ~ /Server[-_]netinst/) { server++; netinstall++ }
    else if (img ~ /Workstation[-_]netinst/) { workstation++; netinstall++ }
    else if (img ~ /Live[-_]Workstation/) workstation++
    else unk_edt++
}

BEGIN {
    olddate = "1970-01-01"
    reset_counters()
    # The header row starts with the placeholder date rather than a
    # column name (presumably so it sorts ahead of the real dates).
    print olddate ",dt,atomic,cloud,server,workstation,unk_edt,f21,f22,f23,f24,f25,f26,f27,f28,f29,unk_rel,arm_32,arm_64,ppc_le,ppc_he,s390x,x86_32,x86_64,unk_arc,netinstall"
}

{
    if ($1 != olddate) {
        # New date: flush the finished one. For the very first record
        # this emits the all-zero placeholder row, as it always has.
        emit_row()
        olddate = $1
        reset_counters()
    }
    # Count every record, including the one that opened a new date;
    # skipping it left each day one download short.
    tally($3)
}

END {
    emit_row()
}

View file

@ -0,0 +1,53 @@
# Draw the getfedora download graphs (editions, versions and
# architectures) from the CSV that condense-getfedoralogs.sh maintains.
#
# Columns used: 1 date, 2 total, 3-7 editions, 8-17 releases,
# 18-25 architectures, 26 netinstall.
#
# The PNGs are written relative to the directory gnuplot is started in.
# NOTE(review): the x ranges below are fixed dates; rows outside them
# are not drawn -- confirm they are still the wanted windows.

# Read the file the condenser actually writes, by absolute path, so the
# plots do not depend on the working directory.
csvfile = '/var/www/html/csv-reports/getfedora/getfedoradata-all.csv'

set grid
set xdata time
set format x "%Y-%m-%d"
set timefmt "%Y-%m-%d"
set datafile separator ","
set term png size 1600,1200
##
set output "getfedora-editions.png"
set title "Daily Editions Total"
plot ["2014-12-03":"2016-01-18"] \
     csvfile using 1:2 title 'Total number' with lines lw 4,\
     csvfile using 1:3 title 'Atomic' with lines lw 4,\
     csvfile using 1:4 title 'Cloud' with lines lw 4,\
     csvfile using 1:5 title 'Server' with lines lw 4,\
     csvfile using 1:6 title 'Workstation' with lines lw 4,\
     csvfile using 1:26 title 'Netinstall' with lines lw 4,\
     csvfile using 1:7 title 'Unknown' with lines lw 4
unset output
##
set output "getfedora-versions.png"
set title "Daily Version Totals"
plot ["2015-01-01":"2015-12-31"] \
     csvfile using 1:2 title 'Total number' with lines lw 4,\
     csvfile using 1:8 title 'Fedora-21' with lines lw 4,\
     csvfile using 1:9 title 'Fedora-22' with lines lw 4,\
     csvfile using 1:10 title 'Fedora-23' with lines lw 4,\
     csvfile using 1:11 title 'Fedora-24' with lines lw 4,\
     csvfile using 1:12 title 'Fedora-25' with lines lw 4,\
     csvfile using 1:13 title 'Fedora-26' with lines lw 4,\
     csvfile using 1:14 title 'Fedora-27' with lines lw 4,\
     csvfile using 1:15 title 'Fedora-28' with lines lw 4,\
     csvfile using 1:16 title 'Fedora-29' with lines lw 4,\
     csvfile using 1:17 title 'Unknown' with lines lw 4
unset output
##
set output "getfedora-arch.png"
set title "Daily Architectures Totals"
plot ["2015-01-01":"2015-12-31"] \
     csvfile using 1:2 title 'Total number' with lines lw 4,\
     csvfile using 1:18 title 'arm_32' with lines lw 4,\
     csvfile using 1:19 title 'arm_64' with lines lw 4,\
     csvfile using 1:20 title 'ppc_le' with lines lw 4,\
     csvfile using 1:21 title 'ppc_he' with lines lw 4,\
     csvfile using 1:22 title 's390x' with lines lw 4,\
     csvfile using 1:23 title 'x86_32' with lines lw 4,\
     csvfile using 1:24 title 'x86_64' with lines lw 4,\
     csvfile using 1:25 title 'unknown' with lines lw 4
unset output

View file

@ -0,0 +1,74 @@
#
# Take the apache log line
# 123.115.133.104 - - [01/Jan/2015:04:02:01 +0000] "GET /zh_CN/server/download/server-download-splash?file=http://download.fedoraproject.org/pub/fedora/linux/releases/21/Server/i386/iso/Fedora-Server-DVD-i386-21.iso HTTP/1.1" 200 4355 "https://getfedora.org/zh_CN/server/download/" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
# and convert it to "date ip image-name":
# 2015-01-01 123.115.133.104 Fedora-Server-DVD-i386-21.iso
# convertdate(str): turn the Apache timestamp field, for example
# "[01/Jan/2015:04:02:01", into an ISO date such as "2015-01-01".
#
# str      the timestamp field, with or without the leading "[".
# returns  "YYYY-MM-DD"; the month is "00" when the month
#          abbreviation is not one of Jan..Dec.
#
# The names after str are awk locals, so nothing leaks into globals.
function convertdate(str,    stamp, dmy, months, pos, mm) {
    gsub(/\[/, "", str)
    # "01/Jan/2015:04:02:01" -> stamp[1] = "01/Jan/2015"
    split(str, stamp, ":")
    # -> dmy[1] = day, dmy[2] = month abbreviation, dmy[3] = year
    split(stamp[1], dmy, "/")

    # Each abbreviation starts at offset 1, 4, 7, ... in this string.
    months = "JanFebMarAprMayJunJulAugSepOctNovDec"
    pos = index(months, dmy[2])
    # Length and alignment checks reject partial hits such as "anF".
    if (length(dmy[2]) == 3 && pos % 3 == 1)
        mm = sprintf("%02d", (pos + 2) / 3)
    else
        mm = "00"

    return dmy[3] "-" mm "-" dmy[1]
}
# getimage(str): return the image file name from a request path.
#
# str      either a direct path ("/pub/.../Fedora.iso") or a splash
#          page URL carrying the real target after "=", e.g.
#          "/server/download/splash?file=http://host/.../Fedora.iso".
# returns  everything after the last "/" or "=" in str; "" when str
#          ends with one of them.
#
# Cutting at the last separator keeps the result correct when the
# query string holds more than one "="; cutting at the first "="
# returned a truncated name in that case.
function getimage(str) {
    # str is a by-value copy, so editing it does not affect the caller.
    sub(/^.*[\/=]/, "", str)
    return str
}
# For each GET request answered with 200 or 302 whose path ends in an
# image extension, print "date ip image-name".
$6 ~ /GET/ && $9 ~ /302|200/ && $7 ~ /\.qcow2$|\.iso$|\.raw\.xz$|\.box$/ {
    print convertdate($4), $1, getimage($7)
}

View file

@ -32,28 +32,32 @@
tags:
- web-data
- name: script to run the daily mirror log condenser
copy: src=condense-mirrorlogs.sh dest=/usr/local/bin/ mode=0755
- name: scripts to condense data down for further processing
copy: src={{item}} dest=/usr/local/bin/ mode=0755
with_items: [condense-mirrorlogs.sh, condense-getfedoralogs.sh]
tags:
- web-data
- name: python script to calculate mirrorlist
- name: python scripts to calculate various data
copy: src=mirrorlist.py dest=/usr/local/bin mode=0755
tags:
- web-data
- name: awk file for csv creation
copy: src=mirror-data.awk dest=/usr/local/share/web-data-analysis mode=0644
# getfedora.awk is the log parser that condense-getfedoralogs.sh runs
# first (awk -f ${LSHARE}/${PROJECT}.awk); install it with the others.
- name: awk files for csv creation
  copy: src={{item}} dest=/usr/local/share/web-data-analysis mode=0644
  with_items: [mirror-data.awk, getfedora-data.awk, getfedora.awk]
  tags:
  - web-data
- name: gnuplot file for csv creation
copy: src=mirror-data.gp dest=/usr/local/share/web-data-analysis mode=0644
# Install the gnuplot recipes that turn the CSV reports into images.
- name: gnuplot file for image creation
  copy: src={{item}} dest=/usr/local/share/web-data-analysis mode=0644
  with_items: [ mirror-data.gp, getfedora-data.gp ]
  tags:
  - web-data
- name: daily cron file to run the log files
copy: src=condense-mirrorlogs.cron dest=/etc/cron.d/ mode=0644
copy: src={{item}} dest=/etc/cron.d/ mode=0644
with_items: [condense-mirrorlogs.cron, condense-getfedoralogs.cron]
tags:
- web-data
- cron