From bcb3611abbdcad6bcaaf9c88004030966428d65e Mon Sep 17 00:00:00 2001
From: Stephen Smoogen
Date: Thu, 5 May 2016 19:56:03 +0000
Subject: [PATCH] and we have the initial items for a getfedora daily stats

Review fixes folded into this patch:

* condense-getfedoralogs.sh: sort the getfedora CSV in place instead of
  writing the sorted copy to mirrordata-*.csv.
* condense-mirrorlogs.sh: keep the script pointed at the mirrors log,
  work directory, awk and gnuplot files; only the paths are factored
  into variables.
* getfedora-data.awk: classify the record that starts a new day instead
  of dropping it.
* getfedora.awk: drop an unused assignment and correct the example
  output in the header comment.
* tasks/main.yml: close the {{item}} template in the gnuplot task and
  install getfedora.awk, which condense-getfedoralogs.sh runs.
---
 .../files/condense-getfedoralogs.cron         |   1 +
 .../files/condense-getfedoralogs.sh           |  90 +++++++++++++++
 .../files/condense-mirrorlogs.sh              |  17 ++--
 .../files/getfedora-data.awk                  | 106 ++++++++++++++++++
 .../web-data-analysis/files/getfedora-data.gp |  53 +++++++++
 roles/web-data-analysis/files/getfedora.awk   |  74 ++++++++++++
 roles/web-data-analysis/tasks/main.yml        |  20 ++--
 7 files changed, 347 insertions(+), 14 deletions(-)
 create mode 100644 roles/web-data-analysis/files/condense-getfedoralogs.cron
 create mode 100644 roles/web-data-analysis/files/condense-getfedoralogs.sh
 create mode 100644 roles/web-data-analysis/files/getfedora-data.awk
 create mode 100644 roles/web-data-analysis/files/getfedora-data.gp
 create mode 100644 roles/web-data-analysis/files/getfedora.awk

diff --git a/roles/web-data-analysis/files/condense-getfedoralogs.cron b/roles/web-data-analysis/files/condense-getfedoralogs.cron
new file mode 100644
index 0000000000..c758f0c0e1
--- /dev/null
+++ b/roles/web-data-analysis/files/condense-getfedoralogs.cron
@@ -0,0 +1 @@
+0 07 * * * root /usr/local/bin/condense-getfedoralogs.sh
diff --git a/roles/web-data-analysis/files/condense-getfedoralogs.sh b/roles/web-data-analysis/files/condense-getfedoralogs.sh
new file mode 100644
index 0000000000..ed1e8b4e09
--- /dev/null
+++ b/roles/web-data-analysis/files/condense-getfedoralogs.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+
+# This file is part of Fedora Project Infrastructure Ansible
+# Repository.
+#
+# Fedora Project Infrastructure Ansible Repository is free software:
+# you can redistribute it and/or modify it under the terms of the GNU
+# General Public License as published by the Free Software Foundation,
+# either version 3 of the License, or (at your option) any later
+# version.
+#
+# Fedora Project Infrastructure Ansible Repository is distributed in
+# the hope that it will be useful, but WITHOUT ANY WARRANTY; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Fedora Project Infrastructure Ansible Repository. If
+# not, see <http://www.gnu.org/licenses/>.
+
+# There is a multiday delay involved in processing the logs. It
+# may take up to 4 days to get the logs to the main-server. It may
+# take a day to combine all the logs onto combined-httpd. So we assume
+# we are 5 days behind.
+
+let NUMDAYS=5
+let OLDDAYS=$(( $NUMDAYS+1 ))
+
+PROJECT=getfedora
+WEBLOG=${PROJECT}.org
+
+# This is the year/month/day for a N days ago.
+YEAR=$(/bin/date -d "-${NUMDAYS} days" +%Y)
+MONTH=$(/bin/date -d "-${NUMDAYS} days" +%m)
+DAY=$(/bin/date -d "-${NUMDAYS} days" +%d)
+
+# And we have to deal with year/month/day boundaries for our later grep.
+OLDDATE=$(/bin/date -d "-${OLDDAYS} days" +%Y-%m-%d)
+OLDYEAR=$(/bin/date -d "-${OLDDAYS} days" +%Y)
+
+NFSDIR=/mnt/fedora_stats/combined-http
+TARGET=${NFSDIR}/latest
+
+LOGFILE=${TARGET}/${WEBLOG}-access.log
+
+WORKDIR=/mnt/fedora_stats/data/${PROJECT}
+WORKFILE=${WORKDIR}/${YEAR}/${MONTH}/worked-${DAY}
+
+WEBDIR=/var/www/html/csv-reports/${PROJECT}
+
+TEMPDIR=$( mktemp -d /tmp/web-data-analysis.XXXXXXXXX )
+
+LBIN=/usr/local/bin/
+LSHARE=/usr/local/share/web-data-analysis
+
+mkdir -p ${WORKDIR}/${YEAR}/${MONTH}
+
+if [[ ! -f ${LOGFILE} ]]; then
+    echo "No logfile found for ${YEAR}/${MONTH}/${DAY}. Please fix."
+elif [[ -f ${WORKFILE} ]]; then
+    echo "The workfile for ${YEAR}/${MONTH}/${DAY} already existed."
+else
+    egrep -iv 'slurp|bot|yandex|spider|crawler|check_http' ${LOGFILE} | awk -f ${LSHARE}/${PROJECT}.awk > ${WORKFILE}
+    sort -o ${WORKFILE} -u ${WORKFILE}
+fi
+
+# So the data isn't strictly across month boundaries due to the end of
+# the logfiles being at 04:00 versus 23:59. Also log files might get
+# stuck and you end up with days or weeks of data in a single
+# file. Doing a continual sort clears up that.
+
+sort -o ${WORKDIR}/${YEAR}/out-${YEAR}-${MONTH} -S 4G -u -m ${WORKDIR}/${YEAR}/out-${YEAR}-${MONTH} ${WORKFILE}
+sort -o ${WORKDIR}/out-${YEAR} -S 4G -u -m ${WORKDIR}/${YEAR}/out-${YEAR}-${MONTH}
+
+# Because the logs stop at 04:00 we can only get 24 hours from 6 days before.
+egrep "${OLDDATE}" ${WORKDIR}/out-${OLDYEAR} > ${TEMPDIR}/watched-day
+
+# Grab the data and put it in the two files. This makes it a lot
+# faster to process as a whole year may take an hour to go through.
+
+for i in ${OLDYEAR} all; do
+    awk -f ${LSHARE}/${PROJECT}-data.awk ${TEMPDIR}/watched-day >> ${WEBDIR}/${PROJECT}data-${i}.csv
+    sort -o ${WEBDIR}/${PROJECT}data-${i}.csv -u ${WEBDIR}/${PROJECT}data-${i}.csv
+done
+
+gnuplot ${LSHARE}/${PROJECT}-data.gp
+
+# cleanup the temp data
+rm -rf ${TEMPDIR}
diff --git a/roles/web-data-analysis/files/condense-mirrorlogs.sh b/roles/web-data-analysis/files/condense-mirrorlogs.sh
index bf65d67927..34bb8acc2c 100644
--- a/roles/web-data-analysis/files/condense-mirrorlogs.sh
+++ b/roles/web-data-analysis/files/condense-mirrorlogs.sh
@@ -27,6 +27,9 @@
 let NUMDAYS=5
 let OLDDAYS=$(( $NUMDAYS+1 ))
 
+PROJECT=mirrors
+WEBLOG=mirrors.fedoraproject.org
+
 # This is the year/month/day for a N days ago.
 YEAR=$(/bin/date -d "-${NUMDAYS} days" +%Y)
 MONTH=$(/bin/date -d "-${NUMDAYS} days" +%m)
@@ -39,15 +42,17 @@ OLDYEAR=$(/bin/date -d "-${OLDDAYS} days" +%Y)
 NFSDIR=/mnt/fedora_stats/combined-http
 TARGET=${NFSDIR}/latest
 
-LOGFILE=${TARGET}/mirrors.fedoraproject.org-access.log
+LOGFILE=${TARGET}/${WEBLOG}-access.log
 
-WORKDIR=/mnt/fedora_stats/data/mirrors
+WORKDIR=/mnt/fedora_stats/data/${PROJECT}
 WORKFILE=${WORKDIR}/${YEAR}/${MONTH}/worked-${DAY}
 
-WEBDIR=/var/www/html/csv-reports/mirrors
+WEBDIR=/var/www/html/csv-reports/${PROJECT}
 
 TEMPDIR=$( mktemp -d /tmp/web-data-analysis.XXXXXXXXX )
 
+LBIN=/usr/local/bin/
+LSHARE=/usr/local/share/web-data-analysis
 
 mkdir -p ${WORKDIR}/${YEAR}/${MONTH}
 
@@ -56,7 +61,7 @@ if [[ ! -f ${LOGFILE} ]]; then
 elif [[ -f ${WORKFILE} ]]; then
     echo "The workfile for ${YEAR}/${MONTH}/${DAY} already existed."
 else
-    /usr/local/bin/mirrorlist.py -o ${WORKFILE} ${LOGFILE};
+    ${LBIN}/mirrorlist.py -o ${WORKFILE} ${LOGFILE};
 fi
 
 # So the data isn't strictly across month boundries due to the end of
@@ -74,11 +79,11 @@ egrep "${OLDDATE}" ${WORKDIR}/out-${OLDYEAR} > ${TEMPDIR}/watched-day
 # faster to process as a whole year may take an hour to go through.
 
 for i in ${OLDYEAR} all; do
-    awk -f /usr/local/share/web-data-analysis/mirror-data.awk ${TEMPDIR}/watched-day >> ${WEBDIR}/mirrordata-${i}.csv
+    awk -f ${LSHARE}/mirror-data.awk ${TEMPDIR}/watched-day >> ${WEBDIR}/mirrordata-${i}.csv
     sort -o ${WEBDIR}/mirrordata-${i}.csv -u ${WEBDIR}/mirrordata-${i}.csv
 done
 
-gnuplot /usr/local/share/web-data-analysis/mirror-data.gp
+gnuplot ${LSHARE}/mirror-data.gp
 
 # cleanup the temp data
 rm -rf ${TEMPDIR}
diff --git a/roles/web-data-analysis/files/getfedora-data.awk b/roles/web-data-analysis/files/getfedora-data.awk
new file mode 100644
index 0000000000..7d3aa12f9e
--- /dev/null
+++ b/roles/web-data-analysis/files/getfedora-data.awk
@@ -0,0 +1,106 @@
+
+
+BEGIN{
+    olddate="1970-01-01"
+    dt = 0;
+    #edition
+    atomic=0;
+    cloud=0;
+    server=0;
+    workstation=0;
+    unk_edt=0;
+    # release
+    f21=0;
+    f22=0;
+    f23=0;
+    f24=0;
+    f25=0;
+    f26=0;
+    f27=0;
+    f28=0;
+    f29=0;
+    unk_rel=0;
+    # arch
+    arm_32=0;
+    arm_64=0;
+    x86_32=0;
+    x86_64=0;
+    ppc_le=0;
+    ppc_he=0;
+    s390x=0;
+    unk_arc=0;
+    # additional data
+    netinstall=0;
+    print olddate ",dt,atomic,cloud,server,workstation,unk_edt,f21,f22,f23,f24,f25,f26,f27,f28,f29,unk_rel,arm_32,arm_64,ppc_le,ppc_he,s390x,x86_32,x86_64,unk_arc,netinstall"
+}
+
+{
+    if ($1 != olddate) {
+        print olddate "," dt "," atomic "," cloud "," server "," workstation "," unk_edt "," f21 "," f22 "," f23 "," f24 "," f25 "," f26 "," f27 "," f28 "," f29 "," unk_rel "," arm_32 "," arm_64 "," ppc_le "," ppc_he "," s390x "," x86_32 "," x86_64 "," unk_arc "," netinstall
+        olddate=$1
+        dt = 0;
+        #edition
+        atomic=0;
+        cloud=0;
+        server=0;
+        workstation=0;
+        unk_edt=0;
+        # release
+        f21=0;
+        f22=0;
+        f23=0;
+        f24=0;
+        f25=0;
+        f26=0;
+        f27=0;
+        f28=0;
+        f29=0;
+        unk_rel=0;
+        # arch
+        arm_32=0;
+        arm_64=0;
+        x86_32=0;
+        x86_64=0;
+        ppc_le=0;
+        ppc_he=0;
+        s390x=0;
+        unk_arc=0;
+        # additional data
+        netinstall=0;
+    }
+
+    # Classify every record, including the one that just started a new day.
+    if (($3 ~/\.x86_64\./) || ($3 ~/-x86_64-/)) { x86_64 = x86_64 +1; }
+    else if (($3 ~/\.i686\./) || ($3 ~/-i686-/) || ($3 ~/\.i386\./) || ($3 ~/-i386-/)) { x86_32 = x86_32 +1; }
+    else if (($3 ~/\.armhfp\./) || ($3 ~/-armhfp-/)){ arm_32 = arm_32 +1; }
+    else if (($3 ~/\.aarch64\./) || ($3 ~/-aarch64-/)){ arm_64= arm_64 +1; }
+    else if (($3 ~/\.ppc64le\./) || ($3 ~/-ppc64le-/)){ ppc_le = ppc_le +1; }
+    else if (($3 ~/\.ppc64\./) || ($3 ~/-ppc64-/)){ ppc_he = ppc_he +1; }
+    else if (($3 ~/\.s390x\./) || ($3 ~/-s390x-/)){ s390x = s390x +1; }
+    else { unk_arc = unk_arc +1 };
+
+    if (($3 ~/-21\./) || ($3 ~/-21-/)) { f21 = f21 + 1 }
+    else if (($3 ~/-22\./) || ($3 ~/-22-/)) { f22 = f22 + 1 }
+    else if (($3 ~/-23\./) || ($3 ~/-23-/)) { f23 = f23 + 1 }
+    else if (($3 ~/-24\./) || ($3 ~/-24-/)) { f24 = f24 + 1 }
+    else if (($3 ~/-25\./) || ($3 ~/-25-/)) { f25 = f25 + 1 }
+    else if (($3 ~/-26\./) || ($3 ~/-26-/)) { f26 = f26 + 1 }
+    else if (($3 ~/-27\./) || ($3 ~/-27-/)) { f27 = f27 + 1 }
+    else if (($3 ~/-28\./) || ($3 ~/-28-/)) { f28 = f28 + 1 }
+    else if (($3 ~/-29\./) || ($3 ~/-29-/)) { f29 = f29 + 1 }
+    else {unk_rel = unk_rel +1 }
+
+    if (($3 ~/Cloud-Atomic/) || ($3 ~/Cloud_Atomic/)) { atomic = atomic +1 ; dt = dt +1 }
+    else if (($3 ~/Cloud-Base/) || ($3 ~/Cloud_Base/)) { cloud = cloud +1 ; dt = dt +1 }
+    else if (($3 ~/Server-DVD/) || ($3 ~/Server_DVD/)) { server = server +1; dt = dt +1 }
+    else if (($3 ~/Server-netinst/) || ($3 ~/Server_netinst/)) { server = server +1; netinstall = netinstall +1 ; dt = dt +1 }
+    else if (($3 ~/Workstation-netinst/) || ($3 ~/Workstation_netinst/)) { workstation = workstation +1; netinstall = netinstall +1; dt = dt +1 }
+    else if (($3 ~/Live-Workstation/) || ($3 ~/Live_Workstation/)) { workstation = workstation +1; dt = dt +1 }
+    else { unk_edt = unk_edt + 1; dt = dt +1}
+
+}
+
+END {
+    print olddate "," dt "," atomic "," cloud "," server "," workstation "," unk_edt "," f21 "," f22 "," f23 "," f24 "," f25 "," f26 "," f27 "," f28 "," f29 "," unk_rel "," arm_32 "," arm_64 "," ppc_le "," ppc_he "," s390x "," x86_32 "," x86_64 "," unk_arc "," netinstall
+}
+
diff --git a/roles/web-data-analysis/files/getfedora-data.gp b/roles/web-data-analysis/files/getfedora-data.gp
new file mode 100644
index 0000000000..6891e2cab8
--- /dev/null
+++ b/roles/web-data-analysis/files/getfedora-data.gp
@@ -0,0 +1,53 @@
+set grid
+set xdata time
+set format x "%Y-%m-%d"
+set timefmt "%Y-%m-%d"
+
+set datafile separator ","
+set term png size 1600,1200
+
+##
+set output "getfedora-editions.png"
+set title "Daily Editions Total"
+plot ["2014-12-03":"2016-01-18"] \
+    'getfedora-all.csv' using 1:2 title 'Total number' with lines lw 4,\
+    'getfedora-all.csv' using 1:3 title 'Atomic' with lines lw 4,\
+    'getfedora-all.csv' using 1:4 title 'Cloud' with lines lw 4,\
+    'getfedora-all.csv' using 1:5 title 'Server' with lines lw 4,\
+    'getfedora-all.csv' using 1:6 title 'Workstation' with lines lw 4,\
+    'getfedora-all.csv' using 1:26 title 'Netinstall' with lines lw 4,\
+    'getfedora-all.csv' using 1:7 title 'Unknown' with lines lw 4
+unset output
+
+##
+set output "getfedora-versions.png"
+set title "Daily Version Totals"
+plot ["2015-01-01":"2015-12-31"] \
+    'getfedora-all.csv' using 1:2 title 'Total number' with lines lw 4,\
+    'getfedora-all.csv' using 1:8 title 'Fedora-21' with lines lw 4,\
+    'getfedora-all.csv' using 1:9 title 'Fedora-22' with lines lw 4,\
+    'getfedora-all.csv' using 1:10 title 'Fedora-23' with lines lw 4,\
+    'getfedora-all.csv' using 1:11 title 'Fedora-24' with lines lw 4,\
+    'getfedora-all.csv' using 1:12 title 'Fedora-25' with lines lw 4,\
+    'getfedora-all.csv' using 1:13 title 'Fedora-26' with lines lw 4,\
+    'getfedora-all.csv' using 1:14 title 'Fedora-27' with lines lw 4,\
+    'getfedora-all.csv' using 1:15 title 'Fedora-28' with lines lw 4,\
+    'getfedora-all.csv' using 1:16 title 'Fedora-29' with lines lw 4,\
+    'getfedora-all.csv' using 1:17 title 'Unknown' with lines lw 4
+unset output
+
+##
+set output "getfedora-arch.png"
+set title "Daily Architectures Totals"
+plot ["2015-01-01":"2015-12-31"] \
+    'getfedora-all.csv' using 1:2 title 'Total number' with lines lw 4,\
+    'getfedora-all.csv' using 1:18 title 'arm_32' with lines lw 4,\
+    'getfedora-all.csv' using 1:19 title 'arm_64' with lines lw 4,\
+    'getfedora-all.csv' using 1:20 title 'ppc_le' with lines lw 4,\
+    'getfedora-all.csv' using 1:21 title 'ppc_he' with lines lw 4,\
+    'getfedora-all.csv' using 1:22 title 's390x' with lines lw 4,\
+    'getfedora-all.csv' using 1:23 title 'x86_32' with lines lw 4,\
+    'getfedora-all.csv' using 1:24 title 'x86_64' with lines lw 4,\
+    'getfedora-all.csv' using 1:25 title 'unknown' with lines lw 4
+unset output
+
diff --git a/roles/web-data-analysis/files/getfedora.awk b/roles/web-data-analysis/files/getfedora.awk
new file mode 100644
index 0000000000..607a12edfd
--- /dev/null
+++ b/roles/web-data-analysis/files/getfedora.awk
@@ -0,0 +1,74 @@
+#
+# Take the apache log line
+# 123.115.133.104 - - [01/Jan/2015:04:02:01 +0000] "GET /zh_CN/server/download/server-download-splash?file=http://download.fedoraproject.org/pub/fedora/linux/releases/21/Server/i386/iso/Fedora-Server-DVD-i386-21.iso HTTP/1.1" 200 4355 "https://getfedora.org/zh_CN/server/download/" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
+# Convert to
+# 2015-01-01 123.115.133.104 Fedora-Server-DVD-i386-21.iso
+
+function convertdate(str) {
+    gsub(/\[/, "", str)
+    split(str,a,":");
+    split(a[1],b,"/");
+    temp="";
+    switch (b[2]) {
+        case "Jan":
+            temp="01"
+            break;
+        case "Feb":
+            temp="02"
+            break;
+        case "Mar":
+            temp="03"
+            break;
+        case "Apr":
+            temp="04"
+            break;
+        case "May":
+            temp="05"
+            break;
+        case "Jun":
+            temp="06"
+            break;
+        case "Jul":
+            temp="07"
+            break;
+        case "Aug":
+            temp="08"
+            break;
+        case "Sep":
+            temp="09"
+            break;
+        case "Oct":
+            temp="10"
+            break;
+        case "Nov":
+            temp="11"
+            break;
+        case "Dec":
+            temp="12"
+            break;
+        default:
+            temp="00"
+            break;
+    }
+    # Only the date is returned; the time-of-day pieces in a[2..4] are unused.
+    return b[3]"-"temp"-"b[1]
+}
+
+function getimage(str) {
+    if (str ~/=/) {
+        split(str,a,"=");
+        x=split(a[2],b,"/");
+        return b[x]
+    } else {
+        x=split(str,b,"/");
+        return b[x]
+    }
+}
+
+$7 ~/\.qcow2$|\.iso$|\.raw\.xz$|\.box$/ && $6 ~/GET/ && $9 ~/302|200/ {
+    date = convertdate($4)
+    iso = getimage($7)
+    ip = $1
+    print date, ip, iso
+}
+
diff --git a/roles/web-data-analysis/tasks/main.yml b/roles/web-data-analysis/tasks/main.yml
index 1293b4eebb..422efb8c39 100644
--- a/roles/web-data-analysis/tasks/main.yml
+++ b/roles/web-data-analysis/tasks/main.yml
@@ -32,28 +32,32 @@
   tags:
   - web-data
 
-- name: script to run the daily mirror log condenser
-  copy: src=condense-mirrorlogs.sh dest=/usr/local/bin/ mode=0755
+- name: scripts to condense data down for further processing
+  copy: src={{item}} dest=/usr/local/bin/ mode=0755
+  with_items: [condense-mirrorlogs.sh, condense-getfedoralogs.sh]
   tags:
   - web-data
 
-- name: python script to calculate mirrorlist
+- name: python scripts to calculate various data
   copy: src=mirrorlist.py dest=/usr/local/bin mode=0755
   tags:
   - web-data
 
-- name: awk file for csv creation
-  copy: src=mirror-data.awk dest=/usr/local/share/web-data-analysis mode=0644
+- name: awk files for csv creation
+  copy: src={{item}} dest=/usr/local/share/web-data-analysis mode=0644
+  with_items: [mirror-data.awk, getfedora-data.awk, getfedora.awk]
   tags:
   - web-data
 
-- name: gnuplot file for csv creation
-  copy: src=mirror-data.gp dest=/usr/local/share/web-data-analysis mode=0644
+- name: gnuplot file for image creation
+  copy: src={{item}} dest=/usr/local/share/web-data-analysis mode=0644
+  with_items: [ mirror-data.gp, getfedora-data.gp ]
   tags:
   - web-data
 
 - name: daily cron file to run the log files
-  copy: src=condense-mirrorlogs.cron dest=/etc/cron.d/ mode=0644
+  copy: src={{item}} dest=/etc/cron.d/ mode=0644
+  with_items: [condense-mirrorlogs.cron, condense-getfedoralogs.cron]
   tags:
   - web-data
   - cron