From 568d06cd84cf25bdee45759f76b2e16f8d5067a7 Mon Sep 17 00:00:00 2001
From: Stephen Smoogen
Date: Wed, 15 Feb 2017 19:26:30 +0000
Subject: [PATCH] we have a temporary fix for the hotspot data

---
 .../files/condense-hotspot.sh                | 88 +++++++++++++++++
 .../web-data-analysis/files/hotspot-data.awk | 40 ++++++++
 .../files/hotspot-moving_avg.py              | 56 +++++++++++
 roles/web-data-analysis/files/hotspot.awk    | 95 +++++++++++++++++++
 roles/web-data-analysis/files/hotspot.gp     | 25 +++++
 5 files changed, 304 insertions(+)
 create mode 100644 roles/web-data-analysis/files/condense-hotspot.sh
 create mode 100644 roles/web-data-analysis/files/hotspot-data.awk
 create mode 100644 roles/web-data-analysis/files/hotspot-moving_avg.py
 create mode 100644 roles/web-data-analysis/files/hotspot.awk
 create mode 100644 roles/web-data-analysis/files/hotspot.gp

diff --git a/roles/web-data-analysis/files/condense-hotspot.sh b/roles/web-data-analysis/files/condense-hotspot.sh
new file mode 100644
index 0000000000..17f3fcb2a7
--- /dev/null
+++ b/roles/web-data-analysis/files/condense-hotspot.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+
+# This file is part of Fedora Project Infrastructure Ansible
+# Repository.
+#
+# Fedora Project Infrastructure Ansible Repository is free software:
+# you can redistribute it and/or modify it under the terms of the GNU
+# General Public License as published by the Free Software Foundation,
+# either version 3 of the License, or (at your option) any later
+# version.
+#
+# Fedora Project Infrastructure Ansible Repository is distributed in
+# the hope that it will be useful, but WITHOUT ANY WARRANTY; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Fedora Project Infrastructure Ansible Repository. If
+# not, see <http://www.gnu.org/licenses/>.
+
+# There is a multiday delay involved in processing the logs. It
+# may take up to 4 days to get the logs to the main-server. It may
+# take a day to combine all the logs onto combined-httpd. So we assume
+# we are 5 days behind.
+
+let NUMDAYS=5
+let OLDDAYS=$(( $NUMDAYS+1 ))
+
+PROJECT=hotspot
+WEBLOG=fedoraproject.org
+
+# This is the year/month/day for N days ago.
+YEAR=$(/bin/date -d "-${NUMDAYS} days" +%Y)
+MONTH=$(/bin/date -d "-${NUMDAYS} days" +%m)
+DAY=$(/bin/date -d "-${NUMDAYS} days" +%d)
+
+# And we have to deal with year/month/day boundaries for our later grep.
+OLDDATE=$(/bin/date -d "-${OLDDAYS} days" +%Y-%m-%d)
+OLDYEAR=$(/bin/date -d "-${OLDDAYS} days" +%Y)
+
+NFSDIR=/mnt/fedora_stats/combined-http
+TARGET=${NFSDIR}/${YEAR}/${MONTH}/${DAY}
+
+LOGFILE=${TARGET}/${WEBLOG}-access.log
+
+WORKDIR=/mnt/fedora_stats/data/${PROJECT}
+WORKFILE=${WORKDIR}/${YEAR}/${MONTH}/raw-${DAY}
+
+WEBDIR=/var/www/html/csv-reports/${PROJECT}
+
+TEMPDIR=$( mktemp -d /tmp/web-data-analysis.XXXXXXXXX )
+
+LBIN=/usr/local/bin/
+LSHARE=/usr/local/share/web-data-analysis
+
+mkdir -p ${WORKDIR}/${YEAR}/${MONTH}
+if [[ ! -f ${WORKDIR}/${YEAR}/out-${YEAR}-${MONTH} ]]; then
+    touch ${WORKDIR}/${YEAR}/out-${YEAR}-${MONTH}
+fi
+
+if [[ ! -f ${WORKDIR}/out-${YEAR} ]]; then
+    touch ${WORKDIR}/out-${YEAR}
+fi
+
+if [[ ! -f ${LOGFILE} ]]; then
+    echo "No logfile found for ${YEAR}/${MONTH}/${DAY}. Please fix."
+else
+    awk -f ${LSHARE}/${PROJECT}.awk ${LOGFILE} > ${WORKFILE}
+fi
+
+# The data isn't strictly split on month boundaries because the
+# logfiles end at 04:00 versus 23:59. Also, log files might get
+# stuck and you end up with days or weeks of data in a single
+# file. Because the data is pretty small, we can get away with adding
+# up the data every day.
+
+find ${WORKDIR} -type f | grep raw- | xargs cat | sort -u | awk 'BEGIN{x=0; y=0}; {if (x != $1){ print x,y; x=$1; y=$2} else {y=y+$2}} END{print x,y}' > ${WORKDIR}/worked-all
+
+
+awk -f ${LSHARE}/${PROJECT}-data.awk ${WORKDIR}/worked-all | sort -u > ${WEBDIR}/${PROJECT}data-all.csv
+
+# Make the seven day moving average file
+/usr/local/bin/hotspot-moving_avg.py > ${WEBDIR}/${PROJECT}data-all-7day-ma.csv
+
+gnuplot ${LSHARE}/${PROJECT}.gp
+
+# cleanup the temp data
+rm -rf ${TEMPDIR}
diff --git a/roles/web-data-analysis/files/hotspot-data.awk b/roles/web-data-analysis/files/hotspot-data.awk
new file mode 100644
index 0000000000..496def828e
--- /dev/null
+++ b/roles/web-data-analysis/files/hotspot-data.awk
@@ -0,0 +1,40 @@
+BEGIN{
+    date=strftime("%F",0);
+    count=1;
+    sum=0;
+    most=0;
+    least=0
+}
+
+{
+    newdate=strftime("%F",$1); # convert this to a printable date
+    if (date == strftime("%F",0)){ # still on the epoch seed date, so emit the CSV header row (the later sort -u drops duplicates)
+        print date ",AVG,LEAST,MAX"
+        date=newdate;
+        count=1;   # reset the count to 1. we should have 288 entries per day but logs are stupid
+        sum=$2;    # start the sum
+        most=$2;   # what is going to be our most per day
+        least=$2;  # what is going to be our least per day
+    } else {
+        if (date != newdate){
+            print date "," int(sum/count) "," least "," most;
+            date=newdate;
+            count=1;   # reset the count to 1. we should have 288 entries per day but logs are stupid
+            sum=$2;    # start the sum
+            most=$2;   # what is going to be our most per day
+            least=$2;  # what is going to be our least per day
+        } else {
+            count=count+1;
+            sum=sum+$2;
+            if ($2 > most){
+                most=$2;
+            };
+            if ($2 < least) {
+                least=$2;
+            }
+        }
+    }
+}
+END{
+    print date "," int(sum/count) "," least "," most;
+}
diff --git a/roles/web-data-analysis/files/hotspot-moving_avg.py b/roles/web-data-analysis/files/hotspot-moving_avg.py
new file mode 100644
index 0000000000..1489df62c9
--- /dev/null
+++ b/roles/web-data-analysis/files/hotspot-moving_avg.py
@@ -0,0 +1,56 @@
+#!/usr/bin/python
+
+# This file is part of Fedora Project Infrastructure Ansible
+# Repository.
+#
+# Fedora Project Infrastructure Ansible Repository is free software:
+# you can redistribute it and/or modify it under the terms of the GNU
+# General Public License as published by the Free Software Foundation,
+# either version 3 of the License, or (at your option) any later
+# version.
+#
+# Fedora Project Infrastructure Ansible Repository is distributed in
+# the hope that it will be useful, but WITHOUT ANY WARRANTY; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Fedora Project Infrastructure Ansible Repository. If
+# not, see <http://www.gnu.org/licenses/>.
+
+# This is a complete horrible hack to get something done. Patches are
+# really welcome.
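+#
+# The script reads a CSV in the format written by hotspot-data.awk (header
+# row "1970-01-01,AVG,LEAST,MAX") from the hard-coded path hotspot-new.csv
+# in the current directory, and prints a 7-day rolling mean of the
+# AVG/LEAST/MAX columns as CSV on stdout.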
+
+import math
+
+import pandas
+
+# Moving Average
+
+rolling = 7
+
+df = pandas.read_csv("hotspot-new.csv")
+
+# hotspot-data.awk writes the header row "1970-01-01,AVG,LEAST,MAX", so
+# the date column comes out named "1970-01-01".
+dates = df['1970-01-01']
+AVG = pandas.rolling_mean(df['AVG'],rolling)
+LEAST = pandas.rolling_mean(df['LEAST'],rolling)
+MAX = pandas.rolling_mean(df['MAX'],rolling)
+
+
+for i in xrange(0,len(dates)):
+    if math.isnan(MAX[i]):
+        # the first (rolling - 1) rows have no 7-day mean yet
+        csv_line = ",".join([dates[i],"0","0","0"])
+    else:
+        csv_line = ",".join([dates[i],
+                             str(int(AVG[i])),
+                             str(int(LEAST[i])),
+                             str(int(MAX[i])),
+                             ])
+    print csv_line
diff --git a/roles/web-data-analysis/files/hotspot.awk b/roles/web-data-analysis/files/hotspot.awk
new file mode 100644
index 0000000000..f47da6958e
--- /dev/null
+++ b/roles/web-data-analysis/files/hotspot.awk
@@ -0,0 +1,95 @@
+#
+# Take the apache log line
+#   83.163.161.147 - - [30/Sep/2012:13:54:19 +0000] "GET /static/hotspot.txt HTTP/1.1" 200 3 "-" "dnssec-trigger/0.11"
+# and convert it to
+#   1349013000 1
+
+function convertdate(str) {
+    gsub(/\[/, "", str)
+    gsub(/\]/, "", str)
+    split(str,a,":");
+    split(a[1],b,"/");
+    temp="";
+    switch (b[2]) {
+        case "Jan":
+            temp="01"
+            break;
+        case "Feb":
+            temp="02"
+            break;
+        case "Mar":
+            temp="03"
+            break;
+        case "Apr":
+            temp="04"
+            break;
+        case "May":
+            temp="05"
+            break;
+        case "Jun":
+            temp="06"
+            break;
+        case "Jul":
+            temp="07"
+            break;
+        case "Aug":
+            temp="08"
+            break;
+        case "Sep":
+            temp="09"
+            break;
+        case "Oct":
+            temp="10"
+            break;
+        case "Nov":
+            temp="11"
+            break;
+        case "Dec":
+            temp="12"
+            break;
+        default:
+            temp="00"
+            break;
+    }
+    x=b[3]" "temp" "b[1]" "a[2]" "a[3]" "a[4]
+    y=int(mktime(x)/300) # 300 seconds make 5 minutes (I NEED A GLOBAL VAR)
+    return y
+}
+
+
+BEGIN{
+    timestamp=0;
+    num_ts = 0;
+    ts_hotspots=0;
+    total_hotspots=0;
+}
+
+#
+# We assume that every 300 seconds a system will log in at least 1
+# time because the NetworkManager addon does so.
+# Convert our date stamp to the nearest 5 minute block and add data to
+# it. If the log file goes backwards or jumps etc. this will mean
+# multiple outputs for a timestamp. A later process will need to deal
+# with that. All this does is output how many hits it saw at that block
+# in the log file.
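+#
+# For example, the sample request in the header (30/Sep/2012:13:54:19) is
+# epoch 1349013259 when the server clock is UTC (mktime() works in local
+# time); int(1349013259/300) = 4496710, and 4496710*300 = 1349013000, which
+# is the bucket printed in the sample output above.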
+#
+
+$7 ~/hotspot.txt/ && $6 ~/GET/ {
+    date = convertdate($4)
+    if (timestamp != date) {
+        num_ts = num_ts +1;
+        print (timestamp*300),ts_hotspots # 300 seconds per bucket (the global var would go here)
+        timestamp = date;
+        ts_hotspots = 1;
+    } else {
+        ts_hotspots = ts_hotspots +1;
+        total_hotspots = total_hotspots +1;
+    }
+}
+
+END {
+    num_ts = num_ts +1;
+    print int(timestamp*300),ts_hotspots # 300 seconds per bucket again
+}
+
+## END OF FILE
diff --git a/roles/web-data-analysis/files/hotspot.gp b/roles/web-data-analysis/files/hotspot.gp
new file mode 100644
index 0000000000..2b4d8c9c5c
--- /dev/null
+++ b/roles/web-data-analysis/files/hotspot.gp
@@ -0,0 +1,25 @@
+set grid
+set xdata time
+set format x "%Y-%m-%d"
+set timefmt "%Y-%m-%d"
+
+set datafile separator ","
+set term png size 1600,1200
+
+##
+set output "hotspot-all.png"
+set title "IPs grabbing hotspot per day"
+plot ["2014-12-01":"2017-12-31"] \
+     '/var/www/html/csv-reports/hotspot/hotspotdata-all.csv' using 1:2 title 'Average every 5min' with lines lw 4, \
+     '/var/www/html/csv-reports/hotspot/hotspotdata-all.csv' using 1:3 title 'Least 5min' with lines lw 4, \
+     '/var/www/html/csv-reports/hotspot/hotspotdata-all.csv' using 1:4 title 'Max 5min' with lines lw 4
+unset output
+
+##
+set output "hotspot-all-ma.png"
+set title "Moving Average of IPs grabbing hotspot"
+plot ["2014-12-01":"2017-12-31"] \
+     '/var/www/html/csv-reports/hotspot/hotspotdata-all-7day-ma.csv' using 1:2 title 'Average every 5min' with lines lw 4, \
+     '/var/www/html/csv-reports/hotspot/hotspotdata-all-7day-ma.csv' using 1:3 title 'Least 5min' with lines lw 4, \
+     '/var/www/html/csv-reports/hotspot/hotspotdata-all-7day-ma.csv' using 1:4 title 'Max 5min' with lines lw 4
+unset output
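+
+# Both plots read the CSVs that condense-hotspot.sh writes into
+# /var/www/html/csv-reports/hotspot/; the PNGs land in whatever directory
+# gnuplot is invoked from, since "set output" uses a relative filename.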