We have a temporary fix for the hotspot data.
This commit is contained in:
parent
e1601ca869
commit
568d06cd84
5 changed files with 304 additions and 0 deletions
88
roles/web-data-analysis/files/condense-hotspot.sh
Normal file
88
roles/web-data-analysis/files/condense-hotspot.sh
Normal file
|
@ -0,0 +1,88 @@
|
||||||
|
#!/bin/bash

# This file is part of Fedora Project Infrastructure Ansible
# Repository.
#
# Fedora Project Infrastructure Ansible Repository is free software:
# you can redistribute it and/or modify it under the terms of the GNU
# General Public License as published by the Free Software Foundation,
# either version 3 of the License, or (at your option) any later
# version.
#
# Fedora Project Infrastructure Ansible Repository is distributed in
# the hope that it will be useful, but WITHOUT ANY WARRANTY; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License
# along with Fedora Project Infrastructure Ansible Repository.  If
# not, see <http://www.gnu.org/licenses/>.

# There is a multiday delay involved in processing the logs. It
# may take up to 4 days to get the logs to the main-server. It may
# take a day to combine all the logs onto combined-httpd. So we assume
# we are 5 days behind.

let NUMDAYS=5
let OLDDAYS=$(( $NUMDAYS+1 ))

PROJECT=hotspot
WEBLOG=fedoraproject.org

# This is the year/month/day for a N days ago.
YEAR=$(/bin/date -d "-${NUMDAYS} days" +%Y)
MONTH=$(/bin/date -d "-${NUMDAYS} days" +%m)
DAY=$(/bin/date -d "-${NUMDAYS} days" +%d)

# And we have to deal with year/month/day boundaries for our later grep.
OLDDATE=$(/bin/date -d "-${OLDDAYS} days" +%Y-%m-%d)
OLDYEAR=$(/bin/date -d "-${OLDDAYS} days" +%Y)

NFSDIR=/mnt/fedora_stats/combined-http
TARGET=${NFSDIR}/${YEAR}/${MONTH}/${DAY}

LOGFILE=${TARGET}/${WEBLOG}-access.log

WORKDIR=/mnt/fedora_stats/data/${PROJECT}
WORKFILE=${WORKDIR}/${YEAR}/${MONTH}/raw-${DAY}

WEBDIR=/var/www/html/csv-reports/${PROJECT}

TEMPDIR=$( mktemp -d /tmp/web-data-analysis.XXXXXXXXX )

LBIN=/usr/local/bin/
LSHARE=/usr/local/share/web-data-analysis

# Make sure the per-month work tree and the roll-up files exist before
# we start appending to them.
mkdir -p ${WORKDIR}/${YEAR}/${MONTH}
if [[ ! -f ${WORKDIR}/${YEAR}/out-${YEAR}-${MONTH} ]]; then
    touch ${WORKDIR}/${YEAR}/out-${YEAR}-${MONTH}
fi

if [[ ! -f ${WORKDIR}/out-${YEAR} ]]; then
    touch ${WORKDIR}/out-${YEAR}
fi

if [[ ! -f ${LOGFILE} ]]; then
    echo "No logfile found for ${YEAR}/${MONTH}/${DAY}. Please fix."
else
    awk -f ${LSHARE}/${PROJECT}.awk ${LOGFILE} > ${WORKFILE}
fi

# So the data isn't strictly across month boundries due to the end of
# the logfiles being at 04:00 versus 23:59. Also log files might get
# stuck and you end up with days or weeks of data in a single
# file. Because the data is pretty small we can get away with adding up the data every day.

# BUGFIX: the original referenced ${WORKEDIR} (undefined) here and
# below; the variable defined above is WORKDIR, so find searched the
# wrong path and the output landed outside the work tree.
find ${WORKDIR} -type f | grep raw- | xargs cat | sort -u | awk 'BEGIN{x=0; y=0}; {if (x != $1){ print x,y; x=$1; y=$2} else {y=y+$2}}' > ${WORKDIR}/worked-all

awk -f ${LSHARE}/${PROJECT}-data.awk ${WORKDIR}/worked-all | sort -u > ${WEBDIR}/${PROJECT}data-all.csv

# Make the seven day moving average file
# NOTE(review): hotspot-moving_avg.py reads "hotspot-new.csv" from the
# current directory, not ${WEBDIR}/${PROJECT}data-all.csv — confirm the
# intended input file.
/usr/local/bin/hotspot-moving_avg.py > ${WEBDIR}/${PROJECT}data-all-7day-ma.csv

gnuplot ${LSHARE}/${PROJECT}.gp

# cleanup the temp data
rm -rf ${TEMPDIR}
|
40
roles/web-data-analysis/files/hotspot-data.awk
Normal file
40
roles/web-data-analysis/files/hotspot-data.awk
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
# Condense "timestamp count" pairs (one per 5-minute bucket, sorted by
# time) into one CSV row per calendar day:
#   date,avg-per-bucket,least-bucket,max-bucket
# The very first record is detected by checking whether `date` still
# holds the epoch date from BEGIN; in that case the CSV header row is
# printed before normal accumulation starts.
BEGIN{
    date=strftime("%F",0);   # epoch date — sentinel meaning "no day seen yet"
    count=1;
    sum=0;
    most=0;
    least=0
}

{
    newdate=strftime("%F",$1); # convert this timestamp to a printable date
    if (date == strftime("%F",0)){ # first record: emit header, start first day
        print date ",AVG,LEAST,MAX"
        date=newdate;
        count=1;   # reset bucket count to 1; should be 288/day but logs are stupid
        sum=$2;    # start the sum
        most=$2;   # running max for this day
        least=$2;  # running min for this day
    } else {
        if (date != newdate){
            # Day rolled over: flush the finished day, then restart the
            # accumulators for the new one.
            print date "," int(sum/count) "," least "," most;
            date=newdate;
            count=1;   # reset bucket count to 1; should be 288/day but logs are stupid
            sum=$2;    # start the sum
            most=$2;   # running max for this day
            least=$2;  # running min for this day
        } else {
            # Same day: accumulate.
            count=count+1;
            sum=sum+$2;
            if ($2 > most){
                most=$2;
            };
            if ($2 < least) {
                least=$2;
            }
        }
    }
}
# Flush the last (still open) day.
END{
    print date "," int(sum/count) "," least "," most;
}
|
56
roles/web-data-analysis/files/hotspot-moving_avg.py
Normal file
56
roles/web-data-analysis/files/hotspot-moving_avg.py
Normal file
|
@ -0,0 +1,56 @@
|
||||||
|
#!/usr/bin/python
|
||||||
|
|
||||||
|
# This file is part of Fedora Project Infrastructure Ansible
|
||||||
|
# Repository.
|
||||||
|
#
|
||||||
|
# Fedora Project Infrastructure Ansible Repository is free software:
|
||||||
|
# you can redistribute it and/or modify it under the terms of the GNU
|
||||||
|
# General Public License as published by the Free Software Foundation,
|
||||||
|
# either version 3 of the License, or (at your option) any later
|
||||||
|
# version.
|
||||||
|
#
|
||||||
|
# Fedora Project Infrastructure Ansible Repository is distributed in
|
||||||
|
# the hope that it will be useful, but WITHOUT ANY WARRANTY; without
|
||||||
|
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||||
|
# PARTICULAR PURPOSE. See the GNU General Public License for more
|
||||||
|
# details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with Fedora Project Infrastructure Ansible Repository. If
|
||||||
|
# not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
# This is a complete horrible hack to get something done. Patches are
|
||||||
|
# really welcome.
|
||||||
|
|
||||||
|
import pandas
|
||||||
|
#import matplotlib.pyplot as plt
|
||||||
|
import math
|
||||||
|
|
||||||
|
# Moving Average
|
||||||
|
|
||||||
|
import pandas
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import math
|
||||||
|
|
||||||
|
rolling = 7
|
||||||
|
|
||||||
|
tree = {}
|
||||||
|
|
||||||
|
df = pandas.read_csv("hotspot-new.csv")
|
||||||
|
|
||||||
|
dates = df['1970-01-01']
|
||||||
|
AVG = pandas.rolling_mean(df['AVG'],rolling)
|
||||||
|
LEAST = pandas.rolling_mean(df['LEAST'],rolling)
|
||||||
|
MAX = pandas.rolling_mean(df['MAX'],rolling)
|
||||||
|
|
||||||
|
|
||||||
|
for i in xrange(0,len(dates)):
|
||||||
|
if math.isnan(MAX[i]):
|
||||||
|
csv_line = ",".join([dates[i],"0","0"])
|
||||||
|
else:
|
||||||
|
csv_line = ",".join([dates[i],
|
||||||
|
str(int(AVG[i])),
|
||||||
|
str(int(LEAST[i])),
|
||||||
|
str(int(MAX[i])),
|
||||||
|
])
|
||||||
|
print csv_line
|
95
roles/web-data-analysis/files/hotspot.awk
Normal file
95
roles/web-data-analysis/files/hotspot.awk
Normal file
|
@ -0,0 +1,95 @@
|
||||||
|
#
# Take the apache log line
# 83.163.161.147 - - [30/Sep/2012:13:54:19 +0000] "GET /static/hotspot.txt HTTP/1.1" 200 3 "-" "dnssec-trigger/0.11"
# Convert to
# 1349013000 1
#

# Convert an apache timestamp like [30/Sep/2012:13:54:19 to the number
# of the 5-minute bucket it falls in (epoch seconds / 300).
# Uses gawk's switch statement and mktime().
function convertdate(str) {
    gsub(/\[/, "", str)
    gsub(/\]/, "", str)
    split(str,a,":");
    split(a[1],b,"/");
    temp="";
    switch (b[2]) {
        case "Jan":
            temp="01"
            break;
        case "Feb":
            temp="02"
            break;
        case "Mar":
            temp="03"
            break;
        case "Apr":
            temp="04"
            break;
        case "May":
            temp="05"
            break;
        case "Jun":
            temp="06"
            break;
        case "Jul":
            temp="07"
            break;
        case "Aug":
            temp="08"
            break;
        case "Sep":
            temp="09"
            break;
        case "Oct":
            temp="10"
            break;
        case "Nov":
            temp="11"
            break;
        case "Dec":
            temp="12"
            break;
        default:
            temp="00"
            break;
    }
    x=b[3]" "temp" "b[1]" "a[2]" "a[3] " "a[4]
    y=int(mktime(x)/300) # 300 seconds make 5 minutes (I NEED A GLOBAL VAR)
    return y
}


BEGIN{
    timestamp=0;
    num_ts = 0;
    ts_hotspots=0;
    # BUGFIX: was misspelled "total_hotsponts", leaving the counter the
    # body increments (total_hotspots) uninitialized at start.
    total_hotspots=0;
}

#
# We assume that every 300 seconds a system will log in at least 1
# time because the Networkmanager addon does so.
# Convert our date stamp to the nearest 5 minute block and add data to
# it. If the log file goes backwards or jumps etc this will mean
# multiple outputs for a timestamp. A later process will need to deal
# with that. All this will do is output how many it saw at that block
# in the log file.
#

$7 ~/hotspot.txt/ && $6 ~/GET/ {
    date = convertdate($4)
    if (timestamp != date) {
        # New 5-minute bucket: flush the finished one and start counting.
        num_ts = num_ts +1;
        print (timestamp*300),ts_hotspots # GLOBAL VAR GOES HERE
        timestamp = date;
        ts_hotspots = 1;
    } else {
        ts_hotspots = ts_hotspots +1;
        total_hotspots = total_hotspots +1;
    }
}

# Flush the last (still open) bucket.
END {
    num_ts = num_ts +1;
    print int(timestamp*300),ts_hotspots # LOOK GLOBAL VAR AGAIN
}

## END OF FILE
|
25
roles/web-data-analysis/files/hotspot.gp
Normal file
25
roles/web-data-analysis/files/hotspot.gp
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
set grid
set xdata time
set format x "%Y-%m-%d"
set timefmt "%Y-%m-%d"

set datafile separator ","
set term png size 1600,1200

## Daily min/avg/max of 5-minute hotspot counts.
set output "hotspot-all.png"
set title "IPs grabbing hotspot per day"
plot ["2014-12-01":"2017-12-31"] \
     '/var/www/html/csv-reports/hotspot/hotspotdata-all.csv' using 1:2 title 'Average every 5min' with lines lw 4, \
     '/var/www/html/csv-reports/hotspot/hotspotdata-all.csv' using 1:3 title 'Least 5min' with lines lw 4, \
     '/var/www/html/csv-reports/hotspot/hotspotdata-all.csv' using 1:4 title 'Max 5min' with lines lw 4
unset output

## Seven-day moving average of the same series.
## BUGFIX: the shell script writes hotspotdata-all-7day-ma.csv; the
## original plotted "hotspotdatadata-..." (doubled "data"), a file that
## is never produced.
set output "hotspot-all-ma.png"
set title "Moving Average of IPs grabbing hotspot"
plot ["2014-12-01":"2017-12-31"] \
     '/var/www/html/csv-reports/hotspot/hotspotdata-all-7day-ma.csv' using 1:2 title 'Average every 5min' with lines lw 4, \
     '/var/www/html/csv-reports/hotspot/hotspotdata-all-7day-ma.csv' using 1:3 title 'Least 5min' with lines lw 4, \
     '/var/www/html/csv-reports/hotspot/hotspotdata-all-7day-ma.csv' using 1:4 title 'Max 5min' with lines lw 4
unset output
|
Loading…
Add table
Add a link
Reference in a new issue