we have a temporary fix for the hotspot data

2017-02-15 19:26:30 +00:00 · 2017-02-15 19:26:30 +00:00 · 568d06cd84
commit 568d06cd84
parent e1601ca869
5 changed files with 304 additions and 0 deletions
--- a/roles/web-data-analysis/files/condense-hotspot.sh
+++ b/roles/web-data-analysis/files/condense-hotspot.sh
@ -0,0 +1,88 @@
+#!/bin/bash
+
+# This file is part of Fedora Project Infrastructure Ansible
+# Repository.
+#
+# Fedora Project Infrastructure Ansible Repository is free software:
+# you can redistribute it and/or modify it under the terms of the GNU
+# General Public License as published by the Free Software Foundation,
+# either version 3 of the License, or (at your option) any later
+# version.
+#
+# Fedora Project Infrastructure Ansible Repository is distributed in
+# the hope that it will be useful, but WITHOUT ANY WARRANTY; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.  See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Fedora Project Infrastructure Ansible Repository.  If
+# not, see <http://www.gnu.org/licenses/>.
+
+# There is a multiday delay involved in processing the logs. It
+# may take up to 4 days to get the logs to the main-server. It may
+# take a day to combine all the logs onto combined-httpd. So we assume 
+# we are 5 days behind.
+
+NUMDAYS=5
+OLDDAYS=$(( NUMDAYS + 1 ))
+
+PROJECT=hotspot
+WEBLOG=fedoraproject.org
+
+# Year/month/day for N days ago. Taken from a single date call so the
+# three parts cannot disagree if the script runs across midnight.
+read -r YEAR MONTH DAY < <(/bin/date -d "-${NUMDAYS} days" '+%Y %m %d')
+
+# And we have to deal with year/month/day boundaries for our later grep.
+OLDDATE=$(/bin/date -d "-${OLDDAYS} days" +%Y-%m-%d)
+OLDYEAR=${OLDDATE%%-*}
+
+NFSDIR=/mnt/fedora_stats/combined-http
+TARGET=${NFSDIR}/${YEAR}/${MONTH}/${DAY}
+LOGFILE=${TARGET}/${WEBLOG}-access.log
+
+WORKDIR=/mnt/fedora_stats/data/${PROJECT}
+WORKFILE=${WORKDIR}/${YEAR}/${MONTH}/raw-${DAY}
+
+WEBDIR=/var/www/html/csv-reports/${PROJECT}
+
+# Scratch directory; the trap removes it however the script exits.
+TEMPDIR=$( mktemp -d /tmp/web-data-analysis.XXXXXXXXX )
+trap 'rm -rf -- "${TEMPDIR}"' EXIT
+
+LBIN=/usr/local/bin/
+LSHARE=/usr/local/share/web-data-analysis
+
+mkdir -p "${WORKDIR}/${YEAR}/${MONTH}"
+if [[ ! -f "${WORKDIR}/${YEAR}/out-${YEAR}-${MONTH}" ]]; then
+    touch "${WORKDIR}/${YEAR}/out-${YEAR}-${MONTH}"
+fi
+
+if [[ ! -f "${WORKDIR}/out-${YEAR}" ]]; then
+    touch "${WORKDIR}/out-${YEAR}"
+fi
+# Run the project's awk script over the day's access log, if it exists.
+if [[ -f "${LOGFILE}" ]]; then
+    awk -f "${LSHARE}/${PROJECT}.awk" "${LOGFILE}" > "${WORKFILE}"
+else
+    echo "No logfile found for ${YEAR}/${MONTH}/${DAY}. Please fix." >&2
+fi
+
+# So the data isn't strictly across month boundaries due to the end of
+# the logfiles being at 04:00 versus 23:59. Also log files might get
+# stuck and you end up with days or weeks of data in a single
+# file. Because the data is pretty small we can get away with adding up the data every day.
+
+# Sum counts per timestamp over every raw-* file; END emits the last one.
+find "${WORKDIR}" -type f -name 'raw-*' -exec cat {} + | sort -u \
+  | awk 'BEGIN{x=0; y=0}; {if (x != $1){ print x,y; x=$1; y=$2} else {y=y+$2}}; END{print x,y}' > "${WORKDIR}/worked-all"
+
+awk -f "${LSHARE}/${PROJECT}-data.awk" "${WORKDIR}/worked-all" | sort -u > "${WEBDIR}/${PROJECT}data-all.csv"
+
+# Make the seven day moving average file
+/usr/local/bin/hotspot-moving_avg.py "${WEBDIR}/${PROJECT}data-all.csv" > "${WEBDIR}/${PROJECT}data-all-7day-ma.csv"
+gnuplot "${LSHARE}/${PROJECT}.gp"
+
+# cleanup the temp data
+rm -rf -- "${TEMPDIR:?}"
--- a/roles/web-data-analysis/files/hotspot-data.awk
+++ b/roles/web-data-analysis/files/hotspot-data.awk
@ -0,0 +1,40 @@
+# Collapse "<epoch seconds> <count>" samples into one CSV row per day:
+# <date>,<mean count>,<lowest count>,<highest count>
+
+# Begin accumulating a new day, seeded with the current record's count.
+function start_day(day) {
+  date = day;
+  count = 1;
+  sum = $2;
+  most = $2;
+  least = $2;
+}
+
+BEGIN {
+  unset_day = strftime("%F", 0);   # sentinel: no real sample seen yet
+  date = unset_day;
+  count = 1;
+  sum = 0;
+  most = 0;
+  least = 0;
+}
+
+{
+  newdate = strftime("%F", $1);    # printable day of this sample
+  if (date == unset_day) {         # still on the sentinel: emit the header
+    print date ",AVG,LEAST,MAX";
+    start_day(newdate);
+  } else if (date != newdate) {    # day rolled over: report the finished day
+    print date "," int(sum/count) "," least "," most;
+    start_day(newdate);
+  } else {
+    count++;
+    sum += $2;
+    if ($2 > most) most = $2;
+    if ($2 < least) least = $2;
+  }
+}
+
+END {
+  print date "," int(sum/count) "," least "," most;
+}
--- a/roles/web-data-analysis/files/hotspot-moving_avg.py
+++ b/roles/web-data-analysis/files/hotspot-moving_avg.py
@ -0,0 +1,56 @@
+#!/usr/bin/python
+
+# This file is part of Fedora Project Infrastructure Ansible
+# Repository.
+#
+# Fedora Project Infrastructure Ansible Repository is free software:
+# you can redistribute it and/or modify it under the terms of the GNU
+# General Public License as published by the Free Software Foundation,
+# either version 3 of the License, or (at your option) any later
+# version.
+#
+# Fedora Project Infrastructure Ansible Repository is distributed in
+# the hope that it will be useful, but WITHOUT ANY WARRANTY; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.  See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Fedora Project Infrastructure Ansible Repository.  If
+# not, see <http://www.gnu.org/licenses/>.
+
+# This is a complete horrible hack to get something done. Patches are
+# really welcome.
+
+import math
+import sys
+
+import pandas
+
+# Moving Average
+#
+# Reads the daily CSV (first argument, default "hotspot-new.csv") and
+# prints the trailing mean of each value column as CSV on stdout.
+
+rolling = 7
+
+def _rolling_mean(series):
+    # pandas.rolling_mean() was removed from newer pandas releases.
+    if hasattr(series, "rolling"):
+        return series.rolling(rolling).mean()
+    return pandas.rolling_mean(series, rolling)
+
+df = pandas.read_csv(sys.argv[1] if len(sys.argv) > 1 else "hotspot-new.csv")
+
+dates = df['1970-01-01']
+AVG   = _rolling_mean(df['AVG'])
+LEAST = _rolling_mean(df['LEAST'])
+MAX   = _rolling_mean(df['MAX'])
+
+for i in range(len(dates)):
+    if math.isnan(MAX[i]):
+        # Window not full yet: emit a zero for each of the three columns.
+        fields = ["0", "0", "0"]
+    else:
+        fields = [str(int(AVG[i])), str(int(LEAST[i])), str(int(MAX[i]))]
+    print(",".join([dates[i]] + fields))
--- a/roles/web-data-analysis/files/hotspot.awk
+++ b/roles/web-data-analysis/files/hotspot.awk
@ -0,0 +1,95 @@
+#
+# Take the apache log line
+# 83.163.161.147 - - [30/Sep/2012:13:54:19 +0000] "GET /static/hotspot.txt HTTP/1.1" 200 3 "-" "dnssec-trigger/0.11"
+# Convert to
+# 1349013000 1
+
+# Turn an Apache "[dd/Mon/yyyy:HH:MM:SS" field into a 5-minute bucket
+# number (mktime() seconds / 300). Names after "str" are awk locals.
+function convertdate(str,    parts, dmy, month, spec) {
+  gsub(/\[/, "", str)
+  gsub(/\]/, "", str)
+  split(str,parts,":");
+  split(parts[1],dmy,"/");
+  switch (dmy[2]) {
+  case "Jan":
+    month="01"
+    break;
+  case "Feb":
+    month="02"
+    break;
+  case "Mar":
+    month="03"
+    break;
+  case "Apr":
+    month="04"
+    break;
+  case "May":
+    month="05"
+    break;
+  case "Jun":
+    month="06"
+    break;
+  case "Jul":
+    month="07"
+    break;
+  case "Aug":
+    month="08"
+    break;
+  case "Sep":
+    month="09"
+    break;
+  case "Oct":
+    month="10"
+    break;
+  case "Nov":
+    month="11"
+    break;
+  case "Dec":
+    month="12"
+    break;
+  default:
+    month="00"
+    break;
+  }
+  spec=dmy[3]" "month" "dmy[1]" "parts[2]" "parts[3] " "parts[4]
+  return int(mktime(spec)/300) # 300 seconds make 5 minutes (I NEED A GLOBAL VAR)
+}
+
+
+BEGIN{
+  timestamp=0;        # 5-minute bucket currently being counted (0 = none yet)
+  num_ts = 0;         # number of buckets flushed so far
+  ts_hotspots=0;      # matching requests seen in the current bucket
+  total_hotspots=0;   # same name the main rule increments
+}
+
+#
+# We assume that every 300 seconds a system will log in at least 1
+# time because the Networkmanager addon does so.
+# Convert our date stamp to the nearest 5 minute block and add data to
+# it. If the log file goes backwards or jumps etc this will mean
+# multiple outputs for a timestamp. A later process will need to deal
+# with that. All this will do is output how many it saw at that block
+# in the log file.
+#
+
+$6 ~/GET/ && $7 ~/hotspot.txt/ {
+  date = convertdate($4)
+  if (timestamp == date) {            # same bucket: keep counting
+    ts_hotspots++;
+    total_hotspots++;
+  } else {                            # new bucket: flush the previous one
+    num_ts++;
+    print (timestamp*300),ts_hotspots # GLOBAL VAR GOES HERE
+    timestamp = date;
+    ts_hotspots = 1;
+  }
+}
+
+END {
+  num_ts++;
+  print int(timestamp*300),ts_hotspots # flush the bucket still open at end of input (GLOBAL VAR AGAIN)
+}
+
+## END OF FILE
--- a/roles/web-data-analysis/files/hotspot.gp
+++ b/roles/web-data-analysis/files/hotspot.gp
@ -0,0 +1,25 @@
+set grid
+set xdata time
+set format x "%Y-%m-%d"
+set timefmt "%Y-%m-%d"
+
+set datafile separator ","
+set term png size 1600,1200
+
+## Daily average / least / max per 5-minute bucket
+set output "hotspot-all.png"
+set title "IPs grabbing hotspot per day"
+plot ["2014-12-01":"2017-12-31"] \
+     '/var/www/html/csv-reports/hotspot/hotspotdata-all.csv' using 1:2 title 'Average every 5min' with lines lw 4, \
+     '/var/www/html/csv-reports/hotspot/hotspotdata-all.csv' using 1:3 title 'Least 5min' with lines lw 4, \
+     '/var/www/html/csv-reports/hotspot/hotspotdata-all.csv' using 1:4 title 'Max 5min' with lines lw 4
+unset output
+
+## Seven day moving average; file name matches what condense-hotspot.sh writes
+set output "hotspot-all-ma.png"
+set title "Moving Average of IPs grabbing hotspot"
+plot ["2014-12-01":"2017-12-31"] \
+     '/var/www/html/csv-reports/hotspot/hotspotdata-all-7day-ma.csv' using 1:2 title 'Average every 5min' with lines lw 4, \
+     '/var/www/html/csv-reports/hotspot/hotspotdata-all-7day-ma.csv' using 1:3 title 'Least 5min' with lines lw 4, \
+     '/var/www/html/csv-reports/hotspot/hotspotdata-all-7day-ma.csv' using 1:4 title 'Max 5min' with lines lw 4
+unset output