#!/bin/bash

# This file is part of Fedora Project Infrastructure Ansible
# Repository.
#
# Fedora Project Infrastructure Ansible Repository is free software:
# you can redistribute it and/or modify it under the terms of the GNU
# General Public License as published by the Free Software Foundation,
# either version 3 of the License, or (at your option) any later
# version.
#
# Fedora Project Infrastructure Ansible Repository is distributed in
# the hope that it will be useful, but WITHOUT ANY WARRANTY; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License
# along with Fedora Project Infrastructure Ansible Repository. If
# not, see <http://www.gnu.org/licenses/>.

# There is a multiday delay involved in processing the logs. It
# may take up to 4 days to get the logs to the main-server. It may
# take a day to combine all the logs onto combined-httpd. So we assume
# we are 5 days behind.
NUMDAYS=5
# One day further back than NUMDAYS; used for the OLDDATE/OLDYEAR
# values below.
OLDDAYS=$(( NUMDAYS + 1 ))

# Project name; it selects the web log, the work directory and the
# report directory below.
PROJECT=mirrors
WEBLOG=${PROJECT}.fedoraproject.org

# This is the year/month/day for N days ago. A single date call feeds
# all three fields so they always describe the same calendar day, even
# if the script happens to start right at midnight.
IFS=- read -r YEAR MONTH DAY < <(/bin/date -d "-${NUMDAYS} days" +%Y-%m-%d)

# And we have to deal with year/month/day boundaries for our later grep.
# OLDYEAR is taken from OLDDATE so the two can never disagree.
OLDDATE=$(/bin/date -d "-${OLDDAYS} days" +%Y-%m-%d)
OLDYEAR=${OLDDATE%%-*}

# Input: the combined access log for the day being processed.
NFSDIR=/mnt/fedora_stats/combined-http
TARGET=${NFSDIR}/${YEAR}/${MONTH}/${DAY}
LOGFILE=${TARGET}/${WEBLOG}-access.log

# Output: per-day work file and the directory holding the CSV reports.
WORKDIR=/mnt/fedora_stats/data/${PROJECT}
WORKFILE=${WORKDIR}/${YEAR}/${MONTH}/worked-${DAY}
WEBDIR=/var/www/html/csv-reports/${PROJECT}

# Private scratch directory for intermediate data. Abort if it cannot
# be created: with an empty TEMPDIR later redirections would resolve to
# paths directly under "/".
TEMPDIR=$(mktemp -d /tmp/web-data-analysis.XXXXXXXXX) || {
    echo "Unable to create a temporary working directory." >&2
    exit 1
}
# Remove the scratch directory on every exit path, not only on a clean
# run to the end of the script.
trap 'rm -rf -- "${TEMPDIR}"' EXIT

# Locations of the helper program (mirrorlist.py) and of the awk and
# gnuplot scripts used further down.
LBIN=/usr/local/bin/
LSHARE=/usr/local/share/web-data-analysis

# Make sure the month directory and the monthly/yearly aggregate files
# exist, so the merge sorts later always have a file to read.
mkdir -p "${WORKDIR}/${YEAR}/${MONTH}"
if [[ ! -f "${WORKDIR}/${YEAR}/out-${YEAR}-${MONTH}" ]]; then
    touch "${WORKDIR}/${YEAR}/out-${YEAR}-${MONTH}"
fi

if [[ ! -f "${WORKDIR}/out-${YEAR}" ]]; then
    touch "${WORKDIR}/out-${YEAR}"
fi

# Turn the day's combined access log into the per-day work file, unless
# the log is missing or the work file was already produced by an
# earlier run.
if [[ ! -f "${LOGFILE}" ]]; then
    echo "No logfile found for ${YEAR}/${MONTH}/${DAY}. Please fix."
elif [[ -f "${WORKFILE}" ]]; then
    echo "The workfile for ${YEAR}/${MONTH}/${DAY} already existed."
else
    "${LBIN}/mirrorlist.py" -o "${WORKFILE}" "${LOGFILE}"
fi

# So the data isn't strictly across month boundaries due to the end of
# the logfiles being at 04:00 versus 23:59. Also log files might get
# stuck and you end up with days or weeks of data in a single
# file. Doing a continual sort clears up that.
#
# Only merge when the day's work file actually exists; when the logfile
# was missing there is nothing to add and sort would just fail on the
# nonexistent input.
if [[ -f "${WORKFILE}" ]]; then
    sort -o "${WORKDIR}/${YEAR}/out-${YEAR}-${MONTH}" -S 4G -u -m \
        "${WORKDIR}/${YEAR}/out-${YEAR}-${MONTH}" "${WORKFILE}"
    sort -o "${WORKDIR}/out-${YEAR}" -S 4G -u -m \
        "${WORKDIR}/out-${YEAR}" "${WORKFILE}"
fi

# Because the logs stop at 04:00 we can only get 24 hours from 6 days before.
# OLDDATE is a literal date string, so a fixed-string match is used
# (egrep is obsolescent and warns on current GNU grep).
grep -F -- "${OLDDATE}" "${WORKDIR}/out-${OLDYEAR}" > "${TEMPDIR}/watched-day"

# Grab the data and put it in the two files. This makes it a lot
# faster to process as a whole year may take an hour to go through.
#
# The day's summary is appended to both the per-year and the "all" CSV,
# then each file is sorted with duplicates dropped so a re-run for the
# same day does not add repeated rows.
for i in "${OLDYEAR}" all; do
    awk -f "${LSHARE}/${PROJECT}-data.awk" "${TEMPDIR}/watched-day" >> "${WEBDIR}/${PROJECT}data-${i}.csv"
    sort -o "${WEBDIR}/${PROJECT}data-${i}.csv" -u "${WEBDIR}/${PROJECT}data-${i}.csv"
done

# Run the project's gnuplot script.
gnuplot "${LSHARE}/${PROJECT}-data.gp"

# cleanup the temp data
# (:? stops the script instead of running rm -rf with an empty operand)
rm -rf -- "${TEMPDIR:?}"