grokmirror 2.0 changes

grokmirror 2.0+ merges the old fsck and repos configs into one
grokmirror config.

Signed-off-by: Kevin Fenzi <kevin@scrye.com>
This commit is contained in:
Kevin Fenzi 2022-06-02 12:49:38 -07:00
parent 629f56f464
commit e37f4d7917
6 changed files with 332 additions and 181 deletions

View file

@ -21,13 +21,8 @@
tags:
- grokmirror-mirror
- name: install grokmirror repos config file from template
template: src=repos.conf dest={{grokmirror_topdir}}/repos.conf owner=root group=root mode=644
tags:
- grokmirror-mirror
- name: install grokmirror fsck config file from template
template: src=fsck.conf dest={{grokmirror_topdir}}/fsck.conf owner=root group=root mode=644
- name: install grokmirror config file from template
template: src=grokmirror.conf dest={{grokmirror_topdir}}/grokmirror.conf owner=root group=root mode=644
tags:
- grokmirror-mirror

View file

@ -1,66 +0,0 @@
# You can have multiple sections, just name them appropriately
[src.fedoraproject.org]
# Where is the manifest containing the list of repositories?
manifest = {{ grokmirror_topdir }}/src.fedoraproject.org/manifest.js.gz
#
# Where are the repositories kept?
#toplevel = /var/lib/git/mirror
toplevel = {{ grokmirror_topdir }}/src.fedoraproject.org
#
# Where do we put the logs?
#log = /var/log/mirror/kernelorg-fsck.log
log = {{ grokmirror_topdir }}/src.fedoraproject.org/src.fedoraproject.org-fsck.log
#
# Log level can be "info" or "debug"
#loglevel = info
loglevel = info
#
# Make sure there is only one instance of grok-fsck running by
# trying to exclusive-lock this file before we do anything.
lock = /var/tmp/kernelorg-fsck.lock
#
# Where to keep the status file
#statusfile = /var/lib/mirror/kernelorg-fsck.js
statusfile = /var/tmp/kernelorg-fsck.js
#
# How often should we check each repository, in days.
# Any newly added repository will have the first check within a random
# period of 0 and $frequency, and then every $frequency after that,
# to assure that not all repositories are checked on the same day.
# Don't set to less than 7 unless you only mirror a few repositories
# (or really like to thrash your disks).
#frequency = 30
frequency = 30
#
# Some errors are relatively benign and can be safely ignored. Add matching
# substrings to this field to ignore them.
ignore_errors = dangling commit
dangling blob
notice: HEAD points to an unborn branch
notice: No default references
contains zero-padded file modes
#
# Repack the repositories after calling git-fsck.
# This will especially save you space if you have shared repositories.
# To check if you have shared repositories, look at your manifest.js.gz to
# see if any repository definition has a "reference" key.
repack = yes
#
# Default repack flags are -A -d -l -q, but you can specify your own here
repack_flags = -A -d -l -q
#
# Once in a while you should repack your repositories more thoroughly,
# by passing a -f flag and using a larger window/depth. You shouldn't need
# to do this more frequently than once every 10-15 regular repacks.
#
# Trigger a full repack every N times.
full_repack_every = 10
#
# What flags should we use when doing a full repack.
full_repack_flags = -A -d -l -q -f --window=200 --depth=50
#
# Run git-prune to remove obsolete old objects if no other repositories are
# using the repo in their objects/info/alternates. If other repositories
# are relying on this repo via alternates, it will not be pruned to avoid
# potential corruption.
prune = yes

View file

@ -1 +1 @@
00 02 * * * grokmirror /usr/bin/grok-fsck -c {{ grokmirror_topdir }}/fsck.conf
00 02 * * * grokmirror /usr/bin/grok-fsck -c {{ grokmirror_topdir }}/grokmirror.conf

View file

@ -0,0 +1,328 @@
# Grokmirror 2.x and above have a single config file per each set
# of mirrored repos, instead of a separate repos.conf and fsck.conf
# with multiple sections.
#
# You can use ${varname} interpolation within the same section
# or ${sectname:varname} from any other section.
[core]
#
# Where are our mirrored repositories kept?
toplevel = {{ grokmirror_topdir }}/src.fedoraproject.org/
#
# Where should we keep our manifest file?
manifest = ${toplevel}/manifest.js.gz
#
# Where should we put our log? Make sure it is logrotated,
# otherwise it will grow indefinitely.
log = ${toplevel}/log
#
# Options are "info" and "debug" for all the debug data (lots!)
loglevel = info
#
# Grokmirror version 2.x and above can automatically recognize related repositories
# by analyzing root commits. If it finds two or more related repositories, it can set
# up a unified "object storage" repo and fetch all refs from each related repository.
# For example, you can have two forks of linux.git:
# foo/bar/linux.git:
# refs/heads/master
# refs/heads/devbranch
# refs/tags/v5.0-rc3
# ...
# baz/quux/linux.git:
# refs/heads/master
# refs/heads/devbranch
# refs/tags/v5.0-rc3
# ...
# Grokmirror will set up an object storage repository and fetch all refs from
# both repositories:
# objstore/[random-guid-name].git
# refs/virtual/[sha1-of-foo/bar/linux.git:12]/heads/master
# refs/virtual/[sha1-of-foo/bar/linux.git:12]/heads/devbranch
# refs/virtual/[sha1-of-foo/bar/linux.git:12]/tags/v5.0-rc3
# ...
# refs/virtual/[sha1-of-baz/quux/linux.git:12]/heads/master
# refs/virtual/[sha1-of-baz/quux/linux.git:12]/heads/devbranch
# refs/virtual/[sha1-of-baz/quux/linux.git:12]/tags/v5.0-rc3
# ...
#
# This will dramatically improve storage on disk, as original repositories will be
# repacked to almost nothing. Grokmirror will repack the object storage repository
# with --delta-islands to help optimize packs for efficient clones.
objstore = ${toplevel}/objstore
#
# When copying objects into objstore repositories, we will use regular git
# porcelain commands, such as git fetch. However, this tends to be slow due to
# git erring on the side of caution when calculating haves and wants, so if you
# are running a busy mirror and want to save a lot of cycles, you will want to
# enable the setting below, which will use internal git plumbing for much more
# direct object copying between repos.
#objstore_uses_plumbing = yes
#
# Due to the nature of git alternates, if two repositories share all their objects
# with an "object storage" repo, any object from repoA can be retrieved from repoB
# via most web UIs if someone knows the object hash.
# E.g. this is how this trick works on Github:
# https://github.com/torvalds/linux/blob/b4061a10fc29010a610ff2b5b20160d7335e69bf/drivers/hid/hid-samsung.c#L113-L118
#
# If you have private repositories that should absolutely not reveal any objects,
# add them here using shell-style globbing. They will still be set up for alternates
# if we find common roots with public repositories, but we won't fetch any objects
# from these repos into refs/virtual/*.
#
# Leave blank if you don't have any private repos (or don't offer a web UI).
#private = */private/*
# Used by grok-manifest (and others for "pretty"). These options can be
# overridden using matching command-line switches to grok-manifest.
[manifest]
# Enable to save pretty-printed js (larger and slower, but easier to debug)
pretty = no
# List of repositories to ignore -- can take multiple entries with newline+tab
# and accepts shell globbing.
ignore = /testing/*
/private/*
# Enable to fetch objects into objstore repos after commit. This can be useful if
# someone tries to push the same objects to a sibling repository, but may significantly
# slow down post-commit hook operation, negating any speed gains. If set to no, the
# objects will be fetched during regular grok-fsck runs.
fetch_objstore = no
# Only include repositories that have git-daemon-export-ok.
check_export_ok = no
# Used by grok-pull, mostly
[remote]
# The host part of the mirror you're pulling from.
site = https://src.fedoraproject.org
#
# Where the grok manifest is published. The following protocols
# are supported at this time:
# http:// or https:// using If-Modified-Since http header
# file:// (when manifest file is on NFS, for example)
# NB: You can no longer specify username:password as part of the URL with
# grokmirror 2.x and above. You can use a netrc file for this purpose.
manifest = ${site}/manifest.js.gz
#
# As an alternative to setting a manifest URL, you can define a manifest_command.
# It has three possible outcomes:
# exit code 0 + full remote manifest on stdout (must be valid json)
# exit code 1 + error message on stdout
# exit code 127 + nothing on stdout if remote manifest hasn't changed
# It should also accept '--force' as a single argument to force manifest retrieval
# even if it hasn't changed.
# See contrib/gitolite/* for example commands to use with gitolite.
#manifest_command = /usr/local/bin/grok-get-gl-manifest.sh
#
# If the remote is providing pre-generated preload bundles, list the path
# here. This is only useful if you're mirroring the entire repository
# collection and not just a handful of select repos.
#preload_bundle_url = https://some-cdn-site.com/preload/
# Used by grok-pull
[pull]
#
# Write out projects.list that can be used by gitweb or cgit.
# Leave blank if you don't want a projects.list.
projectslist =
#
# When generating projects.list, start at this subpath instead
# of at the toplevel. Useful when mirroring kernel or when generating
# multiple gitweb/cgit configurations for the same tree.
projectslist_trimtop =
#
# When generating projects.list, also create entries for symlinks.
# Otherwise we assume they are just legacy and keep them out of
# web interfaces.
projectslist_symlinks = no
#
# A simple hook to execute whenever a repository is modified.
# It passes the full path to the git repository modified as the final
# argument. You can define multiple hooks if you separate them by
# newline+whitespace.
post_update_hook =
#
# Should we purge repositories that are not present in the remote
# manifest? If set to "no" this can be overridden via the -p flag to
# grok-pull (useful if you have a very large collection of repos
# and don't want to walk the entire tree on each manifest run).
# See also: purgeprotect.
purge = yes
#
# There may be repositories that aren't replicated with grokmirror that
# you don't want to be purged. You can list them below using bash-style
# globbing. Separate multiple entries using newline+whitespace.
#nopurge = /gitolite-admin.git
#
# This prevents catastrophic mirror purges when our upstream gives us a
# manifest that is dramatically smaller than ours. The default is to
# refuse the purge if the remote manifest has over 5% fewer repositories
# than what we have, or in other words, if we have 100 repos and the
# remote manifest has shrunk to 95 repos or fewer, we refuse to purge,
# suspecting that something has gone wrong. You can set purgeprotect to
# a higher percentage, or override it entirely with --force-purge
# commandline flag.
purgeprotect = 5
#
# If owner is not specified in the manifest, who should be listed
# as the default owner in tools like gitweb or cgit?
#default_owner = Grokmirror User
default_owner = Grokmirror User
#
# By default, we'll call the upstream origin "_grokmirror", but you can set your
# own name here (e.g. just call it "origin")
remotename = _grokmirror
#
# To speed up updates, grok-pull will use multiple threads. Please be
# considerate to the mirror you're pulling from and don't set this very
# high. You may also run into per-ip multiple session limits, so leave
# this number at a nice low setting.
pull_threads = 5
#
# If git fetch fails, we will retry up to this many times before
# giving up and marking that repository as failed.
retries = 3
#
# Use shell-globbing to list the repositories you would like to mirror.
# If you want to mirror everything, just say "*". Separate multiple entries
# with newline plus tab. Examples:
#
# mirror everything:
#include = *
#
# mirror just the main kernel sources:
#include = /pub/scm/linux/kernel/git/torvalds/linux.git
# /pub/scm/linux/kernel/git/stable/linux.git
# /pub/scm/linux/kernel/git/next/linux-next.git
include = *
#
# This is processed after the include. If you want to exclude some
# specific entries from an all-inclusive globbing above. E.g., to
# exclude all linux-2.4 git sources:
#exclude = */linux-2.4*
exclude =
#
# List repositories that should always reject forced pushes.
#ffonly = */torvalds/linux.git
#
# If you enable the following option and run grok-pull with -o,
# grok-pull will run continuously and will periodically recheck the
# remote maniefest for new updates. See contrib for an example systemd
# service you can set up to continuously update your local mirror. The
# value is in seconds.
#refresh = 900
#
# If you enable refresh, you can also enable the socket listener that
# allows for rapid push notifications from your primary mirror. The
# socket expects repository names matching what is in the local
# manifest, followed by a newline. E.g.:
# /pub/scm/linux/kernel/git/torvalds/linux.git\n
#
# Anything not matching a repository in the local manifest will be ignored.
# See contrib for example pubsub listener.
#socket = ${core:toplevel}/.updater.socket
# Used by grok-fsck
[fsck]
#
# How often should we check each repository, in days. Any newly added
# repository will have the first check within a random period of 0 and
# $frequency, and then every $frequency after that, to assure that not
# all repositories are checked on the same day. Don't set to less than
# 7 unless you only mirror a few repositories (or really like to thrash
# your disks).
frequency = 30
#
# Where to keep the status file
statusfile = ${core:toplevel}/fsck.status.js
#
# Some errors are relatively benign and can be safely ignored. Add
# matching substrings to this field to ignore them.
ignore_errors = notice:
warning: disabling bitmap writing
ignoring extra bitmap file
missingTaggerEntry
missingSpaceBeforeDate
#
# If the fsck process finds errors that match any of these strings
# during its run, it will ask grok-pull to reclone this repository when
# it runs next. Only useful for minion mirrors, not for mirror masters.
reclone_on_errors = fatal: bad tree object
fatal: Failed to traverse parents
missing commit
missing blob
missing tree
broken link
#
# Should we repack the repositories? You almost always want this on,
# unless you are doing something really odd.
repack = yes
#
# We set proper flags for repacking depending if the repo is using
# alternates or not, and whether this is a full repack or not. We will
# also always build bitmaps (when it makes sense), to make cloning
# faster. You can add other flags (e.g. --threads and --window-memory)
# via the following parameter:
extra_repack_flags =
#
# These flags are added *in addition* to extra_repack_flags
extra_repack_flags_full = --window=250 --depth=50
#
# If git version is new enough to support generating commit graphs, we
# will always generate them, though if your git version is older than
# 2.24.0, the graphs won't be automatically used unless core.commitgraph
# is set to true. You can turn off graph generation by setting the
# commitgraph option to "no". Graph generation will be skipped for
# child repos that use alternates.
commitgraph = yes
#
# Run git-prune to remove obsolete loose objects. Grokmirror will make
# sure this is a safe operation when it comes to objstore repos, so you
# should leave this enabled.
prune = yes
#
# Grokmirror is extremely careful about not pruning the repositories
# that are used by others via git alternates. However, it cannot prevent
# some other git process (not grokmirror-managed) from inadvertently
# running "git prune/gc". For example, this may happen if an admin
# mistypes a command in the wrong directory. Setting precious=yes will
# add extensions.preciousObjects=true to the git configuration file in
# such repositories, which will help prevent repository corruption
# between grok-fsck runs.
#
# When set to "yes", grokmirror will temporarily turn this feature off
# when running scheduled repacks in order to be able to delete redundant
# packs and loose objects that have already been packed. This is usually
# a safe operation when done by grok-fsck itself. However, if you set
# this to "always", grokmirror will leave this enabled even during
# grok-fsck runs, for maximum paranoia. Be warned, that this will result
# in ever-growing git repositories, so it only makes sense in very rare
# situations, such as for backup purposes.
precious = yes
#
# If you have a lot of forks using the same objstore repo, you may end
# up with thousands of refs being negotiated during each remote update.
# This tends to result in higher load and bigger negotiation transfers.
# Setting the "baselines" option allows you to designate a set of repos
# that are likely to have most of the relevant objects and ignore the
# rest of the objstore refs. This is done using the
# core.alternateRefsPrefixes feature (see git-config).
baselines = */kernel/git/next/linux-next.git
#
# Objstore repos are repacked with delta island support (see man
# git-config), but if you have one repo that is a lot more likely to be
# cloned than all the other ones, you can designate it as "islandCore",
# which will give it priority when creating packs.
islandcores = */kernel/git/torvalds/linux.git
#
# Generate preload bundles for objstore repos and put them into this
# location. Unless you are running a major mirroring hub site, you
# do not want this enabled. See corresponding preload_bundle_url
# entry in the [remote] section.
#preload_bundle_outdir = /some/http/accessible/path
#
# If there are any critical errors, the report will be sent to root. You
# can change the settings below to configure report delivery to suit
# your needs:
#report_to = root
#report_from = root
#report_subject = git fsck errors on my beautiful replica
#report_mailhost = localhost

View file

@ -1 +1 @@
*/30 * * * * grokmirror /usr/bin/grok-pull -p -c {{ grokmirror_topdir }}/repos.conf
*/30 * * * * grokmirror /usr/bin/grok-pull -p -c {{ grokmirror_topdir }}/grokmirror.conf

View file

@ -1,106 +0,0 @@
# You can pull from multiple grok mirrors, just create
# a separate section for each mirror. The name can be anything.
[src.fedoraproject.org]
# The host part of the mirror you're pulling from.
#site = git://git.kernel.org
site = https://src.fedoraproject.org
#
# Where the grok manifest is published. The following protocols
# are supported at this time:
# http:// or https:// using If-Modified-Since http header
# file:// (when manifest file is on NFS, for example)
#manifest = http://git.kernel.org/manifest.js.gz
manifest = https://src.fedoraproject.org/grokmirror/manifest.js.gz
#
# Where are we going to put the mirror on our disk?
#toplevel = /var/lib/git/mirror
toplevel = {{ grokmirror_topdir }}/src.fedoraproject.org/
#
# Where do we store our own manifest? Usually in the toplevel.
#mymanifest = /var/lib/git/mirror/manifest.js.gz
mymanifest = {{ grokmirror_topdir }}/src.fedoraproject.org/manifest.js.gz
#
# Write out projects.list that can be used by gitweb or cgit.
# Leave blank if you don't want a projects.list.
#projectslist = /var/lib/git/mirror/projects.list
projectslist =
#
# When generating projects.list, start at this subpath instead
# of at the toplevel. Useful when mirroring kernel or when generating
# multiple gitweb/cgit configurations for the same tree.
#projectslist_trimtop = /pub/scm/
projectslist_trimtop = /pub/scm/
#
# The default behaviour is to use cross-repository references (if present)
# to set git alternates between projects. This helps both significantly
# reduce the bandwidth during cloning and dramatically reduce the mirror size
# on disk. If for some reason you always want to create non-shared repositories,
# set ignore_repo_references to "yes"
# NOTE: this will NOT remove alternates for repositories already mirrored. You
# will need to run "git repack -a" and then manually remove the
# objects/info/alternates file for each repository.
#ignore_repo_references = no
#
# When generating projects.list, also create entries for symlinks.
# Otherwise we assume they are just legacy and keep them out of
# web interfaces.
#projectslist_symlinks = yes
projectslist_symlinks = no
#
# A simple hook to execute whenever a repository is modified.
# It passes the full path to the git repository modified as the only
# argument.
#post_update_hook = /usr/local/bin/make-git-fairies-appear
post_update_hook =
#
# This prevents catastrophic mirror purges when our upstream master gives us
# a manifest that is dramatically smaller than ours. The default is to refuse
# the purge if the remote manifest has over 10% fewer repositories than what we
# have, or in other words, if we have 100 repos and the remote manifest has
# shrunk to 95 repos or less, we refuse to purge, suspecting that something has
# gone wrong. You can set purgeprotect to a higher percentage, or override
# it entirely with --force-purge commandline flag.
#purgeprotect = 5
purgeprotect = 5
#
# If owner is not specified in the manifest, who should be listed
# as the default owner in tools like gitweb or cgit?
#default_owner = Grokmirror User
default_owner = Grokmirror User
#
# Where do we put the logs?
#log = /var/log/mirror/kernelorg.log
log = {{ grokmirror_topdir }}/src.fedoraproject.org/src.fedoraproject.org.log
#
# Log level can be "info" or "debug"
#loglevel = info
loglevel = info
#
# To speed up updates, grok-pull will use multiple threads. Please be
# considerate to the mirror you're pulling from and don't set this very
# high. You may also run into per-ip multiple session limits, so leave this
# number at a nice low setting.
#pull_threads = 5
pull_threads = 5
#
# Use shell-globbing to list the repositories you would like to mirror.
# If you want to mirror everything, just say "*". Separate multiple entries
# with newline plus tab. Examples:
#
# mirror everything:
#include = *
#
# mirror just the main kernel sources:
#include = /pub/scm/linux/kernel/git/torvalds/linux.git
# /pub/scm/linux/kernel/git/stable/linux-stable.git
# /pub/scm/linux/kernel/git/next/linux-next.git
#
# mirror just git:
#include = /pub/scm/git/*
include = *
#
# This is processed after the include. If you want to exclude some specific
# entries from an all-inclusive globbing above. E.g., to exclude all linux-2.4
# git sources:
#exclude = */linux-2.4*
exclude =