+1 On 18 November 2016 at 20:05, Adam Williamson <adamwill@xxxxxxxxxxxxxxxxx> wrote: > This commit should give us new 'filterlist' files alongside the > 'fullfilelist' and 'fullfiletimelist' files in the /fedora , /archive and > /alt folders on the mirrors. The contents of this file are the same as > 'fullfilelist', but with all directories, packages (.rpm or .drpm files) and > device tree boot files (.dtb files) removed. This gives a massively smaller > list which, right now, will be useful for fedfind (it can parse these lists > instead of rsync scraping) and may possibly be useful for other things in > future, I guess. > > It would be nice to have this now as this all came out of the work around > improving generation of the mediawriter 'available images' JSON file: we > want to use fedfind to generate that, but the rsync scraping is pretty heavy > for something that'll run quite frequently. This should improve things quite > a bit (especially as I've written fedfind to cache the files and only > re-download them if the Last-Modified header changes). I have the fedfind > changes all written and tested (I tested against the fullfilelist files). > > From 3f61df2f2879c23a7e44271527facce99bc92286 Mon Sep 17 00:00:00 2001 > From: Adam Williamson <awilliam@xxxxxxxxxx> > Date: Fri, 18 Nov 2016 16:34:38 -0800 > Subject: [PATCH] Generate filtered file lists for fedfind to use > > This adds `filterlist` files alongside the `fullfilelist` and > `fullfiletimelist` files. These are much, much shorter lists > which skip the entries for packages, ARM device tree boot files > and directories. They are intended for consumption by fedfind, > so it can stop using rsync scraping to discover the image files > it looks for. To enable this, we update to a newer version of > `create-filelist` from upstream `quick-fedora-mirror` and make > `update-fullfiletimelist` create the filterlist files as well. 
> > We also delete a couple of old copies of `create-filelist`; > nirik made the two roles that use it share a common copy a few > months back, but missed deleting the copy each role had in its > `files` directory. > --- > files/scripts/create-filelist | 10 ++++++++- > files/scripts/update-fullfiletimelist | 19 ++++++++++++++-- > roles/bodhi2/backend/files/create-filelist | 36 > ------------------------------ > roles/releng/files/create-filelist | 36 > ------------------------------ > 4 files changed, 26 insertions(+), 75 deletions(-) > delete mode 100644 roles/bodhi2/backend/files/create-filelist > delete mode 100644 roles/releng/files/create-filelist > > diff --git a/files/scripts/create-filelist b/files/scripts/create-filelist > index eeba9d0..8fc3367 100755 > --- a/files/scripts/create-filelist > +++ b/files/scripts/create-filelist > @@ -57,7 +57,9 @@ def recursedir(path='.', skip=[], alwaysskip=['.~tmp~']): > def parseopts(): > null = open(os.devnull, 'w') > p = argparse.ArgumentParser( > - description='Generate a list of files and times, suitable for > consumption by quick-fedora-mirror.') > + description='Generate a list of files and times, suitable for > consumption by quick-fedora-mirror, ' > + 'and a much smaller list with packages, Device Tree > boot files, HTML files and ' > + 'directories filtered out, for consumption by > fedfind.') > p.add_argument('-c', '--checksum', action='store_true', > help='Include checksums of all repomd.xml files in the > file list.') > p.add_argument('-C', '--checksum-file', action='append', > dest='checksum_files', > @@ -73,6 +75,8 @@ def parseopts(): > help='Filename of the file list with times (default: > stdout).') > p.add_argument('-f', '--filelist', type=argparse.FileType('w'), > default=null, > help='Filename of the file list without times (default: > no plain file list is generated).') > + p.add_argument('-F', '--filterlist', type=argparse.FileType('w'), > default=null, > + help='Filename of the filtered file list 
for fedfind > (default: not generated).') > > opts = p.parse_args() > > @@ -107,6 +111,10 @@ def main(): > for entry in recursedir(skip=opts.skip_files): > # opts.filelist.write(entry.path + '\n') > print(entry.path, file=opts.filelist) > + # write to filtered list if appropriate > + skips = ('.rpm', '.drpm', '.dtb', '.html') > + if not any(entry.path.endswith(skip) for skip in skips) and not > (entry.is_dir()): > + print(entry.path, file=opts.filterlist) > if entry.name in opts.checksum_files: > checksums[entry.path[2:]] = True > info = entry.stat(follow_symlinks=False) > diff --git a/files/scripts/update-fullfiletimelist > b/files/scripts/update-fullfiletimelist > index 016ca8e..e70fadc 100755 > --- a/files/scripts/update-fullfiletimelist > +++ b/files/scripts/update-fullfiletimelist > @@ -25,6 +25,7 @@ CREATE=/usr/local/bin/create-filelist > # context. > FILELIST=fullfilelist > TIMELIST='fullfiletimelist-$mod' > +FILTERLIST=filterlist > > usage () { > echo > @@ -107,12 +108,14 @@ cd $tmpd > for mod in $MODS; do > currentfl=$TOPD/$mod/${FILELIST/'$mod'/$mod} > currenttl=$TOPD/$mod/${TIMELIST/'$mod'/$mod} > + currentsl=$TOPD/$mod/${FILTERLIST/'$mod'/$mod} > flname=$(basename $currentfl) > tlname=$(basename $currenttl) > + slname=$(basename $currentsl) > > - $CREATE -c -s -d $TOPD/$mod -f $flname -t $tlname > + $CREATE -c -s -d $TOPD/$mod -f $flname -t $tlname -F $slname > > - # If a file list exsts and doesn't differ from what we just > generated, > + # If a file list exists and doesn't differ from what we just > generated, > # delete the latter. 
> if [[ -f $currentfl ]] && diff -q $currentfl $flname > /dev/null; > then > rm -f $flname > @@ -120,6 +123,9 @@ cd $tmpd > if [[ -f $currenttl ]] && diff -q $currenttl $tlname > /dev/null; > then > rm -f $tlname > fi > + if [[ -f $currentsl ]] && diff -q $currentsl $slname > /dev/null; > then > + rm -f $slname > + fi > done > > # Now we have the new file lists but in a temporary directory which > @@ -128,10 +134,13 @@ cd $tmpd > for mod in $MODS; do > currentfl=$TOPD/$mod/${FILELIST/'$mod'/$mod} > currenttl=$TOPD/$mod/${TIMELIST/'$mod'/$mod} > + currentsl=$TOPD/$mod/${FILTERLIST/'$mod'/$mod} > flname=$(basename $currentfl) > fldir=$(dirname $currentfl) > tlname=$(basename $currenttl) > tldir=$(dirname $currenttl) > + slname=$(basename $currentsl) > + sldir=$(dirname $currentsl) > > if [[ -f $flname ]]; then > tmpf=$(mktemp -p $fldir $flname.XXXXXXXXXX) > @@ -145,6 +154,12 @@ cd $tmpd > chmod 644 $tmpf > mv $tmpf $currenttl > fi > + if [[ -f $slname ]]; then > + tmpf=$(mktemp -p $sldir $slname.XXXXXXXXXX) > + cp -p $slname $tmpf > + chmod 644 $tmpf > + mv $tmpf $currentsl > + fi > done > > ) 9>$LOCKFILE > diff --git a/roles/bodhi2/backend/files/create-filelist > b/roles/bodhi2/backend/files/create-filelist > deleted file mode 100644 > index d95000e..0000000 > --- a/roles/bodhi2/backend/files/create-filelist > +++ /dev/null > @@ -1,36 +0,0 @@ > -#!/usr/bin/python > - > -# A simple script to generate a file list in a format easily consumable by > a > -# shell script. > - > -# Originally written by Jason Tibbitts <tibbs@xxxxxxxxxxx> in 2016. > -# Donated to the public domain. If you require a statement of license, > please > -# consider this work to be licensed as "CC0 Universal", any version you > choose. 
> - > - > -from scandir import scandir > - > - > -def get_ftype(entry): > - """Return a simple indicator of the file type.""" > - if entry.is_symlink(): > - return 'l' > - if entry.is_dir(): > - return 'd' > - return 'f' > - > - > -def recursedir(path): > - """Just like scandir, but recursively.""" > - for entry in scandir(path): > - if entry.is_dir(follow_symlinks=False): > - for rentry in recursedir(entry.path): > - yield rentry > - yield entry > - > - > -for entry in recursedir('.'): > - info = entry.stat(follow_symlinks=False) > - modtime = max(info.st_mtime, info.st_ctime) > - ftype = get_ftype(entry) > - print('{} {} {}'.format(modtime, ftype, entry.path[2:])) > diff --git a/roles/releng/files/create-filelist > b/roles/releng/files/create-filelist > deleted file mode 100644 > index d95000e..0000000 > --- a/roles/releng/files/create-filelist > +++ /dev/null > @@ -1,36 +0,0 @@ > -#!/usr/bin/python > - > -# A simple script to generate a file list in a format easily consumable by > a > -# shell script. > - > -# Originally written by Jason Tibbitts <tibbs@xxxxxxxxxxx> in 2016. > -# Donated to the public domain. If you require a statement of license, > please > -# consider this work to be licensed as "CC0 Universal", any version you > choose. 
> - > - > -from scandir import scandir > - > - > -def get_ftype(entry): > - """Return a simple indicator of the file type.""" > - if entry.is_symlink(): > - return 'l' > - if entry.is_dir(): > - return 'd' > - return 'f' > - > - > -def recursedir(path): > - """Just like scandir, but recursively.""" > - for entry in scandir(path): > - if entry.is_dir(follow_symlinks=False): > - for rentry in recursedir(entry.path): > - yield rentry > - yield entry > - > - > -for entry in recursedir('.'): > - info = entry.stat(follow_symlinks=False) > - modtime = max(info.st_mtime, info.st_ctime) > - ftype = get_ftype(entry) > - print('{} {} {}'.format(modtime, ftype, entry.path[2:])) > -- > 2.10.2 > > -- > Adam Williamson > Fedora QA Community Monkey > IRC: adamw | Twitter: AdamW_Fedora | XMPP: adamw AT happyassassin . net > http://www.happyassassin.net > _______________________________________________ > infrastructure mailing list -- infrastructure@xxxxxxxxxxxxxxxxxxxxxxx > To unsubscribe send an email to infrastructure-leave@xxxxxxxxxxxxxxxxxxxxxxx -- Stephen J Smoogen. _______________________________________________ infrastructure mailing list -- infrastructure@xxxxxxxxxxxxxxxxxxxxxxx To unsubscribe send an email to infrastructure-leave@xxxxxxxxxxxxxxxxxxxxxxx