In many cases we move around or rename internal anchors, which may break links leading to the content. docutils handles the case of links inside a document, but we lack the same form of checking between documents. Introduce a script which cross-checks all the anchors and links in the HTML output files and prints any problems, and use it as a test case for the 'docs' directory. Signed-off-by: Peter Krempa <pkrempa@xxxxxxxxxx> --- docs/meson.build | 11 +++ scripts/check-html-references.py | 153 +++++++++++++++++++++++++++++++ scripts/meson.build | 1 + 3 files changed, 165 insertions(+) create mode 100755 scripts/check-html-references.py diff --git a/docs/meson.build b/docs/meson.build index d71f6006dd..cb70ef6084 100644 --- a/docs/meson.build +++ b/docs/meson.build @@ -350,3 +350,14 @@ run_target( ], depends: install_web_deps, ) + +test( + 'check-html-references', + python3_prog, + args: [ + check_html_references_prog.path(), + '--prefix', + meson.build_root() / 'docs' + ], + env: runutf8, +) diff --git a/scripts/check-html-references.py b/scripts/check-html-references.py new file mode 100755 index 0000000000..95a61a6bb4 --- /dev/null +++ b/scripts/check-html-references.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library. If not, see +# <http://www.gnu.org/licenses/>. +# +# Check that external references between documentation HTML files are not broken. 
import sys
import os
import argparse
import re
import xml.etree.ElementTree as ET

# Namespace map passed to every ElementTree XPath query on the XHTML files.
ns = {'html': 'http://www.w3.org/1999/xhtml'}

# Every link containing '://' found by process_file() is appended here.
externallinks = []


def get_file_list(prefix):
    """Collect the HTML files found under *prefix*.

    Returns a list of ``(fullfilename, relfilename)`` tuples, where
    ``fullfilename`` is the path as produced by walking *prefix* and
    ``relfilename`` is the same path with the parent directory of *prefix*
    stripped from its start.  Files whose name contains ``404.html`` are
    skipped.
    """
    filelist = []

    # Loop-invariant: the parent directory of the prefix never changes while
    # walking, so compute it once.
    prefixbase = os.path.dirname(prefix)

    for root, _dirs, files in os.walk(prefix):
        if root.startswith(prefixbase):
            relroot = root[len(prefixbase):]
        else:
            relroot = root

        for name in files:
            if not name.endswith('.html'):
                continue

            # the 404 page doesn't play well
            if '404.html' in name:
                continue

            fullfilename = os.path.join(root, name)
            relfilename = os.path.join(relroot, name)
            filelist.append((fullfilename, relfilename))

    return filelist


def process_file(filetuple):
    """Parse one XHTML file and extract its anchors and local link targets.

    *filetuple* is a ``(fullfilename, relfilename)`` pair as produced by
    get_file_list().

    Returns ``(anchors, targets)``:

    * ``anchors`` is a list starting with ``relfilename`` itself, followed by
      ``relfilename#id`` for every ``<a>``, ``<div class='section'>`` and
      ``<section>`` element carrying an ``id`` attribute.
    * ``targets`` is a list of ``(normalized_target, fullfilename, href)``
      tuples for every ``<a href>`` pointing to another document; the
      normalized target is the href resolved against the directory of
      ``relfilename``.

    Links containing ``://`` are appended to the module-level
    ``externallinks`` list instead.  Links starting with ``#`` and links
    containing ``mailto:`` are ignored.

    Parse errors raised by ElementTree are propagated to the caller.
    """
    filename, relfilename = filetuple
    root = ET.parse(filename).getroot()

    anchors = [relfilename]
    targets = []

    for elem in root.findall('.//html:a', ns):
        target = elem.get('href')
        an = elem.get('id')

        if an:
            anchors.append(relfilename + '#' + an)

        if not target:
            continue

        if '://' in target:
            externallinks.append(target)
        elif not target.startswith('#') and 'mailto:' not in target:
            dirname = os.path.dirname(relfilename)
            targetname = os.path.normpath(os.path.join(dirname, target))

            targets.append((targetname, filename, target))

    # older docutils generate "<div class='section'", modern docutils
    # generate a <section element; the queries run in this order so the
    # resulting anchor order stays stable.
    section_queries = (
        ".//html:div[@class='section']",
        './/html:section',
    )

    for query in section_queries:
        for elem in root.findall(query, ns):
            an = elem.get('id')

            if an:
                anchors.append(relfilename + '#' + an)

    return (anchors, targets)


def process_all(filelist):
    """Run process_file() on every entry of *filelist* and merge the results.

    Returns ``(targets, anchors)`` -- note the order is swapped relative to
    what process_file() returns.
    """
    anchors = []
    targets = []

    for filetuple in filelist:
        file_anchors, file_targets = process_file(filetuple)

        # extend() in place; rebuilding the lists with '+' for every file
        # would copy everything collected so far each time.
        anchors.extend(file_anchors)
        targets.extend(file_targets)

    return (targets, anchors)
def check_targets(targets, anchors):
    """Report link targets that do not match any known anchor.

    *targets* is an iterable of ``(target, targetfrom, targetorig)`` tuples
    and *anchors* an iterable of anchor strings.  Every target missing from
    *anchors* is printed to stdout as ``<targetfrom> broken link:
    <targetorig>``, sorted, below a ``broken link targets:`` heading.

    Returns True when at least one broken link was found, False otherwise.
    """
    # Build the lookup set once; testing membership against the list would
    # rescan all anchors for every single link.
    known = set(anchors)

    errors = sorted((targetfrom, targetorig)
                    for target, targetfrom, targetorig in targets
                    if target not in known)

    if not errors:
        return False

    print('broken link targets:')

    for file, target in errors:
        print(file + " broken link: " + target)

    return True


def main():
    """Parse the command line and run the requested check.

    With ``--external`` the unique external links are printed in sorted
    order.  Otherwise the cross-document references are checked and the
    process exits with status 1 if any is broken, 0 if none is.
    """
    parser = argparse.ArgumentParser(description='HTML reference checker')
    parser.add_argument('--prefix', default='.',
                        help='build tree prefix')
    parser.add_argument('--external', action="store_true",
                        help='print external references instead')

    args = parser.parse_args()

    files = get_file_list(args.prefix)

    targets, anchors = process_all(files)

    if args.external:
        # externallinks is filled as a side effect of process_all()
        for ext in sorted(set(externallinks)):
            print(ext)
    else:
        if check_targets(targets, anchors):
            sys.exit(1)

        sys.exit(0)


if __name__ == '__main__':
    main()

# --- Remainder of the original patch (not part of check-html-references.py),
# --- kept verbatim:
#
# diff --git a/scripts/meson.build b/scripts/meson.build
# index 421e3d2acd..05b71184f1 100644
# --- a/scripts/meson.build
# +++ b/scripts/meson.build
# @@ -6,6 +6,7 @@ scripts = [
#    'check-driverimpls.py',
#    'check-drivername.py',
#    'check-file-access.py',
# +  'check-html-references.py',
#    'check-remote-protocol.py',
#    'check-symfile.py',
#    'check-symsorting.py',
# --
# 2.35.3