#3: inject-hashed-filenames.py — tag-aware HTML rewrite

Replace substring `html.replace(old_src, new_src)` with a regex anchored
to <script src="…"> / <link href="…"> attribute values. Inert occurrences
in comments, JSON literals, or unrelated attributes are left alone.

Loud warning (stderr) when zero matches occur — previously the script
silently skipped a typo'd old_src.

Also rewrites <link href> in the same pass so adjacent CSS hashing doesn't
need a follow-up edit.

Tests: tests/test_inject_hashed_filenames.py covers happy path (both quote
styles, extra attributes), inert-substring cases (comment, JSON literal,
data-attr, anchor href), and link-href rewriting.

Closes #3

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Uwe Schuster 2026-04-25 21:38:19 +02:00
parent b1a13b83fd
commit b3b2903c75
4 changed files with 107 additions and 4 deletions

View file

@ -26,9 +26,29 @@ containing the config file's parent chain → `$PWD`).
""" """
import json import json
import os import os
import re
import sys import sys
def _build_pattern(old_src: str) -> "re.Pattern[str]":
# Match `src` on <script> and `href` on <link>, single- or double-quoted.
# We anchor to the tag name so an `old_src` substring sitting inside an
# HTML comment, a JSON literal, or a `data-…` attribute is not rewritten.
return re.compile(
r'(<(?:script|link)\b[^>]*?\b(?:src|href)\s*=\s*["\'])'
+ re.escape(old_src)
+ r'(["\'])',
re.IGNORECASE,
)
def rewrite(html: str, old_src: str, new_src: str) -> "tuple[str, int]":
    """Swap `old_src` for `new_src` in script/link URL attributes of `html`.

    Only the attribute values recognised by `_build_pattern(old_src)` are
    touched; any other occurrence of the `old_src` text is left as it is.

    Returns a `(new_html, count)` pair, where `count` is the number of
    replacements made (0 means `new_html` equals `html`).
    """

    def _swap(match: "re.Match[str]") -> str:
        # Keep the matched prefix (tag, attribute name, opening quote) and the
        # closing quote; only the URL in between changes. A callable is used
        # so `new_src` is inserted literally, never parsed as a template.
        return match.group(1) + new_src + match.group(2)

    return _build_pattern(old_src).subn(_swap, html)
def inject(manifest_path: str, html_path: str, old_src: str) -> None: def inject(manifest_path: str, html_path: str, old_src: str) -> None:
if not os.path.exists(manifest_path): if not os.path.exists(manifest_path):
print(f"skip: no manifest at {manifest_path}") print(f"skip: no manifest at {manifest_path}")
@ -45,12 +65,18 @@ def inject(manifest_path: str, html_path: str, old_src: str) -> None:
new_src = f"{os.path.dirname(old_src)}/{hashed}" new_src = f"{os.path.dirname(old_src)}/{hashed}"
with open(html_path) as f: with open(html_path) as f:
html = f.read() html = f.read()
if old_src not in html: new_html, count = rewrite(html, old_src, new_src)
print(f"skip: {old_src!r} not in {html_path}") if count == 0:
# Loud warning — silent skip used to mask typos in `old_src`.
print(
f"WARN: no <script src> or <link href> matching {old_src!r} "
f"in {html_path} — leaving file unchanged",
file=sys.stderr,
)
return return
with open(html_path, "w") as f: with open(html_path, "w") as f:
f.write(html.replace(old_src, new_src)) f.write(new_html)
print(f"{old_src} -> {new_src}") print(f"{old_src} -> {new_src} ({count} occurrence{'s' if count != 1 else ''})")
return return
print(f"skip: no isEntry row in {manifest_path}") print(f"skip: no isEntry row in {manifest_path}")

View file

@ -0,0 +1,77 @@
"""Tests for bin/inject-hashed-filenames.py rewrite() (#3).
Pinned behaviour: the rewrite is tag-aware — only `<script src="…">` and
`<link href="…">` attribute values are replaced. Inert occurrences of the
old src in HTML comments, JSON literals, or unrelated attributes must be
left alone (the previous `html.replace` was substring-blind).
"""
import importlib.util
import os
import sys
# Directory containing this test file.
HERE = os.path.dirname(os.path.abspath(__file__))
# The script under test sits in the sibling `bin/` directory. Its filename
# contains hyphens, so it cannot be loaded with a normal `import` statement.
SCRIPT = os.path.join(os.path.dirname(HERE), "bin", "inject-hashed-filenames.py")
# Load the script from its path as a module named "ihf".
spec = importlib.util.spec_from_file_location("ihf", SCRIPT)
ihf = importlib.util.module_from_spec(spec)
# Register the module under its name before its body is executed.
sys.modules["ihf"] = ihf
spec.loader.exec_module(ihf)
def test_rewrites_script_src_double_quoted():
    """A double-quoted <script src> value is replaced exactly once."""
    page = '<script src="/static/dist/app.js"></script>'
    result = ihf.rewrite(page, "/static/dist/app.js", "/static/dist/app.abc123.js")
    assert result == ('<script src="/static/dist/app.abc123.js"></script>', 1)
def test_rewrites_script_src_single_quoted():
    """A single-quoted <script src> value is replaced and keeps its quotes."""
    html = "<script src='/static/dist/app.js'></script>"
    out, n = ihf.rewrite(html, "/static/dist/app.js", "/static/dist/app.abc123.js")
    assert n == 1
    # Pin the full output, not just a substring, so the quote style and the
    # surrounding markup are proven to survive the rewrite.
    assert out == "<script src='/static/dist/app.abc123.js'></script>"
def test_rewrites_link_href():
    """A <link href> value is replaced; other attributes are untouched."""
    html = '<link rel="stylesheet" href="/static/dist/app.css">'
    out, n = ihf.rewrite(html, "/static/dist/app.css", "/static/dist/app.abc123.css")
    assert n == 1
    # Exact comparison: also guards the `rel` attribute and the tag shape.
    assert out == '<link rel="stylesheet" href="/static/dist/app.abc123.css">'
def test_does_not_rewrite_inside_html_comment():
    """A bare mention of the old src inside an HTML comment is left alone."""
    html = '<!-- old script was at /static/dist/app.js --><script src="/other.js"></script>'
    out, n = ihf.rewrite(html, "/static/dist/app.js", "/static/dist/app.abc123.js")
    assert n == 0
    # Same strict check the other negative tests use: nothing may change.
    assert out == html
def test_does_not_rewrite_inside_json_literal():
    """The old src inside a JSON literal in page text is left alone."""
    page = '<pre>{ "src": "/static/dist/app.js" }</pre>'
    result = ihf.rewrite(page, "/static/dist/app.js", "/static/dist/app.abc123.js")
    assert result == (page, 0)
def test_does_not_rewrite_unrelated_attribute():
    """The old src in a data-* attribute on another tag is left alone."""
    page = '<img data-bundle="/static/dist/app.js">'
    result = ihf.rewrite(page, "/static/dist/app.js", "/static/dist/app.abc123.js")
    assert result == (page, 0)
def test_does_not_rewrite_anchor_href():
    """An <a href> is not a <link href>, so it must not be rewritten."""
    page = '<a href="/static/dist/app.js">debug link</a>'
    result = ihf.rewrite(page, "/static/dist/app.js", "/static/dist/app.abc123.js")
    assert result == (page, 0)
def test_rewrites_with_extra_attributes_around_src():
    """Attributes before and after `src` survive the rewrite unchanged."""
    html = '<script type="module" src="/static/dist/app.js" defer></script>'
    out, n = ihf.rewrite(html, "/static/dist/app.js", "/static/dist/app.abc123.js")
    assert n == 1
    # Exact comparison pins attribute order and content, which the previous
    # substring checks could not.
    assert out == '<script type="module" src="/static/dist/app.abc123.js" defer></script>'