2022-06-01 22:11:40 -07:00
|
|
|
#!/bin/sh
|
|
|
|
# Copies an .html file to an equivalent .xhtml file, but replaces
# the meta charset with an XML declaration for compatibility with some
# XML tooling.
# Expects polyglot XHTML(5) markup.
# Formats both the .html and the .xhtml file.
# This means that every index.html file has an equivalent index.xhtml file;
# content negotiation allows an agent to pick html or xhtml.
|
|
|
|
|
|
|
|
# Use xmllint to do the formatting.
# xmllint ruins inline CSS, so delete the inline CSS and re-insert it.
# xmllint also adds extra whitespace around <pre><code>, which we remove
# with "sd". I chose sd since it handles newlines well.
# sd is also used to decrease the indentation by one level.
|
2022-06-01 22:11:40 -07:00
|
|
|
|
|
|
|
# -e: stop at the first failing command; -u: treat unset variables as errors.
set -e -u

# $1 is the path of the .html file to process. Abort with a usage hint when
# it is missing or empty instead of the shell's bare "unset variable" error.
html_file="${1:?usage: ${0##*/} FILE.html}"
# Scratch file that holds the formatted document; removed by the EXIT trap.
tmp_file="$html_file.tmp"
# Output path: same name as the input with ".html" swapped for ".xhtml".
xhtml_file="${html_file%.html}.xhtml"
|
2022-06-01 22:11:40 -07:00
|
|
|
|
|
|
|
# Remove the scratch file named by $tmp_file. Registered as the EXIT trap so
# it runs on every exit path. "-f" keeps a missing file from being an error;
# "--" stops option parsing so a path that starts with "-" is still treated
# as a file name.
cleanup() {
	rm -f -- "$tmp_file"
}
trap cleanup EXIT
|
|
|
|
|
2022-06-13 08:27:09 -07:00
|
|
|
# Format the document with xmllint. Line 7 of the html file (the inline
# stylesheet) is deleted first because xmllint would ruin it; it is
# re-inserted from tmp.css below. sd then strips one leading tab from every
# line, which removes one indentation level.
sed 7d "$html_file" | xmllint --format --encode UTF-8 --noent - | sd '^\t' '' >"$tmp_file"

# Plain sh has no portable "pipefail", so a failing sed or xmllint stage in
# the pipeline above goes unnoticed by "set -e". Stop on empty output rather
# than carrying on and overwriting "$html_file" with a truncated document
# further down in this script.
if [ ! -s "$tmp_file" ]; then
	printf '%s\n' "error: formatting $html_file produced no output" >&2
	exit 1
fi

# Assemble the xhtml file: the first 7 formatted lines, then the stylesheet
# from tmp.css (read from the current directory), then the remaining lines
# with these fixups applied in order:
#   1. join "<pre>" with a following <code>/<samp> tag, dropping the newline
#      and tabs xmllint put between them, and give <pre> tabindex="0";
#   2. join "</code>"/"</samp>" with the following "</pre>" likewise;
#   3. put a space between a closing </span> and the familyName span;
#   4. put a space between a picture/image and the following name span;
#   5. put a space between a lowercase letter and a <data>/<time> element;
#   6. put a space before every self-closing "/>".
# The group is redirected with ">" (not ">>") so that an xhtml file left over
# from an earlier run is replaced instead of getting a second document
# appended to it.
{
	head -n7 "$tmp_file" | sd -s '/>' ' />'
	cat tmp.css
	# shellcheck disable=SC2016 # these are regex statements, not shell expressions
	tail -n +8 "$tmp_file" \
		| sd '<pre(?: tabindex="0")?>\n\t*<(code|samp)( |>)' '<pre tabindex="0"><$1$2' \
		| sd '(?:\n)?</(code|samp)>\n(?:[\t\s]*)?</pre>' '</$1></pre>' \
		| sd '</span>.span itemprop="familyName"' '</span> <span itemprop="familyName"' \
		| sd '(</picture>|src="[^"]*" ?/>)<span itemprop="name" class="p-name fn n">' '$1 <span itemprop="name" class="p-name fn n">' \
		| sd '([a-z])<(data|time)' '$1 <$2' \
		| sd -s '/>' ' />'
} >"$xhtml_file"
|
|
|
|
|
2022-06-13 08:27:09 -07:00
|
|
|
# Write the formatted document back over the html file, skipping line 1 of
# the xhtml file (the XML declaration).
tail -n +2 <"$xhtml_file" >"$html_file"

# The xhtml file keeps its XML declaration, so the charset declaration (the
# first thing in the <head>, on line 5) is redundant there; delete it in place.
sed -i -e 5d "$xhtml_file" # busybox sed supports "-i"
|