#!/bin/sh
# Copies an .html file to an equivalent .xhtml file, but replaces
# the meta charset with an XML declaration for compatibility with some
# XML tooling.
# Expects polyglot XHTML(5) markup.
# Formats both the .html and .xhtml file.
# This means that every index.html file has an equivalent index.xhtml file.
# Content negotiation allows an agent to pick html or xhtml.
# Use xmllint to do the formatting.
# xmllint ruins inline CSS so delete the inline CSS and re-insert it.
# xmllint also adds extra whitespace around certain elements (e.g.
# <code> and <samp>), which we remove
# with "sd". I chose sd since it handles newlines well.
# sd is also used to decrease indents by one level.

# Abort on any failing command (-e) and on use of an unset variable (-u).
set -e -u

# $1: path of the .html file to convert.
# Reject a missing *or empty* argument up front with a usage hint: an
# empty path would otherwise derive the bare names ".tmp" and ".xhtml"
# below and the script would carry on with them.
html_file="${1:?usage: ${0##*/} FILE.html}"
# Scratch file that receives the formatted markup.
tmp_file="$html_file.tmp"
# Sibling output path: a trailing ".html" (if present) becomes ".xhtml".
xhtml_file="${html_file%.html}.xhtml"

# Delete the scratch file named by $tmp_file; a file that is already
# gone is not an error (rm -f). Installed as the EXIT trap so it runs
# when the shell exits.
cleanup() {
	# "--" ends option parsing, so a path that begins with "-" is removed
	# instead of being misread as rm flags.
	rm -f -- "$tmp_file"
}
trap cleanup EXIT

# Drop line 7 of the input (the inline stylesheet, which gets re-inserted
# further down), let xmllint re-format the remainder as UTF-8, then strip
# a leading tab so everything moves back one indentation level.
sed 7d "$html_file" \
	| xmllint --format --encode UTF-8 --noent - \
	| sd '^\t' '' >"$tmp_file"
# Assemble the XHTML output and append it to "$xhtml_file" in three parts:
#   1. the first 7 lines of the formatted scratch file,
#   2. the contents of tmp.css (resolved against the current directory),
#   3. the remaining lines, run through a chain of sd substitutions.
# Parts 1 and 3 both rewrite every "/>" to " />" (sd -s: the pattern is
# taken as a literal string rather than a regex).
# NOTE(review): the redirection appends (>>), so an existing "$xhtml_file"
# is extended rather than replaced -- confirm the caller removes stale
# output before running this script.
{
	head -n7 "$tmp_file" | sd -s '/>' ' />'
	# NOTE(review): presumably the stylesheet that was cut from line 7 of
	# the input; tmp.css is not created anywhere in this script -- confirm
	# where it comes from.
	cat tmp.css
	# The first sd stage removes the newline and tabs in front of an
	# opening code/samp tag; the ([a-z])<(data|time) stage puts a space
	# between a lowercase letter and a following data/time tag.
	# NOTE(review): the other patterns look damaged, as if markup inside
	# them was stripped: two quoted strings span several lines, the stage
	# mentioning familyName passes three arguments (one with an unmatched
	# ")"), and the rel="..." pattern starts with "(]*". Compare against
	# version control before relying on them.
	# shellcheck disable=SC2016 # these are regex statements, not shell expressions
	tail -n +8 "$tmp_file" \
		| sd '\n\t*<(code|samp)( |>)' '<$1$2' \
		| sd '(?:\n)?\n(?:[\t\s]*)?
' '

' \
		| sd '.span itemprop="familyName"' ' |src="[^"]*" ?/>)' '$1 ' \
		| sd '([a-z])<(data|time)' '$1 <$2' \
		| sd '(]*rel="(?:nofollow ugc|ugc nofollow)"(?:[^>]*)?>liked)' ' $1' \
		| sd -s '/>' ' />'
} >>"$xhtml_file"

# The formatted XHTML doubles as the new HTML file: copy it back over the
# input, skipping its first line (the XML declaration).
tail -n +2 "$xhtml_file" >"$html_file"

# The XHTML copy does not need the charset declaration, so delete line 5
# where it is expected to sit.
# NOTE(review): the original comment named the element it is the first
# child of, but that name was lost (presumably <head>) -- confirm.
# In-place editing relies on "-i", which busybox sed supports.
sed -i '5d' "$xhtml_file"