1
0
Fork 0
mirror of https://git.sr.ht/~seirdy/seirdy.one synced 2024-11-23 21:02:09 +00:00
seirdy.one/scripts/xhtmlize-single-file.sh

49 lines
1.6 KiB
Bash
Raw Normal View History

#!/bin/sh
# copies an .html file to an equivalent .xhtml file, but replaces
# the meta charset with an XML declaration for compatibility with some
# XML tooling.
# Expects polygot XHTML(5) markup.
# Formats both the .html and .xhtml file.
# this means that every index.html file has an equivalent index.xhtml file.
# content negotiation allows an agent to pick html or xhtml.
# use xmllint to do the formatting.
# xmllint ruins inline CSS so delete the inline CSS and re-insert it.
# xmllint also adds extra whitespace around <pre><code> which we remove
# with sed.
# It also decreases indents by one level
set -e -u
html_file="$1"
tmp_file="$(mktemp)"
cleanup() {
rm -f "$tmp_file"
}
trap cleanup EXIT
2023-11-24 03:06:46 +00:00
run_tidy() {
2023-11-14 01:33:18 +00:00
tidy -asxhtml -config linter-configs/tidy.conf 2>/dev/null || true
}
# delete the stylesheet from the html file; we'll re-insert it later.
# Also remove two indentation levels
sed 7d "$html_file" | xmllint --format --encode UTF-8 --noent - | tail -n +2 >"$tmp_file"
{
2023-11-14 01:33:18 +00:00
head -n7 "$tmp_file"
cat "${OUTPUT_DIR:?}/tmp.css"
# shellcheck disable=SC2016 # these are regex statements, not shell expressions
#shellcheck source=/home/rkumar/Executables/ghq/git.sr.ht/~seirdy/seirdy.one/scripts/xhtmlize.sh
sed \
2023-11-24 03:06:46 +00:00
-e '1,7d' \
-e 's|</span>(&nbsp;)?.span itemprop="familyName|</span>&#160;<span itemprop="familyName"|' \
-e 's|class="u-photo photo"[^<]*<|class="u-photo photo"/> <|' \
-E \
-e 's|([a-z])<data|\1 <data|' \
-e 's#</span>(<a[^>]*rel="(nofollow ugc|ugc nofollow)"([^>]*)?>liked</a>)#</span> \1#' \
-e 's#^[\t\s]*<(code|/pre)#<\1#' \
"$tmp_file" \
| awk '/^<\/code>/{printf "%s",$0;next}7'
} >"$html_file"