#!/bin/sh
# copies an .html file to an equivalent .xhtml file, but replaces
# the meta charset with an XML declaration for compatibility with some
# XML tooling.
# Expects polyglot XHTML(5) markup.
# Formats both the .html and .xhtml file.
# this means that every index.html file has an equivalent index.xhtml file.
# content negotiation allows an agent to pick html or xhtml.

# use xmllint to do the formatting.
# xmllint ruins inline CSS so delete the inline CSS and re-insert it.
# xmllint also adds extra whitespace around <pre><code> which we remove
# with sed.
# It also decreases indents by one level

# Turn on pipefail only where the shell supports it. The probe runs in a
# subshell because `set` is a special built-in: in a non-interactive POSIX
# shell an unsupported option is a fatal error, so a trailing `|| true` on
# the same command would never get the chance to run.
#shellcheck disable=SC3040 # pipefail is optional; it is only enabled when the probe succeeds
if (set -o pipefail) 2>/dev/null; then
	set -o pipefail
fi
# Abort on any unhandled command failure and on use of an unset variable.
set -e -u

# Path of the HTML file to process, taken from the first positional argument.
html_file="$1"
# Scratch file that holds the xmllint-formatted markup between the two passes.
tmp_file="$(mktemp)"

# Remove the scratch file named by $tmp_file. Registered on EXIT below so it
# runs whenever the shell exits; -f keeps it quiet if the file is already gone.
cleanup() {
	rm -f -- "$tmp_file"
}
trap cleanup EXIT

# run_tidy() {
#     tidy -asxhtml -config linter-configs/tidy.conf 2>/dev/null || true
# }

# Reformat the XML document read from stdin and write it to stdout.
#   --format        re-indent the document
#   --encode UTF-8  emit the result as UTF-8
#   --noent         substitute entity references with their values
# Globals:  html_file (read) - only used to name the offending file on failure
# Outputs:  formatted document on stdout; on failure, the file name on stderr
# Exits:    with status 1 when xmllint fails
run_xmllint() {
	xmllint --format --encode UTF-8 --noent - || {
		# Report on stderr: stdout is the data stream that the caller pipes
		# onward, so a diagnostic printed there would end up in the output.
		printf '%s\n' "$html_file" >&2
		exit 1
	}
}

# Fragment that is spliced back in after the first seven formatted lines;
# presumably the inline stylesheet removed from line 7 - TODO confirm against
# whatever produces "$OUTPUT_DIR/tmp.xhtml".
# It is validated up front because the output redirection at the bottom
# truncates "$html_file" before any command in the group runs, so failing
# later would leave the page half-written.
reinserted_fragment="${OUTPUT_DIR:?}/tmp.xhtml"
if [ ! -r "$reinserted_fragment" ]; then
	printf 'cannot read %s\n' "$reinserted_fragment" >&2
	exit 1
fi

# delete the stylesheet from the html file; we'll re-insert it later.
# `tail -n +2` drops the first line of xmllint's output (normally the XML
# declaration it emits).
sed 7d "$html_file" | run_xmllint | tail -n +2 >"$tmp_file"

{
	# First seven formatted lines, with one leading tab removed from each.
	head -n7 "$tmp_file" | sed -e 's/^\t//'
	cat "$reinserted_fragment"
	# Remainder of the document (lines 8 onward), in order:
	#   - remove one leading tab per line
	#   - put a space between the name <span>s and after the u-photo element
	#   - make every <pre> keyboard-focusable
	#   - put a space before <data> and before "liked" links
	#   - strip leading whitespace in front of <code> and </pre>
	# -E is given first for clarity; it always applied to the whole script.
	# The awk stage prints lines that start with </code> or contain
	# <pre tabindex="0"> without a trailing newline, which joins them to the
	# following line; the bare `7` pattern prints every other line unchanged.
	# shellcheck disable=SC2016 # these are regex statements, not shell expressions
	sed -E \
		-e '1,7d' \
		-e 's/^\t//' \
		-e 's|</span><span itemprop="familyName"|</span>\ <span itemprop="familyName"|' \
		-e 's|class="u-photo photo"[^<]*<|class="u-photo photo"/> <|' \
		-e 's|<pre>|<pre tabindex="0">|' \
		-e 's|([a-z])<data|\1 <data|' \
		-e 's#</span>(<a[^>]*rel="(nofollow ugc|ugc nofollow)"([^>]*)?>liked</a>)#</span> \1#' \
		-e 's#^[[:space:]]*<(code|/pre)#<\1#' \
		"$tmp_file" \
		| awk '/(^<\/code>|<pre tabindex="0">)/{printf "%s",$0;next}7'
} >"$html_file"