From b7eaf6ddd9c1dee8a228aabc9b4728091b7d9430 Mon Sep 17 00:00:00 2001
From: Seirdy <seirdy@seirdy.one>
Date: Wed, 11 Dec 2024 10:31:56 -0500
Subject: [PATCH] add nocache robots tag

---
 content/meta/scrapers-i-block.md | 7 +++++++
 layouts/partials/head.html       | 4 ++--
 2 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/content/meta/scrapers-i-block.md b/content/meta/scrapers-i-block.md
index d4e98e1..fc97e6b 100644
--- a/content/meta/scrapers-i-block.md
+++ b/content/meta/scrapers-i-block.md
@@ -61,6 +61,13 @@ I set `X-Robots` tags in every page that forbid training Generative AI algorithm
 
 {{<mention-work itemtype="BlogPosting">}}<span itemscope="" itemprop="publisher" itemtype="https://schema.org/Organization">DeviantArt</span> popularized the `NoAI` `X-Robots` tag in {{<cited-work name="UPDATE All Deviations Are Opted Out of AI Datasets" url="https://www.deviantart.com/team/journal/UPDATE-All-Deviations-Are-Opted-Out-of-AI-Datasets-934500371" extraName="headline">}}{{</mention-work>}}, wich [Cohost](https://web.archive.org/web/20241207040446/https://cohost.org/staff/post/272195-cohost-now-sets-devi) and [Misskey](https://github.com/misskey-dev/misskey/pull/10833) since implemented. The [img2dataset scraper](https://github.com/rom1504/img2dataset/pull/218) respects it.
 
+In September 2024, Bing announced support for a `nocache` robots directive and hijacked the existing `noarchive` directive.
+
+- `nocache` limits Microsoft's <abbr>LLM</abbr> training to search engine result titles and snippets, and preserves visibility in Bing Chat.
+- `noarchive` completely opts a site out of Bing Chat and Microsoft's <abbr>LLM</abbr> training.
+
+I adopted `nocache`, as I still want my site to support real archiving services.
+
 ### <span translate="no">robots.txt</span>
 
 <span translate="no">robots.txt</span> is meant to opt out of crawling, to reduce server load. It does _not_ opt you out of further processing of crawled pages. Data miners can still fetch your pages without crawling them: they can fetch archived snapshots, use data collection in users' browsers or browser extensions, download or buy datasets, etc. `X-Robots` tags are the only standard vendor-neutral format for opting out of processing of crawled pages.
diff --git a/layouts/partials/head.html b/layouts/partials/head.html
index 6bd2f85..4e5237f 100644
--- a/layouts/partials/head.html
+++ b/layouts/partials/head.html
@@ -16,9 +16,9 @@
 	<!-- Only index the canonical locations, not the envs.net mirror. -->
 	{{ if or (eq (trim site.BaseURL "/") site.Params.CanonicalBaseURL) (in site.BaseURL "wgq3bd2kqoybhstp77i3wrzbfnsyd27wt34psaja4grqiezqircorkyd.onion") -}}
 	<!-- See https://noml.info/, https://www.deviantart.com/team/journal/UPDATE-All-Deviations-Are-Opted-Out-of-AI-Datasets-934500371 -->
-	<meta name="robots" content="index,follow,max-image-preview:large,max-snippet:-1,noai,noimageai,noml" />
+	<meta name="robots" content="index,follow,max-image-preview:large,max-snippet:-1,noai,noimageai,noml,nocache" />
 	{{ else -}}
-	<meta name="robots" content="noindex,nofollow,noimageindex,noai,noimageai" />
+	<meta name="robots" content="noindex,nofollow,noimageindex,noai,noimageai,nocache" />
 	{{ end -}}
 	<link href="{{ .Site.Params.CanonicalBaseURL }}{{ $canonicalRelPermalink }}" rel="canonical" />
 	<link href="{{ .Site.Params.WebmentionEndpoint }}" rel="webmention" />