seirdy.one/static/robots.txt

User-agent: *
Disallow: /noindex/
Disallow: /misc/

# I opt out of online advertising so malware that injects ads on my site won't get paid.
# You should do the same. My ads.txt file contains a standard placeholder to forbid any
# compliant ad networks from paying for ad placement on my domain.
User-Agent: Adsbot
Disallow: /
Allow: /ads.txt
Allow: /app-ads.txt

# The next three are borrowed from https://www.videolan.org/robots.txt

# > This robot collects content from the Internet for the sole purpose of helping educational institutions prevent plagiarism. [...] we compare student papers against the content we find on the Internet to see if we can find similarities. (http://www.turnitin.com/robot/crawlerinfo.html)
#  --> fuck off.
User-Agent: TurnitinBot
Disallow: /

# > NameProtect engages in crawling activity in search of a wide range of brand and other intellectual property violations that may be of interest to our clients. (http://www.nameprotect.com/botinfo.html)
#  --> fuck off.
User-Agent: NPBot
Disallow: /

# > iThenticate is a new service we have developed to combat the piracy of intellectual property and ensure the originality of written work for publishers, non-profit agencies, corporations, and newspapers. (http://www.slysearch.com/)
#  --> fuck off.
User-Agent: SlySearch
Disallow: /

# BLEXBot assists internet marketers to get information on the link structure of sites and their interlinking on the web, to avoid any technical and possible legal issues and improve overall online experience. (http://webmeup-crawler.com/)
# --> fuck off.
User-Agent: BLEXBot
Disallow: /

# Providing Intellectual Property professionals with superior brand protection services by artfully merging the latest technology with expert analysis. (https://www.checkmarknetwork.com/spider.html/)
# "The Internet is just way to big to effectively police alone." (ACTUAL quote)
# --> fuck off.
User-agent: CheckMarkNetwork/1.0 (+https://www.checkmarknetwork.com/spider.html)
Disallow: /

# Stop trademark violations and affiliate non-compliance in paid search. Automatically monitor your partner and affiliates’ online marketing to protect yourself from harmful brand violations and regulatory risks. We regularly crawl websites on behalf of our clients to ensure content compliance with brand and regulatory guidelines. (https://www.brandverity.com/why-is-brandverity-visiting-me)
# --> fuck off.
User-agent: BrandVerity/1.0
Disallow: /

# Eat shit, OpenAI.
User-agent: ChatGPT-User
Disallow: /
User-agent: GPTBot
Disallow: /

# Official way to opt-out of Google's generative AI training:
# <https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers>
User-agent: Google-Extended
Disallow: /

# There isn't any public documentation for this AFAICT.
# Reuters thinks this works so I might as well give it a shot.
User-agent: anthropic-ai
Disallow: /

User-agent: Claude-Web
Disallow: /

# I'm not blocking CCBot for now. It publishes a free index for anyone to use.
# Google used this to train the initial version of Bard (now called Gemini).
# I allow CCBot since its index is also used for upstart/hobbyist search engines
# like Alexandria and for genuinely useful academic work I personally like.
# I allow Owler for similar reasons:
# <https://openwebsearch.eu/owler/#owler-opt-out>
# <https://openwebsearch.eu/common-goals-with-common-crawl/>.
# Omgilibot/Omgili is similar to CCBot, except it sells the scrape results.
# I'm not familiar enough with Omgili to make a call here.
# In the long run, my embedded robots meta-tags and headers should cover gen-AI opt-outs.

Sitemap: https://seirdy.one/sitemap.xml
-												Fix robots.txt

											
										
										
											2020-11-30 21:06:44 +00:00
+								User-agent: *
-												Update robots.txt

											
										
										
											2021-01-23 20:47:50 +00:00
+								Disallow: /noindex/
-												Update robots.txt

											
										
										
											2021-06-11 22:09:43 +00:00
+								Disallow: /misc/
-												robots: disallow some toxic bs

											
										
										
											2022-04-23 04:45:15 +00:00
-												slightly re-org robots.txt

											
										
										
											2022-07-14 01:12:47 +00:00
+								# I opt out of online advertising so malware that injects ads on my site won't get paid.
-												More robots.txt exclusions

For shitty services that at least respect robots.txt

											
										
										
											2023-07-24 22:33:02 +00:00
+								# You should do the same. my ads.txt file contains a standard placeholder to forbid any
 								# compliant ad networks from paying for ad placement on my domain.
-												slightly re-org robots.txt

											
										
										
											2022-07-14 01:12:47 +00:00
+								User-Agent: Adsbot
 								Disallow: /
 								Allow: /ads.txt
-												More robots.txt exclusions

For shitty services that at least respect robots.txt

											
										
										
											2023-07-24 22:33:02 +00:00
+								Allow: /app-ads.txt
 								# The next three are borrowed from https://www.videolan.org/robots.txt
-												slightly re-org robots.txt

											
										
										
											2022-07-14 01:12:47 +00:00
 								# > This robot collects content from the Internet for the sole purpose of # helping educational institutions prevent plagiarism. [...] we compare student papers against the content we find on the Internet to see if we # can find similarities. (http://www.turnitin.com/robot/crawlerinfo.html)
-												Kang VLC's robots.txt commentary

											
										
										
											2022-06-13 04:52:28 +00:00
+								#  --> fuck off.
-												robots: disallow some toxic bs

											
										
										
											2022-04-23 04:45:15 +00:00
+								User-Agent: TurnitinBot
 								Disallow: /
-												slightly re-org robots.txt

											
										
										
											2022-07-14 01:12:47 +00:00
+								# > NameProtect engages in crawling activity in search of a wide range of brand and other intellectual property violations that may be of interest to our clients. (http://www.nameprotect.com/botinfo.html)
-												Kang VLC's robots.txt commentary

											
										
										
											2022-06-13 04:52:28 +00:00
+								#  --> fuck off.
-												robots: disallow some toxic bs

											
										
										
											2022-04-23 04:45:15 +00:00
+								User-Agent: NPBot
 								Disallow: /
-												slightly re-org robots.txt

											
										
										
											2022-07-14 01:12:47 +00:00
+								# iThenticate is a new service we have developed to combat the piracy of intellectual property and ensure the originality of written work for# publishers, non-profit agencies, corporations, and newspapers. (http://www.slysearch.com/)
-												Kang VLC's robots.txt commentary

											
										
										
											2022-06-13 04:52:28 +00:00
+								#  --> fuck off.
-												robots: disallow some toxic bs

											
										
										
											2022-04-23 04:45:15 +00:00
+								User-Agent: SlySearch
 								Disallow: /
-												slightly re-org robots.txt

											
										
										
											2022-07-14 01:12:47 +00:00
+								# BLEXBot assists internet marketers to get information on the link structure of sites and their interlinking on the web, to avoid any technical and possible legal issues and improve overall online experience. (http://webmeup-crawler.com/)
-												Kang VLC's robots.txt commentary

											
										
										
											2022-06-13 04:52:28 +00:00
+								# --> fuck off.
 								User-Agent: BLEXBot
-												robots: disallow some toxic bs

											
										
										
											2022-04-23 04:45:15 +00:00
+								Disallow: /
-												More robots.txt exclusions

For shitty services that at least respect robots.txt

											
										
										
											2023-07-24 22:33:02 +00:00
+								# Providing Intellectual Property professionals with superior brand protection services by artfully merging the latest technology with expert analysis. (https://www.checkmarknetwork.com/spider.html/)
 								# "The Internet is just way to big to effectively police alone." (ACTUAL quote)
 								# --> fuck off.
 								User-agent: CheckMarkNetwork/1.0 (+https://www.checkmarknetwork.com/spider.html)
 								Disallow: /
-												Add another IP-violation crawler to robots.txt

											
										
										
											2023-07-25 06:43:13 +00:00
+								# Stop trademark violations and affiliate non-compliance in paid search. Automatically monitor your partner and affiliates’ online marketing to protect yourself from harmful brand violations and regulatory risks. We regularly crawl websites on behalf of our clients to ensure content compliance with brand and regulatory guidelines. (https://www.brandverity.com/why-is-brandverity-visiting-me)
 								# --> fuck off.
 								User-agent: BrandVerity/1.0
 								Disallow: /
-												Fuck off, OpenAI

											
										
										
											2023-04-08 01:05:37 +00:00
+								# Eat shit, OpenAI.
 								User-agent: ChatGPT-User
 								Disallow: /
-												Update robots.txt with OpenAI's new bot

											
										
										
											2023-08-06 23:54:29 +00:00
+								User-agent: GPTBot
 								Disallow: /
-												Fuck off, OpenAI

											
										
										
											2023-04-08 01:05:37 +00:00
-												Update docs in robots.txt

											
										
										
											2024-03-13 05:14:49 +00:00
+								# Official way to opt-out of Google's generative AI training:
 								# <https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers>
-												opt out of gen-ai training

											
										
										
											2024-03-13 00:29:15 +00:00
+								User-agent: Google-Extended
 								Disallow: /
-												Fuck off, OpenAI

											
										
										
											2023-04-08 01:05:37 +00:00
-												Update docs in robots.txt

											
										
										
											2024-03-13 05:14:49 +00:00
+								# There isn't any public documentation for this AFAICT.
 								# Reuters thinks this works so I might as well give it a shot.
-												add some AI scrapers to robots.txt

											
										
										
											2024-03-13 03:53:58 +00:00
+								User-agent: anthropic-ai
 								Disallow: /
 								User-agent: Claude-Web
 								Disallow: /
-												Update docs in robots.txt

											
										
										
											2024-03-13 05:14:49 +00:00
+								# I'm not blocking CCBot for now. It publishes a free index for anyone to use.
 								# Googe used this to train the initial version of Bard (now called Gemini).
 								# I allow CCBot since its index is also used for upstart/hobbyist search engines
 								# like Alexandria and for genuinely useful academic work I personally like.
 								# I allow Owler for similar reasons:
 								# <https://openwebsearch.eu/owler/#owler-opt-out>
 								# <https://openwebsearch.eu/common-goals-with-common-crawl/>.
 								# Omgilibot/Omgili is similar to CCBot, except it sells the scrape results.
 								# I'm not familiar enough with Omgili to make a call here.
 								# In the long run, my embedded robots meta-tags and headers should cover gen-AI
-												add some AI scrapers to robots.txt

											
										
										
											2024-03-13 03:53:58 +00:00
-												robots: disallow some toxic bs

											
										
										
											2022-04-23 04:45:15 +00:00
+								Sitemap: https://seirdy.one/sitemap.xml